From b6c435f5147ee1e9dffdb2bd83e544993402b832 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:35:58 -0500 Subject: [PATCH 01/14] chore(deps): upgrade to Go 1.25 and golangci-lint v2.12.2 Bump the toolchain to Go 1.25 and golangci-lint v2.12.2 and align the CI workflows and Makefile with the new versions. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/lint.yml | 4 +- .github/workflows/publish.yaml | 3 + .github/workflows/test-e2e.yml | 2 +- .github/workflows/test.yml | 2 +- .gitignore | 7 +- .golangci.yml | 10 ++ Makefile | 2 +- go.mod | 117 +++++++------ go.sum | 309 +++++++++++++++------------------ 9 files changed, 222 insertions(+), 234 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ddfaa170..caa00ff3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,9 +15,9 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Run linter uses: golangci/golangci-lint-action@v8 with: - version: v2.1.5 + version: v2.12.2 diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8949c76b..5dcc90bb 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit publish-kustomize-bundles: + needs: publish-container-image permissions: id-token: write contents: read @@ -26,4 +27,6 @@ jobs: with: bundle-name: ghcr.io/datum-cloud/compute-kustomize bundle-path: config + image-name: ghcr.io/datum-cloud/compute + image-overlays: config/base/manager secrets: inherit diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 8429bf2d..9bede775 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Install the latest version of kind run: | diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index 834d33a0..462cbf3d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Running Tests run: | diff --git a/.gitignore b/.gitignore index 2b0c6e44..d5cc564d 100644 --- a/.gitignore +++ b/.gitignore @@ -14,8 +14,8 @@ # Output of the go coverage tool, specifically when used with LiteIDE *.out -# Dependency directories (remove the comment below to include it) -# vendor/ +# Dependency directories +vendor/ # Go workspace file go.work @@ -25,3 +25,6 @@ go.work.sum .env bin/ + +# Local e2e environment artefacts (Kind kubeconfigs, etc.) +tmp/ diff --git a/.golangci.yml b/.golangci.yml index a7246fbb..e0342bda 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -35,6 +35,16 @@ linters: - dupl - lll path: internal/* + # field.ErrorList{} is the idiomatic Kubernetes validation init pattern; + # preallocating requires knowing the error count in advance which is not + # possible in recursive validation helpers. + - linters: + - prealloc + path: internal/validation/ + # Append-built slices in this package are clearer without prealloc. 
+ - linters: + - prealloc + path: internal/controller/instancecontrol/ paths: - third_party$ - builtin$ diff --git a/Makefile b/Makefile index 61744a36..3d6a3e2e 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ KUSTOMIZE_VERSION ?= v5.5.0 CONTROLLER_TOOLS_VERSION ?= v0.16.4 DEFAULTER_GEN_VERSION ?= v0.32.3 ENVTEST_VERSION ?= release-0.19 -GOLANGCI_LINT_VERSION ?= v2.1.5 +GOLANGCI_LINT_VERSION ?= v2.12.2 # renovate: datasource=go depName=fybrik.io/crdoc CRDOC_VERSION ?= v0.6.4 diff --git a/go.mod b/go.mod index 19fc0103..48bab65b 100644 --- a/go.mod +++ b/go.mod @@ -1,31 +1,34 @@ module go.datum.net/compute -go 1.24.0 - -toolchain go1.24.2 +go 1.25.0 require ( + github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 - github.com/onsi/ginkgo/v2 v2.23.4 - github.com/onsi/gomega v1.37.0 + github.com/karmada-io/api v1.15.0 + github.com/onsi/ginkgo/v2 v2.27.2 + github.com/onsi/gomega v1.38.2 + github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 - go.datum.net/network-services-operator v0.1.0 - go.miloapis.com/milo v0.24.11 - golang.org/x/crypto v0.39.0 - golang.org/x/sync v0.16.0 + go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 + go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 + golang.org/x/crypto v0.45.0 + golang.org/x/sync v0.18.0 google.golang.org/protobuf v1.36.11 - k8s.io/api v0.33.1 - k8s.io/apimachinery v0.33.2 - k8s.io/client-go v0.33.1 - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 - sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/gateway-api v1.2.1 - sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 + k8s.io/api v0.35.0 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 + k8s.io/component-base v0.35.0 + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 + sigs.k8s.io/controller-runtime v0.23.3 + sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c + sigs.k8s.io/multicluster-runtime v0.23.3 ) require ( - cel.dev/expr v0.19.1 // indirect - 
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + cel.dev/expr v0.24.0 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/antlr4-go/antlr/v4 v4.13.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect @@ -35,74 +38,70 @@ require ( github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect - github.com/fxamacker/cbor/v2 v2.8.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.1 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect - github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.23.2 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/cel-go v0.26.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // 
indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.64.0 // indirect - github.com/prometheus/procfs v0.16.1 // indirect - github.com/spf13/cobra v1.9.1 // indirect - github.com/spf13/pflag v1.0.7 // indirect - github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect + github.com/spf13/cobra v1.10.0 // indirect + github.com/spf13/pflag v1.0.9 // indirect + github.com/stoewer/go-strcase v1.3.1 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect - go.opentelemetry.io/proto/otlp v1.4.0 // indirect - go.uber.org/automaxprocs v1.6.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect + go.opentelemetry.io/otel v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.37.0 // indirect + go.opentelemetry.io/otel/sdk v1.37.0 // indirect + go.opentelemetry.io/otel/trace v1.37.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // 
indirect - golang.org/x/net v0.41.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.12.0 // indirect - golang.org/x/tools v0.33.0 // indirect + golang.org/x/tools v0.38.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/grpc v1.71.1 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/grpc v1.74.2 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect - k8s.io/apiserver v0.33.1 // indirect - k8s.io/component-base v0.33.1 // indirect + k8s.io/apiextensions-apiserver v0.35.0 // indirect + k8s.io/apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - 
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index c472bd8b..42a98554 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,9 @@ -cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= -cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= -github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= +cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= +github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -17,16 +19,22 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= -github.com/evanphx/json-patch v5.7.0+incompatible/go.mod 
h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= +github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -42,17 +50,16 @@ github.com/go-openapi/swag v0.23.1 
h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZ github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= -github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/cel-go v0.26.0 h1:DPGjXackMpJWH680oGY4lZhYjIameYmR+/6RBdDGmaI= +github.com/google/cel-go v0.26.0/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz 
v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -62,18 +69,18 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+uBhcekkmy4IkffJww= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/karmada-io/api v1.15.0 
h1:6Dx+Q36LaoPqKM4gduUuhSBQ3eKjKusjkvmggLpt9xs= +github.com/karmada-io/api v1.15.0/go.mod h1:wNbBEmXYkrRLSC2VgmXizIG12FW+/sAUF7UIz5WlYAU= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -84,42 +91,43 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= -github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod 
h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4= -github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= -github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= -github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= +github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0= +github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE= 
+github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.3.1 h1:iS0MdW+kVTxgMoE1LAZyMiYJFKlOzLooE4MxjirtkAs= +github.com/stoewer/go-strcase v1.3.1/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -129,160 +137,125 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 
h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.datum.net/network-services-operator v0.1.0 h1:PAXOZ5DdJFgRoeVBPIXhqkCm6DxbP4tVOPcr3Y7h/So= -go.datum.net/network-services-operator v0.1.0/go.mod h1:uloVfxqE+8DgSiMB651X8UC9yECpXbwp/NBstofCceE= -go.miloapis.com/milo v0.1.0 h1:AYFVz1lfta/NbWSFSSKPtnkCA2rN+iegxlfQrDgEvYY= -go.miloapis.com/milo v0.1.0/go.mod h1:X+DpWOchv/Vm63mwHnboW00KRGsODY2bUTS/bBbK1+E= -go.miloapis.com/milo v0.24.11 h1:rByXDKbP4ZEN0I/z1C2RyUCyQi0NWrITLqoQILSAn2E= -go.miloapis.com/milo v0.24.11/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 h1:P3dePA6cCXKimZzE6d7Xxpj2rz54BxOHI8K8ic7VQ+c= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359/go.mod h1:Nr0PsCodkTW31vWVxR9dhAP9w0y+WHUYeyrcRnchcIE= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 h1:LSHyqLt/jus6iEMvo8pc731L+PyrTHP2bqfMMtHPSWc= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42/go.mod h1:p9O2kk194mvoL8rhqjwb+LWB+GIyY4vQqiTowwibVWo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= 
-go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= -go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= -go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= 
+go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 h1:Ahq7pZmv87yiyn3jeFz/LekZmPLLdKejuO3NcK9MssM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0/go.mod h1:MJTqhM0im3mRLw1i8uGHnCvUEeS7VwRyxlLC78PA18M= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 h1:m639+BofXTvcY1q8CGs4ItwQarYtJPOWmVobfM1HpVI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0/go.mod h1:LjReUci/F4BUyv+y4dwnq3h/26iNOeC3wAIqgvTIZVo= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 
h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 
v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod 
h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= -golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod 
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= 
-google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 h1:0UOBWO4dC+e51ui0NFKSPbkHHiQ4TmrEfEZMLDyRmY8= +google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0/go.mod h1:8ytArBbtOy2xfht+y2fqKd5DRDJRUQhqbyEnQ4bDChs= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 h1:MAKi5q709QWfnkkpNQ0M12hYJ1+e8qYVDyowc4U1XZM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.74.2 h1:WoosgB65DlWVC9FqI82dGsZhWFNBSLjQ84bjROOpMu4= +google.golang.org/grpc v1.74.2/go.mod h1:CtQ+BGjaAIXHs/5YS3i473GqwBBa1zGQNevxdeBEXrM= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 
v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= -k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= -k8s.io/apiextensions-apiserver v0.33.1 h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI= -k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA= -k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= -k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.1 h1:yLgLUPDVC6tHbNcw5uE9mo1T6ELhJj7B0geifra3Qdo= -k8s.io/apiserver v0.33.1/go.mod h1:VMbE4ArWYLO01omz+k8hFjAdYfc3GVAYPrhP2tTKccs= -k8s.io/client-go v0.33.1 h1:ZZV/Ks2g92cyxWkRRnfUDsnhNn28eFpt26aGc8KbXF4= -k8s.io/client-go v0.33.1/go.mod h1:JAsUrl1ArO7uRVFWfcj6kOomSlCv+JpvIsp6usAGefA= -k8s.io/component-base v0.33.1 h1:EoJ0xA+wr77T+G8p6T3l4efT2oNwbqBVKR71E0tBIaI= -k8s.io/component-base v0.33.1/go.mod h1:guT/w/6piyPfTgq7gfvgetyXMIh10zuXA6cRRm3rDuY= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= +k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/apiserver v0.35.0 h1:CUGo5o+7hW9GcAEF3x3usT3fX4f9r8xmgQeCBDaOgX4= +k8s.io/apiserver v0.35.0/go.mod h1:QUy1U4+PrzbJaM3XGu2tQ7U9A4udRRo5cyxkFX0GEds= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= 
+k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= +k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= -sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= -sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM= -sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 
h1:Pq69tTKfN8ADw8m8A3wUtP8wJ9SPQbbOsgapm3BZEPw= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8/go.mod h1:CpBzLMLQKdm+UCchd2FiGPiDdCxM5dgCCPKuaQ6Fsv0= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= +sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c h1:GS4VnGRV90GEUjrgQ2GT5ii6yzWj3KtgUg+sVMdhs5c= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/multicluster-runtime v0.23.3 h1:vrzlXRzHTDsjspUAfoW2rCtr0agoI4q20p9x4Fz4png= +sigs.k8s.io/multicluster-runtime v0.23.3/go.mod h1:r/UA4GHgFoXCcR4tcvlZz7SiLx3l1kJKDuBAhILNIHs= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml 
v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= From 74914897db6ea142e512bbb75aa9fb78af3ac230 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:36:22 -0500 Subject: [PATCH 02/14] refactor(controller): remove the central WorkloadDeployment scheduler Delete the central scheduler that placed WorkloadDeployments from a single control plane, and drop its registration from main. Placement now happens through the distributed federator and per-cell controllers introduced in the following commits. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/main.go | 4 - .../workloaddeployment_scheduler.go | 153 ------------------ 2 files changed, 157 deletions(-) delete mode 100644 internal/controller/workloaddeployment_scheduler.go diff --git a/cmd/main.go b/cmd/main.go index 3bb44bc9..7d6a682d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -184,10 +184,6 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") os.Exit(1) } - if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler") - os.Exit(1) - } if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Instance") os.Exit(1) diff --git a/internal/controller/workloaddeployment_scheduler.go b/internal/controller/workloaddeployment_scheduler.go deleted file mode 100644 index 041b0d64..00000000 --- a/internal/controller/workloaddeployment_scheduler.go +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - 
"sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" -) - -// WorkloadDeploymentScheduler schedules a WorkloadDeployment -type WorkloadDeploymentScheduler struct { - mgr mcmanager.Manager -} - -func (r *WorkloadDeploymentScheduler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - - ctx = mccontext.WithCluster(ctx, req.ClusterName) - var deployment computev1alpha.WorkloadDeployment - if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, err - } - - if !deployment.DeletionTimestamp.IsZero() { - return ctrl.Result{}, nil - } - - logger.Info("scheduling deployment") - defer logger.Info("scheduling complete") - - // TODO(jreese) improve! - // The first iteration of this scheduler will be very simple and only look for - // the first available location that is viable for the deployment. In the - // future, we could see a more advanced system similar to the Kubernetes - // scheduler itself. - - // Step 1: Get Locations - var locations networkingv1alpha.LocationList - if err := cl.GetClient().List(ctx, &locations); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list locations: %w", err) - } - - if len(locations.Items) == 0 { - // Should only be the case in new environments if workloads are created - // prior to location registration. 
- - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are registered with the system.", - }) - if changed { - // TODO(jreese) investigate kubevirt / other operators for better tracking - // of updates to the status. I seem to remember a "builder" of sorts that - // looked rather nice. - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil - } - - // TODO(jreese) define standard Topology keys somewhere - - var selectedLocation *networkingv1alpha.Location - for _, location := range locations.Items { - cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] - if ok && cityCode == deployment.Spec.CityCode { - selectedLocation = &location - break - } - } - - if selectedLocation == nil { - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoCandidateLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are candidates for this deployment.", - }) - if changed { - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - } else { - deployment.Status.Location = &networkingv1alpha.LocationReference{ - Name: selectedLocation.Name, - Namespace: selectedLocation.Namespace, - } - - // TODO(jreese) make sure we don't run into update conflicts with the update - // of the spec then status here. Just can't remember if it's an issue. 
- - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "LocationAssigned", - ObservedGeneration: deployment.Generation, - Message: "Deployment has been assigned a location.", - }) - - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. -func (r *WorkloadDeploymentScheduler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - return mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithPredicates( - predicate.NewPredicateFuncs(func(object client.Object) bool { - // Don't process deployments that have been scheduled - o := object.(*computev1alpha.WorkloadDeployment) - return o.Status.Location == nil - }), - )). - Named("workload-deployment-scheduler"). - Complete(r) -} From 0eb948f6a5ca0730259915e4a95e220ac1e353ff Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:36:38 -0500 Subject: [PATCH 03/14] feat(controller): add the WorkloadDeployment federator Introduce the federator that fans a WorkloadDeployment out to the cells selected for its placement, replacing the central scheduler. Add the city-code field indexer it uses to map subnet/location events back to the deployments that depend on them. Beyond fanning the spec out, the federator watches the downstream Karmada WorkloadDeployment (milosource cluster source with cluster-name-preserving enqueue) so aggregated status mirrors back to the project WorkloadDeployment immediately instead of waiting on an informer resync. Downstream events map back to the bare project cluster name the multicluster provider keys on, dropping events for clusters that are not engaged yet. 
The "cluster-<name>" label encoding (project path with "/" -> "_") is centralized in EncodeClusterName/DecodeClusterName so the wire format lives in one place; the federator wraps the shared decoder and trims to the last path segment to recover the provider cluster key. Co-Authored-By: Claude Opus 4.8 (1M context) Co-Authored-By: Claude Sonnet 4.6 --- internal/controller/clustername.go | 21 + internal/controller/clustername_test.go | 28 + internal/controller/indexers.go | 31 +- internal/controller/indexers_test.go | 30 + .../workloaddeployment_federator.go | 559 +++++++++++++++++ .../workloaddeployment_federator_test.go | 576 ++++++++++++++++++ 6 files changed, 1230 insertions(+), 15 deletions(-) create mode 100644 internal/controller/clustername.go create mode 100644 internal/controller/clustername_test.go create mode 100644 internal/controller/indexers_test.go create mode 100644 internal/controller/workloaddeployment_federator.go create mode 100644 internal/controller/workloaddeployment_federator_test.go diff --git a/internal/controller/clustername.go b/internal/controller/clustername.go new file mode 100644 index 00000000..e726cf81 --- /dev/null +++ b/internal/controller/clustername.go @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import "strings" + +// The cross-plane cluster/project identity travels as a single Kubernetes label +// value: NSO's MappedNamespaceResourceStrategy encodes the name as "cluster-<name>" +// with "/" replaced by "_", so a full project path ("org/project") survives as a +// legal label value ("cluster-org_project"). + +// EncodeClusterName renders a project/cluster name into the label wire form +// "cluster-<name>" with "/" replaced by "_". +func EncodeClusterName(name string) string { + return "cluster-" + strings.ReplaceAll(name, "/", "_") +} + +// DecodeClusterName reverses EncodeClusterName, returning the full path.
+func DecodeClusterName(encoded string) string { + return strings.ReplaceAll(strings.TrimPrefix(encoded, "cluster-"), "_", "/") +} diff --git a/internal/controller/clustername_test.go b/internal/controller/clustername_test.go new file mode 100644 index 00000000..269e9fc7 --- /dev/null +++ b/internal/controller/clustername_test.go @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "testing" +) + +func TestEncodeDecodeClusterName_RoundTrip(t *testing.T) { + t.Parallel() + cases := []struct { + name string + input string + }{ + {name: "simple name", input: "datum-cloud"}, + {name: "org/project path", input: "org/project"}, + {name: "three-segment path", input: "a/b/c"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := DecodeClusterName(EncodeClusterName(tc.input)) + if got != tc.input { + t.Errorf("round-trip(%q): got %q, want %q", tc.input, got, tc.input) + } + }) + } +} diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index fb0ebe88..311337e0 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -15,7 +15,10 @@ import ( const ( deploymentWorkloadUIDIndex = "deploymentWorkloadUIDIndex" workloadNetworksIndex = "workloadNetworksIndex" - deploymentLocationIndex = "deploymentLocationIndex" + // deploymentCityCodeIndex indexes WorkloadDeployments by their Spec.CityCode + // so that SubnetClaim/Subnet watches can efficiently find the deployments + // that target the same city as a changed networking resource. 
+ deploymentCityCodeIndex = "deploymentCityCodeIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { @@ -30,32 +33,30 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) } - // Index workload deployments by location - if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { - return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } return nil } func deploymentWorkloadUIDIndexFunc(o client.Object) []string { - return []string{ - string(o.(*computev1alpha.WorkloadDeployment).Spec.WorkloadRef.UID), + // Skip deployments without a workload UID: indexing them under the empty + // key would make them matchable by a GC query built from a corrupt (empty) + // UID, mirroring deploymentCityCodeIndexFunc. 
+ uid := string(o.(*computev1alpha.WorkloadDeployment).Spec.WorkloadRef.UID) + if uid == "" { + return nil } + return []string{uid} } -func deploymentLocationIndexFunc(o client.Object) []string { +func deploymentCityCodeIndexFunc(o client.Object) []string { deployment := o.(*computev1alpha.WorkloadDeployment) - if deployment.Status.Location == nil { + if deployment.Spec.CityCode == "" { return nil } - - return []string{ - types.NamespacedName{ - Namespace: deployment.Status.Location.Namespace, - Name: deployment.Status.Location.Name, - }.String(), - } + return []string{deployment.Spec.CityCode} } func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { diff --git a/internal/controller/indexers_test.go b/internal/controller/indexers_test.go new file mode 100644 index 00000000..a5afd919 --- /dev/null +++ b/internal/controller/indexers_test.go @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/types" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// TestDeploymentWorkloadUIDIndexFunc verifies that deployments without a +// workload UID are excluded from the index: indexing them under the empty key +// would make them matchable by a GC query built from a corrupt (empty) UID. 
+func TestDeploymentWorkloadUIDIndexFunc(t *testing.T) { + t.Parallel() + + withUID := &computev1alpha.WorkloadDeployment{ + Spec: computev1alpha.WorkloadDeploymentSpec{ + WorkloadRef: computev1alpha.WorkloadReference{UID: types.UID("wl-uid-1")}, + }, + } + assert.Equal(t, []string{"wl-uid-1"}, deploymentWorkloadUIDIndexFunc(withUID)) + + withoutUID := &computev1alpha.WorkloadDeployment{} + assert.Nil(t, deploymentWorkloadUIDIndexFunc(withoutUID), + "a deployment without a workload UID must not be indexed under the empty key") +} diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go new file mode 100644 index 00000000..332978d7 --- /dev/null +++ b/internal/controller/workloaddeployment_federator.go @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mchandler "sigs.k8s.io/multicluster-runtime/pkg/handler" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" + milosource 
"go.miloapis.com/milo/pkg/multicluster-runtime/source" +) + +const ( + // federatorFinalizer is added to project-namespace WorkloadDeployments that + // have been federated to the downstream control plane. It ensures we clean up + // the downstream object and any orphaned PropagationPolicies before the project + // object is permanently deleted. + federatorFinalizer = "compute.datumapis.com/federator" + + // cityCodeLabel is applied to WorkloadDeployments in the downstream namespace + // and is used by PropagationPolicy selectors to route them to the correct + // POP-cell clusters. Downstream Cluster objects are expected to carry this + // label with their city-code value. + cityCodeLabel = "topology.datum.net/city-code" + + kindWorkloadDeployment = "WorkloadDeployment" +) + +// WorkloadDeploymentFederator replicates WorkloadDeployments from project +// namespaces into the downstream control plane so it can propagate them to the +// appropriate POP-cell clusters. +// +// For each WorkloadDeployment the controller: +// 1. Determines the downstream namespace via the ns- +// convention (matching the MappedNamespaceResourceStrategy used by +// go.datum.net/network-services-operator). +// 2. Upserts a corresponding WorkloadDeployment in that downstream namespace, +// stamped with label topology.datum.net/city-code=. +// 3. Lazily creates a PropagationPolicy per city code per downstream namespace +// that selects WorkloadDeployments by the city-code label and targets +// clusters carrying the same label. The PP is deleted once no deployments +// with that city code remain in the namespace. +// 4. Reads the aggregated status from the downstream control plane and writes +// it back to the project-namespace object. +// 5. On deletion: removes the downstream WorkloadDeployment and cleans up +// unused PropagationPolicies. 
// WorkloadDeploymentFederator replicates WorkloadDeployments from project
// namespaces into the downstream control plane so it can propagate them to the
// appropriate POP-cell clusters.
//
// For each WorkloadDeployment the controller:
//  1. Determines the downstream namespace via the ns-<namespace UID>
//     convention (matching the MappedNamespaceResourceStrategy used by
//     go.datum.net/network-services-operator).
//  2. Upserts a corresponding WorkloadDeployment in that downstream namespace,
//     stamped with label topology.datum.net/city-code=<cityCode>.
//  3. Lazily creates a PropagationPolicy per city code per downstream namespace
//     that selects WorkloadDeployments by the city-code label and targets
//     clusters carrying the same label. The PP is deleted once no deployments
//     with that city code remain in the namespace.
//  4. Reads the aggregated status from the downstream control plane and writes
//     it back to the project-namespace object.
//  5. On deletion: removes the downstream WorkloadDeployment and cleans up
//     unused PropagationPolicies.
type WorkloadDeploymentFederator struct {
	// mgr is the multicluster manager used to resolve project clusters by name.
	// It is assigned in SetupWithManager.
	mgr mcmanager.Manager
	// FederationClient is a client pointed at the Karmada federation control
	// plane (the federation hub that the management controllers read and write
	// through). The caller (cmd/main.go) constructs it from --federation-kubeconfig.
	FederationClient client.Client
	// FederationCluster is a watchable cluster handle for the same Karmada
	// federation control plane that FederationClient talks to. It is used to set
	// up an informer-backed watch on the downstream WorkloadDeployment objects so
	// that status aggregated by Karmada onto the downstream WD is mirrored back to
	// the project-namespace WD immediately, rather than waiting for the next
	// informer resync. When nil (e.g. in unit tests), the downstream watch is
	// skipped and the controller falls back to watching only the VCP WD.
	FederationCluster cluster.Cluster
	// finalizers holds the registered federatorFinalizer handler; Reconcile
	// delegates finalizer add/remove decisions to it.
	finalizers finalizer.Finalizers
}
// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch
// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list

// Reconcile federates a single project-namespace WorkloadDeployment to the
// downstream control plane.
//
// It is a no-op when FederationClient is nil or the request carries an empty
// cluster name. Otherwise it runs the registered finalizers (persisting any
// finalizer change and returning early), stops if the object is being deleted,
// and then, in order: ensures the downstream namespace, upserts the downstream
// WorkloadDeployment, ensures the city-code PropagationPolicy, and mirrors the
// downstream status back onto the project object.
func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) {
	if r.FederationClient == nil {
		return ctrl.Result{}, nil
	}

	logger := log.FromContext(ctx)

	// An empty cluster name resolves to the local host management cluster, which
	// has no compute CRDs — any Get would fail with "no matches for kind" and
	// requeue in a hot loop. The For watch (EngageWithLocalCluster=false) and the
	// preservation-wrapped downstream watch both set a real project cluster name,
	// so an empty name here is never legitimate. Drop it without erroring.
	if req.ClusterName == "" {
		logger.V(1).Info("dropping reconcile with empty cluster name")
		return ctrl.Result{}, nil
	}

	cl, err := r.mgr.GetCluster(ctx, req.ClusterName)
	if err != nil {
		return ctrl.Result{}, err
	}
	// Finalize reads the cluster name back out of the context.
	ctx = mccontext.WithCluster(ctx, req.ClusterName)

	var deployment computev1alpha.WorkloadDeployment
	if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil {
		if apierrors.IsNotFound(err) {
			// The object is gone; there is nothing left to federate.
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, err
	}

	finalizationResult, err := r.finalizers.Finalize(ctx, &deployment)
	if err != nil {
		return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err)
	}
	if finalizationResult.Updated {
		// The finalizer set changed; persist it and stop. Federation is left to
		// a later reconcile of the updated object.
		if err = cl.GetClient().Update(ctx, &deployment); err != nil {
			return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err)
		}
		return ctrl.Result{}, nil
	}

	// A deployment that is being deleted must not be (re-)federated.
	if !deployment.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, nil
	}

	logger.Info("federating deployment to downstream control plane")

	// Determine the downstream namespace for this project namespace using the
	// ns-<namespace UID> convention (MappedNamespaceResourceStrategy).
	strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(req.ClusterName), cl.GetClient(), r.FederationClient)
	downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace)
	if err != nil {
		return ctrl.Result{}, fmt.Errorf("failed to determine downstream namespace: %w", err)
	}

	if err := r.ensureDownstreamNamespace(ctx, downstreamNS, deployment.Namespace, string(req.ClusterName)); err != nil {
		return ctrl.Result{}, err
	}

	// Upsert the WorkloadDeployment in the downstream control plane via the
	// strategy client so any future Create calls also go through
	// ensureDownstreamNamespace automatically.
	if err := r.upsertDownstreamDeployment(ctx, strategy.GetClient(), &deployment, downstreamNS); err != nil {
		return ctrl.Result{}, err
	}

	if err := r.ensurePropagationPolicy(ctx, downstreamNS, deployment.Spec.CityCode); err != nil {
		return ctrl.Result{}, err
	}

	if err := r.syncStatusFromDownstream(ctx, cl.GetClient(), &deployment, downstreamNS); err != nil {
		return ctrl.Result{}, err
	}

	logger.Info("federation complete")
	return ctrl.Result{}, nil
}
+ } + logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return finalizer.Result{}, err + } + + return finalizer.Result{}, nil +} + +// ensureDownstreamNamespace creates or updates the downstream namespace, stamping +// it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. +// This allows the InstanceProjector to resolve the project namespace name via a +// direct label lookup rather than scanning all namespaces by UID. +func (r *WorkloadDeploymentFederator) ensureDownstreamNamespace(ctx context.Context, name, upstreamNamespace, clusterName string) error { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + _, err := controllerutil.CreateOrUpdate(ctx, r.FederationClient, ns, func() error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] = EncodeClusterName(clusterName) + ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = upstreamNamespace + return nil + }) + if err != nil { + return fmt.Errorf("failed to ensure downstream namespace %q: %w", name, err) + } + return nil +} + +// upsertDownstreamDeployment creates or updates the WorkloadDeployment in the +// downstream namespace via the provided client (expected to be strategy.GetClient() +// so the downstream namespace is created with upstream tracking labels). 
+func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment( + ctx context.Context, + downstreamClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, downstreamClient, kd, func() error { + if kd.Labels == nil { + kd.Labels = make(map[string]string) + } + kd.Labels[cityCodeLabel] = deployment.Spec.CityCode + kd.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace + kd.Spec = deployment.Spec + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + + log.FromContext(ctx).Info("upserted downstream deployment", "result", result, "downstreamNamespace", downstreamNS) + return nil +} + +// ensurePropagationPolicy creates or updates a PropagationPolicy in the downstream +// namespace that selects all WorkloadDeployments with the given city-code label +// and targets clusters carrying the same label. +func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, r.FederationClient, pp, func() error { + pp.Spec = karmadapolicyv1alpha1.PropagationSpec{ + // Select all WorkloadDeployments in this namespace that carry the + // city-code label. Using a label selector (rather than individual + // resource names) means that new deployments for this city are + // automatically picked up without updating the policy. 
+ ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{ + { + APIVersion: computev1alpha.GroupVersion.String(), + Kind: kindWorkloadDeployment, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + Placement: karmadapolicyv1alpha1.Placement{ + // Route to clusters that carry the same city-code label. POP-cell + // clusters registered with the downstream control plane must be + // labeled accordingly. + ClusterAffinity: &karmadapolicyv1alpha1.ClusterAffinity{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("upserted PropagationPolicy", "result", result, "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment +// from the downstream namespace and writes it back to the project-namespace +// object. It is a no-op when the downstream object does not yet exist. 
+func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( + ctx context.Context, + projectClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + var kd computev1alpha.WorkloadDeployment + if err := r.FederationClient.Get(ctx, types.NamespacedName{ + Name: deployment.Name, + Namespace: downstreamNS, + }, &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get downstream deployment for status sync: %w", err) + } + + if equality.Semantic.DeepEqual(deployment.Status, kd.Status) { + return nil + } + + deployment.Status = kd.Status + if err := projectClient.Status().Update(ctx, deployment); err != nil { + return fmt.Errorf("failed to write downstream status back to project deployment: %w", err) + } + return nil +} + +// cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given +// city code if no WorkloadDeployments with that city code remain in the +// downstream namespace. +func (r *WorkloadDeploymentFederator) cleanupPropagationPolicyIfUnused( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + // The webhook requires cityCode, so an empty value here is corruption. An + // empty-valued label selector would match the wrong deployment set and + // mis-decide whether the PropagationPolicy is still in use. + if cityCode == "" { + return fmt.Errorf("cannot evaluate PropagationPolicy usage in namespace %q: city code is empty", downstreamNS) + } + + var remaining computev1alpha.WorkloadDeploymentList + if err := r.FederationClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return fmt.Errorf("failed to list remaining downstream deployments for city %q: %w", cityCode, err) + } + + if len(remaining.Items) > 0 { + // Other deployments still need this PropagationPolicy. 
+ return nil + } + + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + if err := r.FederationClient.Delete(ctx, pp); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed to delete PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("deleted PropagationPolicy (no more deployments for city)", "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// SetupWithManager registers the controller with the multicluster manager. +// It must only be called when FederationClient is non-nil. +// +// The controller watches two control planes: +// +// - The VCP/project WorkloadDeployment (via For), so spec changes in the +// project namespace trigger federation to the downstream control plane. +// - The downstream Karmada WorkloadDeployment (via WatchesRawSource against +// FederationCluster), so when Karmada aggregates new status onto the +// downstream WD the corresponding project WD is reconciled immediately and +// the status is mirrored back. Without this second watch the federator only +// caught up on the next informer resync (~10h), causing status lag. +func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(federatorFinalizer, r); err != nil { + return fmt.Errorf("failed to register federator finalizer: %w", err) + } + + b := mcbuilder.ControllerManagedBy(mgr). + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + Named("workload-deployment-federator") + + // Watch the downstream Karmada WorkloadDeployment whose status we mirror. + // FederationCluster is a watchable handle for the federation control plane; + // it is nil in unit tests, where only the For watch is exercised. 
// SetupWithManager registers the controller with the multicluster manager.
// It must only be called when FederationClient is non-nil.
//
// It stores mgr on the receiver, creates the finalizer registry and registers
// the receiver as the handler for federatorFinalizer, then builds the
// controller. It returns an error if finalizer registration or the controller
// build fails.
//
// The controller watches two control planes:
//
//   - The VCP/project WorkloadDeployment (via For), so spec changes in the
//     project namespace trigger federation to the downstream control plane.
//   - The downstream Karmada WorkloadDeployment (via WatchesRawSource against
//     FederationCluster), so when Karmada aggregates new status onto the
//     downstream WD the corresponding project WD is reconciled immediately and
//     the status is mirrored back. Without this second watch the federator only
//     caught up on the next informer resync (~10h), causing status lag.
func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error {
	r.mgr = mgr
	r.finalizers = finalizer.NewFinalizers()
	if err := r.finalizers.Register(federatorFinalizer, r); err != nil {
		return fmt.Errorf("failed to register federator finalizer: %w", err)
	}

	// EngageWithLocalCluster(false): the For watch is not engaged on the local
	// host cluster.
	b := mcbuilder.ControllerManagedBy(mgr).
		For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)).
		Named("workload-deployment-federator")

	// Watch the downstream Karmada WorkloadDeployment whose status we mirror.
	// FederationCluster is a watchable handle for the federation control plane;
	// it is nil in unit tests, where only the For watch is exercised.
	//
	// The handler MUST preserve the ClusterName that mapDownstreamDeploymentToRequest
	// sets. milosource binds the raw source to the empty cluster name, and the
	// default TypedEnqueueRequestsFromMapFunc wraps the map in TypedInjectCluster,
	// which overwrites each request's ClusterName with that bound empty name — so
	// every request would resolve to the local host cluster (no compute CRDs) and
	// fail with "no matches for kind WorkloadDeployment". The preservation variant
	// skips that injection so our project-cluster ClusterName survives to Reconcile.
	if r.FederationCluster != nil {
		// Both arguments are ignored: the handler does not depend on the cluster
		// the source is bound to.
		preserveClusterName := func(_ multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[*computev1alpha.WorkloadDeployment, mcreconcile.Request] {
			return mchandler.TypedEnqueueRequestsFromMapFuncWithClusterPreservation(r.mapDownstreamDeploymentToRequest)
		}
		b = b.WatchesRawSource(milosource.MustNewClusterSource(
			r.FederationCluster,
			&computev1alpha.WorkloadDeployment{},
			preserveClusterName,
		))
	}

	return b.Complete(r)
}
+// +// Both correlation labels are stamped unconditionally by this controller +// (upsertDownstreamDeployment / ensureDownstreamNamespace), so a downstream WD +// or namespace lacking one is corruption, not a foreign object. Map functions +// cannot return errors and there is no polling backstop — a dropped event +// means permanently stale status on the project WD — so those drops are logged +// at error level to make the corruption visible. +func (r *WorkloadDeploymentFederator) mapDownstreamDeploymentToRequest( + ctx context.Context, + downstream *computev1alpha.WorkloadDeployment, +) []mcreconcile.Request { + logger := log.FromContext(ctx) + + projectNamespace := downstream.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if projectNamespace == "" { + logger.Error(nil, "downstream WorkloadDeployment is missing the upstream-namespace label; dropping status event", + "downstreamNamespace", downstream.Namespace, "name", downstream.Name, + "label", downstreamclient.UpstreamOwnerNamespaceLabel) + return nil + } + + var ns corev1.Namespace + if err := r.FederationCluster.GetClient().Get(ctx, types.NamespacedName{Name: downstream.Namespace}, &ns); err != nil { + logger.V(1).Info("unable to resolve downstream namespace for status mapping; dropping event", + "downstreamNamespace", downstream.Namespace, "error", err) + return nil + } + encodedClusterName := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encodedClusterName == "" { + logger.Error(nil, "downstream namespace is missing the upstream-cluster-name label; dropping status event", + "downstreamNamespace", downstream.Namespace, "name", downstream.Name, + "label", downstreamclient.UpstreamOwnerClusterNameLabel) + return nil + } + clusterName := projectClusterNameFromLabel(encodedClusterName) + if clusterName == "" { + logger.Error(nil, "undecodable upstream-cluster-name label on downstream namespace; dropping status event", + "downstreamNamespace", downstream.Namespace, "name", 
downstream.Name, + "label", downstreamclient.UpstreamOwnerClusterNameLabel, "encoded", encodedClusterName) + return nil + } + + // Verify the project cluster is engaged before enqueuing. The Milo + // multicluster provider keys clusters by bare project name, and GetCluster + // returns an error for an unknown name. Without this guard, an unresolvable + // name — or the empty string, which mcmanager routes to the local host + // cluster that has no compute CRDs — would make Reconcile fail with + // "no matches for kind WorkloadDeployment" in a hot loop. Dropping the event + // is safe: once the provider engages the project cluster, the For watch + // reconciles it and the next downstream status event maps cleanly. + if _, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)); err != nil { + logger.V(1).Info("project cluster not engaged for downstream status mapping; dropping event", + "clusterName", clusterName, "downstreamNamespace", downstream.Namespace, "error", err) + return nil + } + + return []mcreconcile.Request{ + { + ClusterName: multicluster.ClusterName(clusterName), + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: projectNamespace, + Name: downstream.Name, + }, + }, + }, + } +} + +// projectClusterNameFromLabel extracts the project cluster name that the Milo +// multicluster provider uses as its cluster key from a downstream namespace's +// UpstreamOwnerClusterNameLabel value. +// +// MappedNamespaceResourceStrategy encodes the label as "cluster-_" +// (with "/" replaced by "_"), e.g. "cluster-datum-cloud" (no org) or +// "cluster-_test-project-abc" (empty org). The provider, however, keys clusters +// by bare project name only (multicluster provider: key = project.Name), so we +// strip the "cluster-" prefix, decode "_" back to "/", and return the final path +// segment — the project name. 
// propagationPolicyNameFor returns the PropagationPolicy name for a given city
// code. The name is stable and deterministic so that multiple reconciles of
// different deployments sharing the same city code converge on the same policy.
//
// The city code is lower-cased and every character outside [a-z0-9.-] (spaces,
// underscores, slashes, non-ASCII runes, ...) is replaced by "-". A city code
// that is a legal label value, such as "us_west", therefore no longer yields a
// name with characters the API server rejects. Inputs made only of letters,
// digits, spaces, "." and "-" produce the same name as before.
func propagationPolicyNameFor(cityCode string) string {
	sanitized := strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-', r == '.':
			return r
		default:
			return '-'
		}
	}, strings.ToLower(cityCode))
	return "city-" + sanitized
}
testKarmadaNSStr = "ns-aabbccdd-0000-1111-2222-333344445555" + testWDName = "my-workload-deployment" + testCityCodeLAX = "LAX" +) + +// ─── Test helpers ───────────────────────────────────────────────────────────── + +// testProjectNamespace returns a corev1.Namespace for the project cluster with a +// stable UID that matches testKarmadaNSStr. +func testProjectNamespace() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testProjNS, + UID: testProjNSUID, + }, + } +} + +// testWorkloadDeployment returns a WorkloadDeployment with the given options. +func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *computev1alpha.WorkloadDeployment { + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testProjNS, + UID: "wd-uid-1111", + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + WorkloadRef: computev1alpha.WorkloadReference{ + Name: "test-workload", + }, + PlacementName: testDefaultPlacement, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + }, + }, + } + for _, opt := range opts { + opt(wd) + } + return wd +} + +// withFinalizer adds the federator finalizer to the WorkloadDeployment. +func withFinalizer(wd *computev1alpha.WorkloadDeployment) { + wd.Finalizers = append(wd.Finalizers, federatorFinalizer) +} + +// withDeletionTimestamp sets a non-zero DeletionTimestamp on the WorkloadDeployment. +func withDeletionTimestamp(wd *computev1alpha.WorkloadDeployment) { + t := metav1.NewTime(time.Now().Add(-5 * time.Second)) + wd.DeletionTimestamp = &t +} + +// newTestFederator constructs a WorkloadDeploymentFederator wired to the given +// project client (via a fakeMCManager) and downstream client. The federator +// finalizer is pre-registered so reconcile can handle deletions. 
+func newTestFederator(projectClient client.Client, karmadaClient client.Client) *WorkloadDeploymentFederator { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(testCluster, projectCluster) + + r := &WorkloadDeploymentFederator{ + mgr: mgr, + FederationClient: karmadaClient, + } + + feds := finalizer.NewFinalizers() + if err := feds.Register(federatorFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// reconcileRequest builds an mcreconcile.Request for the test WorkloadDeployment. +func reconcileRequest() mcreconcile.Request { + return mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: testWDName, + Namespace: testProjNS, + }, + }, + } +} + +// ─── Unit tests ─────────────────────────────────────────────────────────────── + +// TestMapDownstreamDeploymentToRequest verifies the downstream-WD → project-WD +// mapping used by the cross-plane status watch: the request name equals the +// downstream WD name, the namespace comes from the WD's upstream-namespace label, +// and the cluster name is decoded from the downstream namespace's +// upstream-cluster-name label. Events lacking correlation metadata are dropped. +func TestMapDownstreamDeploymentToRequest(t *testing.T) { + t.Parallel() + + // The encoded cluster name on the downstream namespace decodes to testCluster. + encodedCluster := EncodeClusterName(testCluster) + + downstreamNS := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testKarmadaNSStr, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster, + }, + }, + } + + // A downstream namespace whose cluster label decodes to a project cluster the + // manager has not engaged — used to verify the not-engaged drop path. 
// TestMapDownstreamDeploymentToRequest verifies the downstream-WD → project-WD
// mapping used by the cross-plane status watch: the request name equals the
// downstream WD name, the namespace comes from the WD's upstream-namespace label,
// and the cluster name is decoded from the downstream namespace's
// upstream-cluster-name label. Events lacking correlation metadata are dropped.
func TestMapDownstreamDeploymentToRequest(t *testing.T) {
	t.Parallel()

	// The encoded cluster name on the downstream namespace decodes to testCluster.
	encodedCluster := EncodeClusterName(testCluster)

	// Well-formed downstream namespace carrying the cluster-name label.
	downstreamNS := &corev1.Namespace{
		ObjectMeta: metav1.ObjectMeta{
			Name: testKarmadaNSStr,
			Labels: map[string]string{
				downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster,
			},
		},
	}

	// A downstream namespace whose cluster label decodes to a project cluster the
	// manager has not engaged — used to verify the not-engaged drop path.
	unknownClusterNS := &corev1.Namespace{
		ObjectMeta: metav1.ObjectMeta{
			Name: testKarmadaNSStr,
			Labels: map[string]string{
				downstreamclient.UpstreamOwnerClusterNameLabel: "cluster-unregistered-project",
			},
		},
	}

	// newDownstreamWD builds the downstream WD event object with the given labels.
	newDownstreamWD := func(labels map[string]string) *computev1alpha.WorkloadDeployment {
		return &computev1alpha.WorkloadDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      testWDName,
				Namespace: testKarmadaNSStr,
				Labels:    labels,
			},
		}
	}

	tests := []struct {
		name string
		// karmadaObjs seeds the fake federation client for the case.
		karmadaObjs  []client.Object
		downstreamWD *computev1alpha.WorkloadDeployment
		// want is the expected mapping result; nil means the event is dropped.
		want []mcreconcile.Request
	}{
		{
			name:        "maps to project WD request",
			karmadaObjs: []client.Object{downstreamNS},
			downstreamWD: newDownstreamWD(map[string]string{
				downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS,
			}),
			want: []mcreconcile.Request{
				{
					ClusterName: testCluster,
					Request: ctrl.Request{
						NamespacedName: types.NamespacedName{
							Namespace: testProjNS,
							Name:      testWDName,
						},
					},
				},
			},
		},
		{
			name:         "missing upstream-namespace label is dropped",
			karmadaObjs:  []client.Object{downstreamNS},
			downstreamWD: newDownstreamWD(nil),
			want:         nil,
		},
		{
			name:        "missing downstream namespace is dropped",
			karmadaObjs: nil, // namespace not present in federation cluster
			downstreamWD: newDownstreamWD(map[string]string{
				downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS,
			}),
			want: nil,
		},
		{
			name: "namespace without cluster label is dropped",
			karmadaObjs: []client.Object{&corev1.Namespace{
				ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr},
			}},
			downstreamWD: newDownstreamWD(map[string]string{
				downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS,
			}),
			want: nil,
		},
		{
			name:        "project cluster not engaged is dropped",
			karmadaObjs: []client.Object{unknownClusterNS},
			downstreamWD: newDownstreamWD(map[string]string{
				downstreamclient.UpstreamOwnerNamespaceLabel: testProjNS,
			}),
			want: nil,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			// The same fake client backs both FederationClient and FederationCluster.
			karmadaClient := newKarmadaFakeClient(tt.karmadaObjs...)
			r := &WorkloadDeploymentFederator{
				// Only testCluster is engaged; the not-engaged case decodes to a
				// different project name and must be dropped by the GetCluster guard.
				mgr:               newFakeMCManager(testCluster, newFakeCluster(karmadaClient)),
				FederationClient:  karmadaClient,
				FederationCluster: newFakeCluster(karmadaClient),
			}

			got := r.mapDownstreamDeploymentToRequest(context.Background(), tt.downstreamWD)
			assert.Equal(t, tt.want, got)
		})
	}
}
+ for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + karmadaClient := newKarmadaFakeClient(tt.karmadaObjs...) + r := &WorkloadDeploymentFederator{ + // Only testCluster is engaged; the not-engaged case decodes to a + // different project name and must be dropped by the GetCluster guard. + mgr: newFakeMCManager(testCluster, newFakeCluster(karmadaClient)), + FederationClient: karmadaClient, + FederationCluster: newFakeCluster(karmadaClient), + } + + got := r.mapDownstreamDeploymentToRequest(context.Background(), tt.downstreamWD) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestProjectClusterNameFromLabel(t *testing.T) { + t.Parallel() + + tests := []struct { + encoded string + want string + }{ + {"cluster-datum-cloud", "datum-cloud"}, + // Org-scoped encodings decode to org/project; the provider keys on the + // bare project name, so only the final path segment is returned. + {"cluster-org_project", "project"}, + {"cluster-_test-project-abc", "test-project-abc"}, + {"cluster-test-project-cluster", "test-project-cluster"}, + } + for _, tt := range tests { + t.Run(tt.encoded, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, projectClusterNameFromLabel(tt.encoded)) + }) + } +} + +func TestPropagationPolicyNameFor(t *testing.T) { + t.Parallel() + + tests := []struct { + cityCode string + want string + }{ + {"LAX", "city-lax"}, + {"lax", "city-lax"}, + {"New York", "city-new-york"}, + {"LOS ANGELES", "city-los-angeles"}, + {"SEA", "city-sea"}, + } + + for _, tt := range tests { + t.Run(tt.cityCode, func(t *testing.T) { + t.Parallel() + got := propagationPolicyNameFor(tt.cityCode) + assert.Equal(t, tt.want, got) + }) + } +} + +// TestWorkloadDeploymentFederator_NoFederationClient verifies that the reconciler +// is a no-op when FederationClient is nil. 
+func TestWorkloadDeploymentFederator_NoFederationClient(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) + r := newTestFederator(projectClient, nil) + r.FederationClient = nil // explicitly nil + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_EmptyClusterNameDropped verifies that a +// reconcile request carrying an empty cluster name is dropped without error +// (and without touching GetCluster), so it can never fall back to the local +// host cluster and spin in a "no matches for kind" requeue loop. +func TestWorkloadDeploymentFederator_EmptyClusterNameDropped(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + req := mcreconcile.Request{ + ClusterName: "", + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{Name: testWDName, Namespace: testProjNS}, + }, + } + result, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen verifies that the +// first reconcile of a brand-new WorkloadDeployment adds the finalizer and +// returns without federating (the finalizer update triggers a re-queue). 
+func TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen(t *testing.T) {
+	t.Parallel()
+
+	ctx := context.Background()
+	project := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) // WD has no finalizer yet
+	karmada := newKarmadaFakeClient()
+	federator := newTestFederator(project, karmada)
+
+	got, reconcileErr := federator.Reconcile(ctx, reconcileRequest())
+	require.NoError(t, reconcileErr)
+	assert.Equal(t, ctrl.Result{}, got)
+
+	// The finalizer must have been written to the project-side WD.
+	wdKey := types.NamespacedName{Name: testWDName, Namespace: testProjNS}
+	var persisted computev1alpha.WorkloadDeployment
+	require.NoError(t, project.Get(ctx, wdKey, &persisted))
+	assert.Contains(t, persisted.Finalizers, federatorFinalizer)
+
+	// Nothing is mirrored to Karmada yet – federation happens on the next reconcile.
+	var mirrored computev1alpha.WorkloadDeploymentList
+	require.NoError(t, karmada.List(ctx, &mirrored))
+	assert.Empty(t, mirrored.Items, "no Karmada WD should be created on first-seen reconcile")
+}
+
+// TestWorkloadDeploymentFederator_FederatesToKarmada checks the full
+// federation pass for a WorkloadDeployment that already has the finalizer:
+// the Karmada namespace, the WorkloadDeployment (with city-code label), and
+// the PropagationPolicy must all have been created.
+func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) {
+	t.Parallel()
+
+	// Arrange: the project WD already carries the finalizer.
+	ctx := context.Background()
+	project := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment(withFinalizer))
+	karmada := newKarmadaFakeClient()
+	federator := newTestFederator(project, karmada)
+
+	// Act: a single reconcile pass.
+	got, reconcileErr := federator.Reconcile(ctx, reconcileRequest())
+	require.NoError(t, reconcileErr)
+	assert.Equal(t, ctrl.Result{}, got)
+
+	// Assert: the namespace, mirrored WD and PropagationPolicy all exist in Karmada.
+
+	// Karmada namespace must exist.
+	nsKey := types.NamespacedName{Name: testKarmadaNSStr}
+	var mirroredNS corev1.Namespace
+	require.NoError(t, karmada.Get(ctx, nsKey, &mirroredNS),
+		"Karmada namespace %q should exist", testKarmadaNSStr)
+
+	// Karmada WorkloadDeployment must exist with the city-code label.
+	wdKey := types.NamespacedName{Name: testWDName, Namespace: testKarmadaNSStr}
+	var mirroredWD computev1alpha.WorkloadDeployment
+	require.NoError(t, karmada.Get(ctx, wdKey, &mirroredWD),
+		"Karmada WorkloadDeployment should exist")
+	assert.Equal(t, testCityCodeLAX, mirroredWD.Labels[cityCodeLabel],
+		"city-code label should be set on Karmada WD")
+	assert.Equal(t, testCityCodeLAX, mirroredWD.Spec.CityCode,
+		"spec.cityCode should be copied from project WD")
+
+	// PropagationPolicy for the city code must exist.
+	policyName := propagationPolicyNameFor(testCityCodeLAX)
+	policyKey := types.NamespacedName{Name: policyName, Namespace: testKarmadaNSStr}
+	var policy karmadapolicyv1alpha1.PropagationPolicy
+	require.NoError(t, karmada.Get(ctx, policyKey, &policy),
+		"PropagationPolicy %q should exist", policyName)
+
+	// The PP must select WorkloadDeployments by the city-code label.
+	require.Len(t, policy.Spec.ResourceSelectors, 1)
+	selector := policy.Spec.ResourceSelectors[0]
+	assert.Equal(t, computev1alpha.GroupVersion.String(), selector.APIVersion)
+	assert.Equal(t, "WorkloadDeployment", selector.Kind)
+	require.NotNil(t, selector.LabelSelector)
+	assert.Equal(t, testCityCodeLAX, selector.LabelSelector.MatchLabels[cityCodeLabel])
+
+	// The PP cluster affinity must target clusters carrying the same city-code.
+	affinity := policy.Spec.Placement.ClusterAffinity
+	require.NotNil(t, affinity)
+	require.NotNil(t, affinity.LabelSelector)
+	assert.Equal(t, testCityCodeLAX,
+		affinity.LabelSelector.MatchLabels[cityCodeLabel])
+}
+
+// TestWorkloadDeploymentFederator_Finalization covers the deletion scenarios:
+// cleanup of Karmada resources and conditional PropagationPolicy removal.
+func TestWorkloadDeploymentFederator_Finalization(t *testing.T) {
+	t.Parallel()
+
+	policyName := propagationPolicyNameFor(testCityCodeLAX)
+
+	cases := []struct {
+		name string
+		// siblings are extra Karmada objects seeded next to the "own" WD and PP.
+		siblings          []client.Object
+		wantPolicyDeleted bool
+	}{
+		{
+			name:              "last WD for city — PropagationPolicy removed",
+			siblings:          nil,
+			wantPolicyDeleted: true,
+		},
+		{
+			name: "other WD for same city remains — PropagationPolicy kept",
+			siblings: []client.Object{
+				// Another WD in the same Karmada namespace carrying the same city-code.
+				&computev1alpha.WorkloadDeployment{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "other-deployment",
+						Namespace: testKarmadaNSStr,
+						Labels:    map[string]string{cityCodeLabel: testCityCodeLAX},
+					},
+					Spec: computev1alpha.WorkloadDeploymentSpec{
+						CityCode:      testCityCodeLAX,
+						PlacementName: "other",
+						WorkloadRef:   computev1alpha.WorkloadReference{Name: "other"},
+						ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1},
+					},
+				},
+			},
+			wantPolicyDeleted: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			// Project side: namespace plus a WD that has the finalizer and a deletion timestamp.
+			deletingWD := testWorkloadDeployment(withFinalizer, withDeletionTimestamp)
+			project := newProjectFakeClient(testProjectNamespace(), deletingWD)
+
+			// Karmada side: namespace, mirrored WD, its PropagationPolicy, then any siblings.
+			mirroredWD := &computev1alpha.WorkloadDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      testWDName,
+					Namespace: testKarmadaNSStr,
+					Labels:    map[string]string{cityCodeLabel: testCityCodeLAX},
+				},
+				Spec: computev1alpha.WorkloadDeploymentSpec{
+					CityCode:      testCityCodeLAX,
+					PlacementName: testDefaultPlacement,
+					WorkloadRef:   computev1alpha.WorkloadReference{Name: "test-workload"},
+					ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1},
+				},
+			}
+			policy := &karmadapolicyv1alpha1.PropagationPolicy{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      policyName,
+					Namespace: testKarmadaNSStr,
+				},
+			}
+			seed := []client.Object{
+				&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}},
+				mirroredWD,
+				policy,
+			}
+			seed = append(seed, tc.siblings...)
+			karmada := newKarmadaFakeClient(seed...)
+
+			federator := newTestFederator(project, karmada)
+
+			got, reconcileErr := federator.Reconcile(context.Background(), reconcileRequest())
+			require.NoError(t, reconcileErr)
+			assert.Equal(t, ctrl.Result{}, got)
+
+			ctx := context.Background()
+
+			// The mirrored WD must no longer exist in Karmada.
+			var leftoverWD computev1alpha.WorkloadDeployment
+			wdErr := karmada.Get(ctx, types.NamespacedName{
+				Name:      testWDName,
+				Namespace: testKarmadaNSStr,
+			}, &leftoverWD)
+			assert.True(t, apierrors.IsNotFound(wdErr),
+				"Karmada WD %q should be deleted after finalization", testWDName)
+
+			// Whether the PropagationPolicy survives depends on remaining siblings.
+			var leftoverPolicy karmadapolicyv1alpha1.PropagationPolicy
+			policyErr := karmada.Get(ctx, types.NamespacedName{
+				Name:      policyName,
+				Namespace: testKarmadaNSStr,
+			}, &leftoverPolicy)
+			if tc.wantPolicyDeleted {
+				assert.True(t, apierrors.IsNotFound(policyErr),
+					"PropagationPolicy should be deleted when no city siblings remain")
+			} else {
+				assert.NoError(t, policyErr,
+					"PropagationPolicy should be kept when other city siblings remain")
+			}
+
+			// The project WD itself should be gone too: removing the federator
+			// finalizer from an object that already has a DeletionTimestamp lets
+			// the API server (and the fake client) garbage-collect it.
+			var projectWD computev1alpha.WorkloadDeployment
+			projectErr := project.Get(ctx,
+				types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &projectWD)
+			assert.True(t, apierrors.IsNotFound(projectErr),
+				"project WD should be gone after finalizer removal (DeletionTimestamp + empty Finalizers = GC)")
+		})
+	}
+}
+
+// TestCleanupPropagationPolicyIfUnused_EmptyCityCode pins the guard that
+// rejects an empty city code: listing by an empty city-code label value would
+// match the wrong deployment set and mis-decide PropagationPolicy cleanup.
+func TestCleanupPropagationPolicyIfUnused_EmptyCityCode(t *testing.T) {
+	t.Parallel()
+
+	karmada := newKarmadaFakeClient()
+	project := newProjectFakeClient(testProjectNamespace())
+	federator := newTestFederator(project, karmada)
+
+	cleanupErr := federator.cleanupPropagationPolicyIfUnused(context.Background(), testKarmadaNSStr, "")
+	require.Error(t, cleanupErr)
+	assert.Contains(t, cleanupErr.Error(), "city code is empty")
+}
+
+// TestWorkloadDeploymentFederator_NotFound verifies that a missing
+// WorkloadDeployment is handled gracefully (no error, no action).
+func TestWorkloadDeploymentFederator_NotFound(t *testing.T) {
+	t.Parallel()
+
+	karmada := newKarmadaFakeClient()
+	project := newProjectFakeClient(testProjectNamespace()) // no WD seeded
+	federator := newTestFederator(project, karmada)
+
+	got, reconcileErr := federator.Reconcile(context.Background(), reconcileRequest())
+	require.NoError(t, reconcileErr)
+	assert.Equal(t, ctrl.Result{}, got)
+}
+
+// TestWorkloadDeploymentFederator_Finalize_DirectCall calls Finalize directly
+// and checks that it fails unless the context carries a cluster name.
+func TestWorkloadDeploymentFederator_Finalize_DirectCall(t *testing.T) {
+	t.Parallel()
+
+	karmada := newKarmadaFakeClient()
+	project := newProjectFakeClient(testProjectNamespace())
+	federator := newTestFederator(project, karmada)
+
+	target := testWorkloadDeployment(withFinalizer)
+
+	// No cluster in the context → Finalize must return an error.
+	_, bareErr := federator.Finalize(context.Background(), target)
+	require.Error(t, bareErr, "Finalize without cluster context should fail")
+	assert.Contains(t, bareErr.Error(), "cluster name not found")
+
+	// Cluster in the context → Finalize must succeed (karmada client returns not-found, which is OK).
+	clusterCtx := mccontext.WithCluster(context.Background(), testCluster)
+	outcome, finalizeErr := federator.Finalize(clusterCtx, target)
+	require.NoError(t, finalizeErr)
+	assert.False(t, outcome.Updated)
+}

From 7d3e9eefb5b97cf6b2e4709d38df5dfd8e20db62 Mon Sep 17 00:00:00 2001
From: Scot Wells
Date: Wed, 10 Jun 2026 13:36:51 -0500
Subject: [PATCH 04/14] feat(controller): add the InstanceProjector

Add the projector that mirrors cell-side Instances back to the management
plane, writing their status (readiness, placement, blocking reasons) onto
the project-scoped Instance so callers see a single view across cells.

Include the shared controller test helpers that build the project/Karmada
fake clients and multi-cluster manager used by the federation tests.
Co-Authored-By: Claude Opus 4.8 (1M context)
---
 internal/controller/instance_projector.go   | 184 ++++++
 .../controller/instance_projector_test.go   | 460 ++++++++++++++
 .../controller/instance_writeback_test.go   | 598 ++++++++++++++++++
 internal/controller/testing_helpers_test.go | 101 +++
 4 files changed, 1343 insertions(+)
 create mode 100644 internal/controller/instance_projector.go
 create mode 100644 internal/controller/instance_projector_test.go
 create mode 100644 internal/controller/instance_writeback_test.go
 create mode 100644 internal/controller/testing_helpers_test.go

diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go
new file mode 100644
index 00000000..4ac3e508
--- /dev/null
+++ b/internal/controller/instance_projector.go
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package controller
+
+import (
+	"context"
+	"fmt"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager"
+	"sigs.k8s.io/multicluster-runtime/pkg/multicluster"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	"go.miloapis.com/milo/pkg/downstreamclient"
+)
+
+// InstanceProjector watches Instance objects written back to the upstream
+// Karmada/management control plane by POP-cell InstanceReconcilers and creates
+// read-only projections in the corresponding project namespace within each
+// project cluster.
+//
+// Namespace resolution: an upstream Instance lives in namespace
+// `ns-<project-namespace-UID>`. The target project namespace is not parsed from
+// that name; it is read from the Instance's UpstreamOwnerNamespaceLabel.
+//
+// Ownership: each projected Instance is owned by the project WorkloadDeployment
+// so that it is garbage-collected via cascading deletion when the deployment is
+// removed from the project cluster.
+//
+// The controller is registered with a standard manager.Manager pointed at the
+// upstream Karmada control plane — NOT the multicluster-runtime manager — so
+// informer watches are scoped to the upstream control plane.
+type InstanceProjector struct {
+	// FederationClient reads Instance objects from the Karmada federation control
+	// plane (configured via --federation-kubeconfig). Must be set before
+	// SetupWithManager is called.
+	FederationClient client.Client
+
+	// MCManager provides access to project cluster clients via GetCluster.
+	MCManager mcmanager.Manager
+}
+
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch
+
+func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx).WithValues("instance", req.NamespacedName)
+
+	var downstreamInstance computev1alpha.Instance
+	if err := r.FederationClient.Get(ctx, req.NamespacedName, &downstreamInstance); err != nil {
+		if apierrors.IsNotFound(err) {
+			// Instance was deleted from the upstream control plane. Projections
+			// are owned by the project WorkloadDeployment, so cascading deletion
+			// handles cleanup.
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("failed getting upstream instance: %w", err)
+	}
+
+	// Federation-plane Instances exist exclusively as write-back copies, and
+	// the InstanceReconciler stamps both upstream-owner labels atomically when
+	// it writes the copy — "not ours" cannot occur. A missing cluster label is
+	// a stamping-invariant violation that never self-heals, so surface it as an
+	// error for backoff and visibility rather than silently dropping the
+	// projection.
+	encodedClusterName := downstreamInstance.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]
+	if encodedClusterName == "" {
+		return ctrl.Result{}, fmt.Errorf("downstream instance %s/%s is missing the %s label; cannot resolve the project cluster",
+			downstreamInstance.Namespace, downstreamInstance.Name,
+			downstreamclient.UpstreamOwnerClusterNameLabel)
+	}
+
+	// The encoded form is "cluster-<cluster-name>" with "/" replaced by "_".
+	clusterName := DecodeClusterName(encodedClusterName)
+
+	projectCluster, err := r.MCManager.GetCluster(ctx, multicluster.ClusterName(clusterName))
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err)
+	}
+	projectClient := projectCluster.GetClient()
+
+	// The InstanceReconciler stamps UpstreamOwnerNamespaceLabel with the project
+	// namespace name (read from the upstream Karmada namespace label set by the federator),
+	// so we can resolve the target namespace directly without scanning. Both
+	// upstream-owner labels are stamped together with non-empty values, so a
+	// cluster label without a namespace label is an invariant violation that
+	// never self-heals — surface it as an error for backoff and visibility
+	// rather than requeueing at a flat rate.
+	targetNamespace := downstreamInstance.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]
+	if targetNamespace == "" {
+		return ctrl.Result{}, fmt.Errorf("downstream instance %s/%s carries %s but is missing the %s label; cannot resolve the project namespace",
+			downstreamInstance.Namespace, downstreamInstance.Name,
+			downstreamclient.UpstreamOwnerClusterNameLabel, downstreamclient.UpstreamOwnerNamespaceLabel)
+	}
+
+	// Resolve the owning WorkloadDeployment by NAME in the project cluster.
+	// Core invariant: the ownerReference MUST be built from a project-cluster
+	// object obtained via projectClient.Get — never from any edge/Karmada
+	// identity. The WD name is stable across all planes (project cluster,
+	// Karmada, edge) and is the correct cross-plane identifier, carried by
+	// WorkloadDeploymentNameLabel (stamped by the edge stateful control
+	// strategy).
+	wdName := downstreamInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel]
+	if wdName == "" {
+		// A write-back copy that cannot identify its WorkloadDeployment violates
+		// the same stamping invariant as the labels above — surface it as an
+		// error for backoff and visibility instead of silently dropping the
+		// projection.
+		return ctrl.Result{}, fmt.Errorf("downstream instance %s/%s is missing the %s label; cannot resolve its WorkloadDeployment",
+			downstreamInstance.Namespace, downstreamInstance.Name, computev1alpha.WorkloadDeploymentNameLabel)
+	}
+
+	// Fetch the project-cluster WD directly by name. The returned object carries
+	// the project-cluster metadata.uid — the only UID that GC in the project
+	// cluster can act on.
+	var ownerWD computev1alpha.WorkloadDeployment
+	if err := projectClient.Get(ctx, client.ObjectKey{Namespace: targetNamespace, Name: wdName}, &ownerWD); err != nil {
+		if apierrors.IsNotFound(err) {
+			// Never create an ownerless projection. The controller only watches
+			// Instances, so no event fires when the WD appears — returning an
+			// error retries with backoff and surfaces the wait in error metrics.
+			// A transient ordering race (Instance projected before
+			// WorkloadReconciler created the project WD) resolves on retry; a
+			// deleted WD ends the retries once its write-back copies are gone.
+			return ctrl.Result{}, fmt.Errorf("workload deployment %q not found in project cluster %q for instance %s/%s",
+				wdName, clusterName, downstreamInstance.Namespace, downstreamInstance.Name)
+		}
+		return ctrl.Result{}, fmt.Errorf("failed getting WorkloadDeployment %s/%s in project cluster %s: %w",
+			targetNamespace, wdName, clusterName, err)
+	}
+
+	projection := &computev1alpha.Instance{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      downstreamInstance.Name,
+			Namespace: targetNamespace,
+		},
+	}
+
+	operationResult, err := controllerutil.CreateOrUpdate(ctx, projectClient, projection, func() error {
+		// Propagate upstream tracking labels so consumers can filter by origin.
+		if projection.Labels == nil {
+			projection.Labels = make(map[string]string)
+		}
+		for k, v := range downstreamInstance.Labels {
+			projection.Labels[k] = v
+		}
+
+		projection.Spec = downstreamInstance.Spec
+
+		// Attach an owner reference using the live project-cluster WD object.
+		// controllerutil.SetOwnerReference reads UID and GVK from ownerWD, which
+		// was fetched from projectClient — satisfying the core invariant.
+		return controllerutil.SetOwnerReference(&ownerWD, projection, projectCluster.GetScheme())
+	})
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed upserting Instance projection in %s/%s: %w", clusterName, targetNamespace, err)
+	}
+
+	logger.Info("reconciled Instance projection", "operation", operationResult, "namespace", targetNamespace, "cluster", clusterName)
+
+	// Sync status — status is a separate subresource, so it gets its own Update call.
+	projection.Status = downstreamInstance.Status
+	if err := projectClient.Status().Update(ctx, projection); err != nil && !apierrors.IsNotFound(err) {
+		return ctrl.Result{}, fmt.Errorf("failed updating Instance projection status: %w", err)
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager registers the InstanceProjector with upstreamMgr, a standard
+// manager.Manager configured against the upstream Karmada/federation control plane
+// REST config. FederationClient and MCManager must be set before calling this method.
+func (r *InstanceProjector) SetupWithManager(upstreamMgr manager.Manager) error {
+	return ctrl.NewControllerManagedBy(upstreamMgr).
+		For(&computev1alpha.Instance{}).
+		Named("instance-projector").
+		Complete(r)
+}
diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go
new file mode 100644
index 00000000..ad9c374f
--- /dev/null
+++ b/internal/controller/instance_projector_test.go
@@ -0,0 +1,460 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package controller
+
+import (
+	"context"
+	"maps"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	"go.miloapis.com/milo/pkg/downstreamclient"
+)
+
+// ─── Test constants ───────────────────────────────────────────────────────────
+
+const (
+	// projTestCluster is the project cluster name used in projector tests.
+	projTestCluster = "project-cluster"
+
+	// projTestProjNS is the project namespace name.
+	projTestProjNS = "proj-namespace"
+
+	// projTestProjNSUID is the project namespace UID embedded in the Karmada
+	// namespace name below.
+	projTestProjNSUID = types.UID("deadbeef-1111-2222-3333-444455556666")
+
+	// projTestKarmadaNS is the Karmada namespace derived from the UID above
+	// via the ns-<namespace-UID> convention.
+	projTestKarmadaNS = "ns-deadbeef-1111-2222-3333-444455556666"
+
+	// projTestInstanceName is the name of the Karmada (and projected) Instance.
+	// Follows the "<wd-name>-<index>" convention: "my-wd-0".
+	projTestInstanceName = "my-wd-0"
+
+	// projTestWDUID is the UID of the owning WorkloadDeployment as it exists in
+	// the PROJECT cluster. This is the UID that owner references must use, since
+	// Kubernetes GC in the project cluster only knows this UID.
+	projTestWDUID = types.UID("project-wd-uid-9999-aaaa-bbbb-cccc")
+
+	// projTestEdgeWDUID is the UID of the WorkloadDeployment as it exists on the
+	// EDGE/Karmada plane. Each plane mints its own UID, so this is intentionally
+	// distinct from projTestWDUID. The WorkloadDeploymentUIDLabel on downstream
+	// Instances carries this edge UID — NOT the project UID.
+	projTestEdgeWDUID = types.UID("edge-uid-0000-1111-2222-3333")
+
+	// projTestWDName is the name of the owning WorkloadDeployment. The name is
+	// the same across all planes (project cluster, Karmada, edge) and is the
+	// correct cross-plane stable identifier.
+	projTestWDName = "my-wd"
+
+	// projTestWorkloadUID is the UID of the owning Workload (carried via WorkloadUIDLabel).
+	projTestWorkloadUID = "wl-uid-1111-2222-3333-4444"
+
+	// projTestInstanceIndex is the ordinal index of the instance (carried via InstanceIndexLabel).
+	projTestInstanceIndex = "0"
+)
+
+// encodedCluster returns the value of the UpstreamOwnerClusterNameLabel for
+// projTestCluster ("cluster-<cluster-name>").
+func encodedCluster() string {
+	return "cluster-" + projTestCluster
+}
+
+// ─── Helpers ─────────────────────────────────────────────────────────────────
+
+// projTestProjectNS builds the project cluster Namespace with the stable test UID.
+func projTestProjectNS() *corev1.Namespace {
+	return &corev1.Namespace{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: projTestProjNS,
+			UID:  projTestProjNSUID,
+		},
+	}
+}
+
+// projTestWorkloadDeployment builds the project WorkloadDeployment that owns
+// projected Instances.
+func projTestWorkloadDeployment() *computev1alpha.WorkloadDeployment {
+	return &computev1alpha.WorkloadDeployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      projTestWDName,
+			Namespace: projTestProjNS,
+			UID:       projTestWDUID,
+		},
+		Spec: computev1alpha.WorkloadDeploymentSpec{
+			CityCode:      "LAX",
+			PlacementName: testDefaultPlacement,
+			WorkloadRef:   computev1alpha.WorkloadReference{Name: "my-workload"},
+			ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1},
+		},
+	}
+}
+
+// projTestKarmadaInstance builds a Karmada Instance with the default labels
+// needed for the InstanceProjector to act on it. Optional label overrides are
+// applied last.
+func projTestKarmadaInstance(labelOverrides map[string]string) *computev1alpha.Instance {
+	labels := map[string]string{
+		downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(),
+		downstreamclient.UpstreamOwnerNamespaceLabel:   projTestProjNS,
+		// WorkloadDeploymentUIDLabel carries the EDGE UID — intentionally distinct
+		// from projTestWDUID (the project-cluster WD UID). Owner references must
+		// never be built from this value.
+		computev1alpha.WorkloadDeploymentUIDLabel:  string(projTestEdgeWDUID),
+		computev1alpha.WorkloadDeploymentNameLabel: projTestWDName,
+		computev1alpha.WorkloadUIDLabel:            projTestWorkloadUID,
+		computev1alpha.InstanceIndexLabel:          projTestInstanceIndex,
+	}
+	maps.Copy(labels, labelOverrides)
+	return &computev1alpha.Instance{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      projTestInstanceName,
+			Namespace: projTestKarmadaNS,
+			Labels:    labels,
+		},
+		Spec: computev1alpha.InstanceSpec{
+			// Minimal valid spec — actual content is copied to the projection.
+		},
+	}
+}
+
+// newTestProjector wires an InstanceProjector with the given Karmada client and
+// a project cluster that serves the supplied project client.
+func newTestProjector(karmadaClient client.Client, projectClient client.Client) *InstanceProjector {
+	projectCluster := newFakeCluster(projectClient)
+	mgr := newFakeMCManager(projTestCluster, projectCluster)
+	return &InstanceProjector{
+		FederationClient: karmadaClient,
+		MCManager:        mgr,
+	}
+}
+
+// projectorRequest builds a ctrl.Request for the test Instance in Karmada.
+func projectorRequest() ctrl.Request {
+	return ctrl.Request{
+		NamespacedName: types.NamespacedName{
+			Name:      projTestInstanceName,
+			Namespace: projTestKarmadaNS,
+		},
+	}
+}
+
+// ─── Tests ───────────────────────────────────────────────────────────────────
+
+// TestInstanceProjector_Reconcile is the primary table-driven test.
+func TestInstanceProjector_Reconcile(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+
+		// karmadaInstance is what exists in the Karmada API server.
+		// A nil value means the Instance does not exist (not-found path).
+		karmadaInstance *computev1alpha.Instance
+
+		// projectObjs are pre-populated in the project cluster fake client.
+		projectObjs []client.Object
+
+		// wantProjection controls whether a projected Instance should appear.
+		wantProjection bool
+
+		// wantOwnerRef controls whether the projected Instance should have an
+		// owner reference pointing to the project WorkloadDeployment.
+		wantOwnerRef bool
+
+		// wantErr controls whether the reconcile should return an error.
+		wantErr bool
+	}{
+		{
+			name:            "happy path — instance projected with owner reference",
+			karmadaInstance: projTestKarmadaInstance(nil),
+			projectObjs: []client.Object{
+				projTestProjectNS(),
+				projTestWorkloadDeployment(),
+			},
+			wantProjection: true,
+			wantOwnerRef:   true,
+		},
+		{
+			// Cross-plane UID regression test: the Karmada Instance carries the EDGE
+			// WD UID in WorkloadDeploymentUIDLabel (projTestEdgeWDUID), which is
+			// intentionally different from the project-cluster WD UID (projTestWDUID).
+			// The owner reference on the projection must use the project-cluster UID.
+			// This test fails if someone reintroduces UID-based matching against the
+			// edge/Karmada plane.
+			name:            "WD name label present, edge UID differs from project UID — owner ref UID equals project WD UID",
+			karmadaInstance: projTestKarmadaInstance(nil), // carries projTestEdgeWDUID, not projTestWDUID
+			projectObjs: []client.Object{
+				projTestProjectNS(),
+				projTestWorkloadDeployment(), // UID is projTestWDUID
+			},
+			wantProjection: true,
+			wantOwnerRef:   true,
+		},
+		{
+			// When the project WD does not yet exist (transient ordering race —
+			// Instance projected before WorkloadReconciler created the project WD)
+			// the projector must return an error and NOT create an ownerless
+			// projection: its only watch is the Instance, so nothing fires when
+			// the WD appears — error backoff is the retry mechanism.
+			name:            "project WD not found — error, no ownerless projection created",
+			karmadaInstance: projTestKarmadaInstance(nil),
+			projectObjs: []client.Object{
+				projTestProjectNS(),
+				// No WorkloadDeployment — simulates the transient ordering race.
+			},
+			wantProjection: false,
+			wantErr:        true,
+		},
+		{
+			// A write-back copy that cannot identify its WorkloadDeployment
+			// violates the stamping invariant — the projector must return an
+			// error rather than silently drop the projection.
+			name: "WD name label absent — error, no projection",
+			karmadaInstance: projTestKarmadaInstance(map[string]string{
+				computev1alpha.WorkloadDeploymentNameLabel: "",
+			}),
+			projectObjs:    []client.Object{projTestProjectNS()},
+			wantProjection: false,
+			wantErr:        true,
+		},
+		{
+			// Federation-plane Instances are exclusively write-back copies and the
+			// write-back stamps both upstream-owner labels atomically, so a missing
+			// cluster label is a stamping-invariant violation, not a foreign object.
+			name: "missing upstream-cluster-name label — error",
+			karmadaInstance: &computev1alpha.Instance{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      projTestInstanceName,
+					Namespace: projTestKarmadaNS,
+					// Intentionally no UpstreamOwnerClusterNameLabel.
+					Labels: map[string]string{
+						"some-other-label": "value",
+					},
+				},
+			},
+			projectObjs:    []client.Object{projTestProjectNS()},
+			wantProjection: false,
+			wantErr:        true,
+		},
+		{
+			// The write-back stamps both upstream-owner labels together, so a
+			// cluster label without a namespace label is an invariant violation
+			// that never self-heals — the projector must return an error rather
+			// than requeue at a flat rate.
+			name: "missing upstream-namespace label — error",
+			karmadaInstance: projTestKarmadaInstance(map[string]string{
+				// Override: blank out the upstream namespace label (set to "").
+				downstreamclient.UpstreamOwnerNamespaceLabel: "",
+			}),
+			projectObjs:    []client.Object{projTestProjectNS()},
+			wantProjection: false,
+			wantErr:        true,
+		},
+		{
+			name:            "karmada instance not found — no-op",
+			karmadaInstance: nil, // causes Get to return NotFound
+			projectObjs:     []client.Object{projTestProjectNS()},
+			wantProjection:  false,
+		},
+		{
+			// Verify that all linking labels (WorkloadUID, WorkloadDeploymentUID,
+			// WorkloadDeploymentNameLabel, InstanceIndex) survive from the Karmada
+			// write-back object through to the projection.
+			name:            "all linking labels propagated from Karmada to projection",
+			karmadaInstance: projTestKarmadaInstance(nil),
+			projectObjs: []client.Object{
+				projTestProjectNS(),
+				projTestWorkloadDeployment(),
+			},
+			wantProjection: true,
+			wantOwnerRef:   true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			var karmadaObjs []client.Object
+			if tt.karmadaInstance != nil {
+				karmadaObjs = append(karmadaObjs, tt.karmadaInstance)
+			}
+			karmadaClient := newKarmadaFakeClient(karmadaObjs...)
+
+			projectClient := fake.NewClientBuilder().
+				WithScheme(newProjectScheme()).
+				WithObjects(tt.projectObjs...).
+				WithStatusSubresource(&computev1alpha.Instance{}).
+				Build()
+
+			r := newTestProjector(karmadaClient, projectClient)
+
+			req := projectorRequest()
+			result, err := r.Reconcile(context.Background(), req)
+
+			if tt.wantErr {
+				require.Error(t, err)
+				assert.Zero(t, result.RequeueAfter,
+					"errors rely on controller backoff, not a flat requeue")
+				// No error path may leave a projection behind — in particular,
+				// an ownerless projection must never be created.
+				var projection computev1alpha.Instance
+				getErr := projectClient.Get(context.Background(), types.NamespacedName{
+					Name:      req.Name,
+					Namespace: projTestProjNS,
+				}, &projection)
+				assert.True(t, isNotFound(getErr),
+					"expected no projection in project namespace on error, but found one (or unexpected error: %v)", getErr)
+				return
+			}
+			require.NoError(t, err)
+			assert.Equal(t, ctrl.Result{}, result)
+
+			ctx := context.Background()
+
+			// Check whether a projected Instance exists in the project namespace.
+			var projection computev1alpha.Instance
+			err = projectClient.Get(ctx, types.NamespacedName{
+				Name:      projTestInstanceName,
+				Namespace: projTestProjNS,
+			}, &projection)
+
+			if !tt.wantProjection {
+				assert.True(t, isNotFound(err),
+					"expected no projection in project namespace, but found one (or unexpected error: %v)", err)
+				return
+			}
+
+			require.NoError(t, err, "expected projection to exist in project namespace")
+
+			// Labels should be copied from the Karmada instance.
+			if tt.karmadaInstance != nil {
+				for k, v := range tt.karmadaInstance.Labels {
+					assert.Equal(t, v, projection.Labels[k],
+						"projection label %q should match Karmada instance label", k)
+				}
+			}
+
+			// Linking labels must survive from the Karmada instance to the projection
+			// so that the CLI can resolve Workload name, city, and instance ordinal.
+			if tt.wantProjection && tt.karmadaInstance != nil {
+				assert.Equal(t,
+					tt.karmadaInstance.Labels[computev1alpha.WorkloadUIDLabel],
+					projection.Labels[computev1alpha.WorkloadUIDLabel],
+					"WorkloadUIDLabel must be propagated to the projection")
+				assert.Equal(t,
+					tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel],
+					projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel],
+					"WorkloadDeploymentUIDLabel must be propagated to the projection")
+				assert.Equal(t,
+					tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel],
+					projection.Labels[computev1alpha.WorkloadDeploymentNameLabel],
+					"WorkloadDeploymentNameLabel must be propagated to the projection")
+				assert.Equal(t,
+					tt.karmadaInstance.Labels[computev1alpha.InstanceIndexLabel],
+					projection.Labels[computev1alpha.InstanceIndexLabel],
+					"InstanceIndexLabel must be propagated to the projection")
+			}
+
+			if tt.wantOwnerRef {
+				require.NotEmpty(t, projection.OwnerReferences,
+					"projected instance should have an owner reference to the WorkloadDeployment")
+				ownerRef := projection.OwnerReferences[0]
+				// Core invariant: owner ref UID must be the PROJECT-cluster WD UID.
+				assert.Equal(t, string(projTestWDUID), string(ownerRef.UID),
+					"owner reference UID must match the project-cluster WorkloadDeployment UID")
+				// Regression guard: the edge UID must NOT appear in the owner ref.
+				// If this assertion fails, someone reintroduced cross-plane UID matching.
+				assert.NotEqual(t, string(projTestEdgeWDUID), string(ownerRef.UID),
+					"owner reference UID must NOT be the edge/Karmada WD UID")
+				assert.Equal(t, projTestWDName, ownerRef.Name,
+					"owner reference name should match the WorkloadDeployment name")
+			} else {
+				assert.Empty(t, projection.OwnerReferences,
+					"projected instance should have no owner reference")
+			}
+		})
+	}
+}
+
+// TestInstanceProjector_SpecCopied verifies that the Instance spec is correctly
+// propagated from the Karmada instance to the projection.
+func TestInstanceProjector_SpecCopied(t *testing.T) {
+	t.Parallel()
+
+	karmadaInst := projTestKarmadaInstance(nil)
+	// Set a recognizable spec field we can assert against.
+	karmadaInst.Spec.Controller = &computev1alpha.InstanceController{
+		SchedulingGates: []computev1alpha.SchedulingGate{{Name: "test-gate"}},
+	}
+
+	projectClient := fake.NewClientBuilder().
+		WithScheme(newProjectScheme()).
+		WithObjects(projTestProjectNS(), projTestWorkloadDeployment()).
+		WithStatusSubresource(&computev1alpha.Instance{}).
+		Build()
+	karmadaClient := newKarmadaFakeClient(karmadaInst)
+
+	r := newTestProjector(karmadaClient, projectClient)
+	_, err := r.Reconcile(context.Background(), projectorRequest())
+	require.NoError(t, err)
+
+	var projection computev1alpha.Instance
+	require.NoError(t, projectClient.Get(context.Background(),
+		types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS},
+		&projection))
+
+	require.NotNil(t, projection.Spec.Controller)
+	require.Len(t, projection.Spec.Controller.SchedulingGates, 1)
+	assert.Equal(t, "test-gate", projection.Spec.Controller.SchedulingGates[0].Name)
+}
+
+// TestInstanceProjector_NamespaceResolution verifies that the projector resolves
+// the target project namespace directly from the UpstreamOwnerNamespaceLabel on
+// the Karmada Instance, landing the projection in the correct namespace.
+func TestInstanceProjector_NamespaceResolution(t *testing.T) {
+	t.Parallel()
+
+	karmadaInst := projTestKarmadaInstance(nil)
+	projectClient := fake.NewClientBuilder().
+		WithScheme(newProjectScheme()).
+		WithObjects(
+			projTestProjectNS(),
+			projTestWorkloadDeployment(),
+		).
+		WithStatusSubresource(&computev1alpha.Instance{}).
+		Build()
+	karmadaClient := newKarmadaFakeClient(karmadaInst)
+
+	r := newTestProjector(karmadaClient, projectClient)
+	result, err := r.Reconcile(context.Background(), projectorRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+
+	// Projection must land in the namespace named by the label.
+	var projection computev1alpha.Instance
+	require.NoError(t, projectClient.Get(context.Background(),
+		types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS},
+		&projection))
+}
+
+// isNotFound returns true only when err is a Kubernetes not-found error; a nil
+// error means the object exists and returns false.
+// Used to distinguish "no projection created" from "projection exists but Get failed".
+func isNotFound(err error) bool { + if err == nil { + return false // object exists — not the "not found" case + } + return client.IgnoreNotFound(err) == nil +} diff --git a/internal/controller/instance_writeback_test.go b/internal/controller/instance_writeback_test.go new file mode 100644 index 00000000..5c5020cf --- /dev/null +++ b/internal/controller/instance_writeback_test.go @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── write-back test constants ──────────────────────────────────────────────── + +const ( + wbTestClusterName = "edge-cluster" + wbTestNamespace = "ns-proj-uid-1234" + wbTestInstanceName = "inst-0" + wbTestWorkloadUID = "wl-uid-aaaa-bbbb" + wbTestWDUID = "wd-uid-cccc-dddd" + wbTestInstanceIndex = "0" + wbTestUpstreamNS = "proj-namespace" + wbTestEncodedCluster = "cluster-" + wbTestClusterName + + // The four self-describing labels. + wbTestWDName = "my-workload-deployment" + wbTestCityCode = "DFW" + wbTestWorkloadName = "my-workload" + wbTestPlacement = "us-central" +) + +// wbTestCellInstance builds a cell-side Instance with all seven owned labels +// pre-populated, as addInstanceControllerLabels would produce. 
+func wbTestCellInstance() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + computev1alpha.WorkloadDeploymentNameLabel: wbTestWDName, + computev1alpha.CityCodeLabel: wbTestCityCode, + computev1alpha.WorkloadNameLabel: wbTestWorkloadName, + computev1alpha.PlacementNameLabel: wbTestPlacement, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceReadyReasonAvailable, + Message: "Instance is ready", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } +} + +// wbTestDownstreamNS returns a Namespace object in the downstream (Karmada) +// control plane that carries the upstream routing labels, simulating the +// namespace stamped by NSO's MappedNamespaceResourceStrategy. +func wbTestDownstreamNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + }, + }, + } +} + +// newWriteBackReconciler returns an InstanceReconciler with only its +// FederationClient set to federationClient; all other fields keep their zero values.
+func newWriteBackReconciler(federationClient client.Client) *InstanceReconciler { + return &InstanceReconciler{ + FederationClient: federationClient, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestWriteBackToUpstream_CreatePath_AllLabels verifies that the first +// write-back to an empty Karmada control plane creates an Instance with all five +// expected labels (two routing + three linking) and also writes the cell-side +// status via Status().Update. +func TestWriteBackToUpstream_CreatePath_AllLabels(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + // Verify the created Karmada Instance carries all five expected labels. 
+ var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestEncodedCluster, created.Labels[downstreamclient.UpstreamOwnerClusterNameLabel], + "UpstreamOwnerClusterNameLabel must be set") + assert.Equal(t, wbTestUpstreamNS, created.Labels[downstreamclient.UpstreamOwnerNamespaceLabel], + "UpstreamOwnerNamespaceLabel must be set") + assert.Equal(t, wbTestWorkloadUID, created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestWDUID, created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestInstanceIndex, created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated from cell instance") + + // Status must have been written via Status().Update after Create. + require.Len(t, created.Status.Conditions, 1, + "Status().Update must be called after Create; condition should be present") + assert.Equal(t, computev1alpha.InstanceReady, created.Status.Conditions[0].Type) + assert.Equal(t, metav1.ConditionTrue, created.Status.Conditions[0].Status) +} + +// TestWriteBackToUpstream_UpdatePath_LabelMerge verifies that an +// existing Karmada Instance with a Karmada-managed label retains that label +// after the update path runs, while all five owned labels are written correctly. +func TestWriteBackToUpstream_UpdatePath_LabelMerge(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + // Pre-populate the Karmada control plane with a pre-existing Instance + // carrying only the two routing labels plus a simulated Karmada-managed label.
+ existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + // All five owned labels must be present with correct values. + assert.Equal(t, wbTestEncodedCluster, updated.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]) + assert.Equal(t, wbTestUpstreamNS, updated.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]) + assert.Equal(t, wbTestWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel]) + assert.Equal(t, wbTestWDUID, updated.Labels[computev1alpha.WorkloadDeploymentUIDLabel]) + assert.Equal(t, wbTestInstanceIndex, updated.Labels[computev1alpha.InstanceIndexLabel]) + + // The Karmada-managed label must survive the merge (not be replaced/deleted). 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after merge; should not be overwritten") +} + +// TestWriteBackToUpstream_LabelChangeTriggerUpdate verifies that +// a changed linking label on the cell instance causes the Karmada object to +// be updated with the new value. +func TestWriteBackToUpstream_LabelChangeTriggerUpdate(t *testing.T) { + t.Parallel() + + newWorkloadUID := "wl-uid-CHANGED" + + // Pre-populate with the five-label map from a previous write-back. + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Modify the WorkloadUIDLabel on the cell instance. 
+ cellInstance := wbTestCellInstance() + cellInstance.Labels[computev1alpha.WorkloadUIDLabel] = newWorkloadUID + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, newWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel change on the cell instance must be reflected in the Karmada object") +} + +// TestWriteBackToUpstream_MissingLinkingLabels_Error verifies that +// writeBackToUpstream refuses to create an upstream copy when the cell-side +// Instance lacks the linking labels (e.g. before the stateful control +// strategy's backfill has converged it). The error must name every missing +// label so the wait is diagnosable, and no upstream object may be created — +// an Instance with empty identity labels could never be linked back to its +// owners. +func TestWriteBackToUpstream_MissingLinkingLabels_Error(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Instance with nil Labels — simulates an early reconcile before the + // linking labels are stamped. 
+ cellInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.Error(t, err) + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + computev1alpha.WorkloadDeploymentNameLabel, + computev1alpha.CityCodeLabel, + computev1alpha.WorkloadNameLabel, + computev1alpha.PlacementNameLabel, + } { + assert.Contains(t, err.Error(), key, + "error must name missing label %q", key) + } + + // No upstream Instance may be created with empty identity labels. + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when linking labels are missing (got err: %v)", getErr) +} + +// TestWriteBackToUpstream_MissingLinkingLabels_NoUpdate verifies that an +// existing upstream copy is left untouched when the cell-side Instance has +// lost its linking labels: the write-back must error out before the update +// path can overwrite the previously written identity with empty values. 
+func TestWriteBackToUpstream_MissingLinkingLabels_NoUpdate(t *testing.T) { + t.Parallel() + + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Cell instance lost its two UID linking labels; every other owned label remains. + cellInstance := wbTestCellInstance() + delete(cellInstance.Labels, computev1alpha.WorkloadUIDLabel) + delete(cellInstance.Labels, computev1alpha.WorkloadDeploymentUIDLabel) + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.Error(t, err) + assert.Contains(t, err.Error(), computev1alpha.WorkloadUIDLabel) + assert.Contains(t, err.Error(), computev1alpha.WorkloadDeploymentUIDLabel) + assert.NotContains(t, err.Error(), computev1alpha.InstanceIndexLabel, + "a present label must not be reported missing") + + // The existing upstream copy must keep its previously written identity.
+ var existing computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &existing)) + assert.Equal(t, wbTestWorkloadUID, existing.Labels[computev1alpha.WorkloadUIDLabel], + "existing WorkloadUIDLabel must not be overwritten with an empty value") + assert.Equal(t, wbTestWDUID, existing.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "existing WorkloadDeploymentUIDLabel must not be overwritten with an empty value") +} + +// TestWriteBackToUpstream_MissingSelfDescribingLabel_Error verifies that the +// self-describing labels are required, not best-effort: a cell Instance +// missing only WorkloadDeploymentNameLabel must fail write-back with an error +// naming exactly that label, and no upstream copy may be created. +func TestWriteBackToUpstream_MissingSelfDescribingLabel_Error(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newWriteBackReconciler(upstreamClient) + + cellInstance := wbTestCellInstance() + delete(cellInstance.Labels, computev1alpha.WorkloadDeploymentNameLabel) + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.Error(t, err) + assert.Contains(t, err.Error(), computev1alpha.WorkloadDeploymentNameLabel, + "error must name the missing label") + assert.NotContains(t, err.Error(), computev1alpha.CityCodeLabel, + "a present label must not be reported missing") + + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when a required label is missing (got err: %v)", getErr) +} + +// TestWriteBackToUpstream_NamespaceIdentity_Errors verifies that the +// federation-plane namespace is the strict source of upstream identity: +// a missing namespace or a namespace lacking either upstream-owner label must +// fail the write-back with an error naming the namespace (and label), and no +// upstream copy may be created — there are no fallback identity values. +func TestWriteBackToUpstream_NamespaceIdentity_Errors(t *testing.T) { + t.Parallel() + + nsWithoutLabel := func(missing string) *corev1.Namespace { + ns := wbTestDownstreamNS() + delete(ns.Labels, missing) + return ns + } + + tests := []struct { + name string + // ns is the federation-plane namespace; nil means it does not exist. + ns *corev1.Namespace + // wantInError must all appear in the returned error. 
+ wantInError []string + }{ + { + name: "namespace missing — error, no copy", + ns: nil, + wantInError: []string{wbTestNamespace}, + }, + { + name: "namespace lacks upstream-namespace label — error names namespace and label", + ns: nsWithoutLabel(downstreamclient.UpstreamOwnerNamespaceLabel), + wantInError: []string{wbTestNamespace, downstreamclient.UpstreamOwnerNamespaceLabel}, + }, + { + name: "namespace lacks upstream-cluster-name label — error names namespace and label", + ns: nsWithoutLabel(downstreamclient.UpstreamOwnerClusterNameLabel), + wantInError: []string{wbTestNamespace, downstreamclient.UpstreamOwnerClusterNameLabel}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + builder := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithStatusSubresource(&computev1alpha.Instance{}) + if tt.ns != nil { + builder = builder.WithObjects(tt.ns) + } + upstreamClient := builder.Build() + + r := newWriteBackReconciler(upstreamClient) + + err := r.writeBackToUpstream(context.Background(), wbTestCellInstance()) + require.Error(t, err) + for _, want := range tt.wantInError { + assert.Contains(t, err.Error(), want) + } + + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when upstream identity is unresolvable (got err: %v)", getErr) + }) + } +} + +// TestWriteBackToUpstream_NamespaceGetFailure_Error verifies that a transient +// failure reading the federation-plane namespace aborts the write-back instead +// of proceeding with derived identity values. +func TestWriteBackToUpstream_NamespaceGetFailure_Error(t *testing.T) { + t.Parallel() + + getFailure := errors.New("federation API unavailable") + upstreamClient := fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). 
+ WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + WithInterceptorFuncs(interceptor.Funcs{ + Get: func(ctx context.Context, cl client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + if _, ok := obj.(*corev1.Namespace); ok { + return getFailure + } + return cl.Get(ctx, key, obj, opts...) + }, + }). + Build() + + r := newWriteBackReconciler(upstreamClient) + + err := r.writeBackToUpstream(context.Background(), wbTestCellInstance()) + require.ErrorIs(t, err, getFailure) + + var created computev1alpha.Instance + getErr := upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created) + assert.True(t, apierrors.IsNotFound(getErr), + "no upstream write-back copy may be created when the namespace read fails (got err: %v)", getErr) +} + +// TestWriteBackToUpstream_FourNewLabels_CreatePath verifies that the four +// self-describing labels (WorkloadDeploymentName, CityCode, WorkloadName, +// PlacementName) are written to the Karmada object on the create path. +func TestWriteBackToUpstream_FourNewLabels_CreatePath(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestWDName, created.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestCityCode, created.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must propagate to Karmada object") + assert.Equal(t, wbTestWorkloadName, created.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestPlacement, created.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must propagate to Karmada object") +} + +// TestWriteBackToUpstream_FourNewLabels_UpdatePath verifies that the four +// self-describing labels are written on the update path and existing Karmada- +// managed labels on the downstream object are preserved. +func TestWriteBackToUpstream_FourNewLabels_UpdatePath(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). 
+ WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, wbTestWDName, updated.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be set on update path") + assert.Equal(t, wbTestCityCode, updated.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must be set on update path") + assert.Equal(t, wbTestWorkloadName, updated.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be set on update path") + assert.Equal(t, wbTestPlacement, updated.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must be set on update path") + + // Karmada-managed label must survive the merge. 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after the update merge") +} diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go new file mode 100644 index 00000000..cc3d3d9f --- /dev/null +++ b/internal/controller/testing_helpers_test.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Scheme helpers ─────────────────────────────────────────────────────────── + +// newProjectScheme builds a runtime.Scheme with the types needed by the project +// cluster (corev1 + compute). +func newProjectScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + return s +} + +// newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada +// API server (corev1 + compute + karmada policy). +func newKarmadaScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + _ = karmadapolicyv1alpha1.Install(s) + return s +} + +// newProjectFakeClient returns a fake client pre-populated with the given +// objects and the project scheme. +func newProjectFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(objs...). + WithStatusSubresource(objs...). 
+ Build() +} + +// newKarmadaFakeClient returns a fake client pre-populated with the given +// objects and the Karmada scheme. +func newKarmadaFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() +} + +// ─── Fake cluster.Cluster ───────────────────────────────────────────────────── + +// fakeCluster is a minimal cluster.Cluster implementation for tests. +// Embeds the interface so only the methods we need are implemented. +type fakeCluster struct { + cluster.Cluster // nil embed — panics if unimplemented methods are called + cl client.Client +} + +func (f *fakeCluster) GetClient() client.Client { return f.cl } +func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.cl.Scheme() } +func (f *fakeCluster) GetAPIReader() client.Reader { return f.cl } + +// newFakeCluster wraps a fake client in a fakeCluster. +func newFakeCluster(cl client.Client) *fakeCluster { + return &fakeCluster{cl: cl} +} + +// ─── Fake mcmanager.Manager ─────────────────────────────────────────────────── + +// fakeMCManager is a minimal mcmanager.Manager implementation that serves a +// fixed map of project clusters. Only GetCluster is implemented; all other +// Manager methods panic through the embedded nil interface. +type fakeMCManager struct { + mcmanager.Manager // nil embed — panics if unimplemented methods are called + clusters map[string]cluster.Cluster +} + +func (m *fakeMCManager) GetCluster(_ context.Context, name multicluster.ClusterName) (cluster.Cluster, error) { + if c, ok := m.clusters[string(name)]; ok { + return c, nil + } + return nil, fmt.Errorf("cluster %q not found in fake manager", name) +} + +// newFakeMCManager returns a fakeMCManager with a single named cluster. 
+func newFakeMCManager(clusterName string, cl cluster.Cluster) *fakeMCManager { + return &fakeMCManager{ + clusters: map[string]cluster.Cluster{clusterName: cl}, + } +} From 1ac631e0e440a0b1e89b444bd15bc1c4aed21c1a Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:37:25 -0500 Subject: [PATCH 05/14] feat(controller): distributed WorkloadDeployment and Workload reconciliation Rework the WorkloadDeployment and Workload controllers to run per cell, resolving networks and Locations locally and driving Instance lifecycle through the stateful instance-control logic rather than a central scheduler. Update the instance-control packages to manage Instances within a cell's control plane. The reconciler requeues explicitly after adding its finalizer (the metadata-only Update can be dropped by watch-side event filtering, which would otherwise leave a new cell WorkloadDeployment unreconciled), and the scheduling-gate clearing path guards the nilable Spec.Controller that the infra provider populates independently of networking readiness. A deployment whose city has no Location yet has no other wake-up event (SubnetClaims/Subnets only exist after a Location resolved), so the controller watches Locations to re-reconcile waiting deployments, and surfaces the wait on the Available condition (NoMatchingLocation, naming the city code) instead of only logging it. 
Co-Authored-By: Claude Opus 4.8 (1M context) Co-Authored-By: Claude Fable 5 --- .../instancecontrol/instancecontrol.go | 28 +- .../stateful/stateful_control.go | 106 ++++- .../stateful/stateful_control_test.go | 407 ++++++++++++++++++ internal/controller/workload_controller.go | 14 +- .../workloaddeployment_controller.go | 317 +++++++++----- .../workloaddeployment_location_test.go | 295 +++++++++++++ 6 files changed, 1046 insertions(+), 121 deletions(-) create mode 100644 internal/controller/workloaddeployment_location_test.go diff --git a/internal/controller/instancecontrol/instancecontrol.go b/internal/controller/instancecontrol/instancecontrol.go index 6de9df99..d2c83692 100644 --- a/internal/controller/instancecontrol/instancecontrol.go +++ b/internal/controller/instancecontrol/instancecontrol.go @@ -26,10 +26,11 @@ type Strategy interface { type ActionType string const ( - ActionTypeCreate ActionType = "Create" - ActionTypeUpdate ActionType = "Update" - ActionTypeDelete ActionType = "Delete" - ActionTypeWait ActionType = "Wait" + ActionTypeCreate ActionType = "Create" + ActionTypeUpdate ActionType = "Update" + ActionTypeDelete ActionType = "Delete" + ActionTypeWait ActionType = "Wait" + ActionTypePatchLabels ActionType = "PatchLabels" ) type Action struct { @@ -104,3 +105,22 @@ func NewWaitAction(object client.Object) Action { fn: func(ctx context.Context, c client.Client) error { return nil }, } } + +// NewPatchLabelsAction returns an action that applies a metadata-only labels +// patch to the given object. It uses a MergeFrom patch so only the labels +// field is sent to the API server — the spec, template, and template-hash are +// never touched. This is intentionally separate from ActionTypeUpdate so that +// label backfill never participates in the ordered rolling-update flow. 
+func NewPatchLabelsAction(updated client.Object, base client.Object) Action { + patch := client.MergeFrom(base) + return Action{ + Object: updated, + actionType: ActionTypePatchLabels, + fn: func(ctx context.Context, c client.Client) error { + if err := c.Patch(ctx, updated, patch); err != nil { + return fmt.Errorf("failed to patch labels on %T %s: %w", updated, updated.GetName(), err) + } + return nil + }, + } +} diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index 566a652c..de79bd2d 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -15,13 +15,30 @@ import ( "go.datum.net/compute/internal/controller/instancecontrol" ) +// Options controls optional behaviours of the stateful instance control strategy. +type Options struct { + // NetworkingEnabled controls whether the Network scheduling gate is added to + // newly created Instances. Set to false when the networking integration is + // disabled so that Instances are not blocked waiting for a NetworkBinding. + // Defaults to true. + NetworkingEnabled bool +} + // Behavior inspired by https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/statefulset // Does not currently implement exact behavior. type statefulControl struct { + opts Options } +// New returns a stateful instance control strategy with networking enabled. func New() instancecontrol.Strategy { - return &statefulControl{} + return NewWithOptions(Options{NetworkingEnabled: true}) +} + +// NewWithOptions returns a stateful instance control strategy with the given +// options. 
+func NewWithOptions(opts Options) instancecontrol.Strategy { + return &statefulControl{opts: opts} } func (c *statefulControl) GetActions( @@ -68,15 +85,25 @@ func (c *statefulControl) GetActions( }, Spec: deployment.Spec.Template.Spec, } + // Set Location best-effort: when Status.Location is nil (no matching + // Location object for the city code) Instance.Spec.Location stays nil and + // instance creation proceeds normally — this must not block scheduling. desiredInstances[i].Spec.Location = deployment.Status.Location // TODO(jreese) consider adding scheduling gates via mutating webhooks - desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ - TemplateHash: instanceTemplateHash, - SchedulingGates: []v1alpha.SchedulingGate{ + gates := []v1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + } + if c.opts.NetworkingEnabled { + // Prepend the Network gate so it is cleared first; quota is + // independent and evaluated in parallel by InstanceReconciler. + gates = append([]v1alpha.SchedulingGate{ {Name: instancecontrol.NetworkSchedulingGate.String()}, - {Name: instancecontrol.QuotaSchedulingGate.String()}, - }, + }, gates...) + } + desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ + TemplateHash: instanceTemplateHash, + SchedulingGates: gates, } addInstanceControllerLabels(desiredInstances[i], getInstanceOrdinal(desiredInstances[i].Name), deployment) @@ -114,10 +141,37 @@ func (c *statefulControl) GetActions( } } + // Backfill controller-managed labels on every existing instance, regardless + // of Ready state or template hash. This ensures newly-introduced labels + // (e.g. city-code, workload-name) are applied to pre-existing instances that + // were never touched by a rolling update. The patch is metadata-only and is + // emitted outside the ordered rollout decision so it never gates or reorders + // instance creation/updates. 
+ var patchLabelActions []instancecontrol.Action + for _, instance := range desiredInstances { + if instance.CreationTimestamp.IsZero() || !instance.DeletionTimestamp.IsZero() { + // Skip instances that don't exist yet or are being deleted. + continue + } + + desiredLabels := desiredControllerLabels(getInstanceOrdinal(instance.Name), deployment) + if labelsNeedBackfill(instance.Labels, desiredLabels) { + base := instance.DeepCopy() + patched := instance.DeepCopy() + for k, v := range desiredLabels { + if patched.Labels == nil { + patched.Labels = make(map[string]string) + } + patched.Labels[k] = v + } + patchLabelActions = append(patchLabelActions, instancecontrol.NewPatchLabelsAction(patched, base)) + } + } + slices.SortFunc(updateActions, descendingOrdinal) slices.SortFunc(deleteActions, descendingOrdinal) - actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)) + actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)+len(patchLabelActions)) switch deployment.Spec.ScaleSettings.InstanceManagementPolicy { case v1alpha.OrderedReadyInstanceManagementPolicyType: @@ -144,6 +198,8 @@ func (c *statefulControl) GetActions( } + actions = append(actions, patchLabelActions...) + return actions, nil } @@ -152,7 +208,37 @@ func addInstanceControllerLabels(instance *v1alpha.Instance, index int, deployme instance.Labels = map[string]string{} } - instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(index) - instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) - instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + for k, v := range desiredControllerLabels(index, deployment) { + instance.Labels[k] = v + } +} + +// desiredControllerLabels returns the full set of controller-managed labels +// that every instance should carry. 
Used both when stamping a new/updated +// instance and when checking whether an existing instance needs a backfill +// patch. +func desiredControllerLabels(index int, deployment *v1alpha.WorkloadDeployment) map[string]string { + return map[string]string{ + v1alpha.InstanceIndexLabel: strconv.Itoa(index), + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + // Self-describing labels for routing, filtering, and observability. + // Backfilled on every reconcile so they stay accurate even for instances + // that pre-date the labels or that were not reached by a rolling update. + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } +} + +// labelsNeedBackfill reports whether any of the desired controller-managed +// label key/value pairs are absent or incorrect on the current instance labels. 
+func labelsNeedBackfill(current map[string]string, desired map[string]string) bool { + for k, v := range desired { + if current[k] != v { + return true + } + } + return false } diff --git a/internal/controller/instancecontrol/stateful/stateful_control_test.go b/internal/controller/instancecontrol/stateful/stateful_control_test.go index d45b24b3..229a224b 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control_test.go +++ b/internal/controller/instancecontrol/stateful/stateful_control_test.go @@ -13,6 +13,8 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/utils/ptr" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" + "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" ) @@ -150,16 +152,407 @@ func TestScaleDownWithAllReadyInstances(t *testing.T) { assert.False(t, actions[0].IsSkipped()) } +// TestNetworkingEnabledAddsNetworkGate verifies that when networking is enabled +// (the default), newly created Instances receive both the Network and Quota +// scheduling gates so that they are held until the network is provisioned. 
+func TestNetworkingEnabledAddsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: true}) + + deployment := getWorkloadDeployment("test-deploy-net-on", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.Contains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must be present when networking is enabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must be present") +} + +// TestNetworkingDisabledOmitsNetworkGate verifies that when networking is +// disabled, newly created Instances do NOT receive the Network scheduling gate, +// so they are not blocked on network provisioning. The Quota gate is still +// added so quota enforcement remains active. 
+func TestNetworkingDisabledOmitsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: false}) + + deployment := getWorkloadDeployment("test-deploy-net-off", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.NotContains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must NOT be present when networking is disabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must still be present when networking is disabled") +} + // Add more test functions below for different scenarios. +// TestInstanceLabels_FourNewLabelsStamped verifies that all four +// self-describing labels are stamped on newly created Instances, with values +// sourced from the WorkloadDeployment spec. 
+func TestInstanceLabels_FourNewLabelsStamped(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-labels-deploy", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + + assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must equal deployment name") + assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel], + "CityCodeLabel must equal deployment.Spec.CityCode") + assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel], + "WorkloadNameLabel must equal deployment.Spec.WorkloadRef.Name") + assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel], + "PlacementNameLabel must equal deployment.Spec.PlacementName") +} + +// TestInstanceLabels_PropagatedOnUpdate verifies that when an existing instance +// is updated (rolling update path), the four new labels are refreshed from the +// deployment so they remain accurate after spec changes. +func TestInstanceLabels_PropagatedOnUpdate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-labels-update", 1) + + // Build a ready existing instance. + currentInstances := []v1alpha.Instance{*getInstanceForDeployment(deployment, 0)} + + // Trigger a rolling update by changing the image. 
+ deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "updated-image" + + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + + assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be refreshed on update") + assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel], + "CityCodeLabel must be refreshed on update") + assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be refreshed on update") + assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel], + "PlacementNameLabel must be refreshed on update") +} + +// TestInstanceLocation_SetWhenDeploymentStatusLocationPresent verifies that when +// deployment.Status.Location is set, the new Instance receives it as Spec.Location. 
+func TestInstanceLocation_SetWhenDeploymentStatusLocationPresent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-set", 1) + deployment.Status.Location = &networkingv1alpha.LocationReference{ + Name: "loc-dfw-1", + Namespace: "networking-system", + } + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Location, + "Spec.Location must be set when deployment.Status.Location is non-nil") + assert.Equal(t, "loc-dfw-1", instance.Spec.Location.Name) + assert.Equal(t, "networking-system", instance.Spec.Location.Namespace) +} + +// TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent verifies that when +// deployment.Status.Location is nil (no Location object matches the city code), +// instance creation still succeeds and Spec.Location remains nil — no regression +// on the "create instances regardless of Location" contract. 
+func TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-nil", 1) + // deployment.Status.Location is intentionally not set (nil) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err, "instance creation must succeed even when Status.Location is nil") + assert.Len(t, actions, 1, "exactly one create action must be produced") + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Nil(t, instance.Spec.Location, + "Spec.Location must remain nil when deployment.Status.Location is not set") + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType(), + "action must be a Create, proving instance creation is not gated on Location") +} + +// TestLabelBackfill_NotReadyMatchingHash verifies that a not-Ready instance +// with an unchanged template hash receives a PatchLabels action when it is +// missing controller-managed labels. The action must not be a rollout Update, +// must not alter spec/template, and must not block subsequent instances. +func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-notready", 2) + + // Instance 0: not-Ready, correct template hash, but missing city-code/workload-name labels. + instance0 := getInstanceForDeployment(deployment, 0) + apimeta.SetStatusCondition(&instance0.Status.Conditions, metav1.Condition{ + Type: v1alpha.InstanceReady, + Status: metav1.ConditionFalse, + Reason: "NotReady", + Message: "Instance is not ready", + LastTransitionTime: metav1.Now(), + }) + // Simulate pre-existing instance that only has the index label (missing the newer labels). 
+ instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + } + + // Instance 1: needs to be created (nil in desiredInstances), so we only provide instance0. + currentInstances := []v1alpha.Instance{*instance0} + + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + // Collect actions by type. + var waitActions, createActions, updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeWait: + waitActions = append(waitActions, a) + case instancecontrol.ActionTypeCreate: + createActions = append(createActions, a) + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // The not-Ready instance must still produce a Wait (rollout is gated). + assert.Len(t, waitActions, 1, "not-Ready instance must still produce a Wait action") + assert.Equal(t, "test-backfill-notready-0", waitActions[0].Object.GetName()) + + // The missing instance-1 create is skipped (ordered policy, Wait is first). + assert.Len(t, createActions, 1, "instance-1 create action must be present") + assert.True(t, createActions[0].IsSkipped(), "create for instance-1 must be skipped while instance-0 is waiting") + + // No template Update actions must be produced. + assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash instance") + + // A PatchLabels action must be produced for instance-0. + assert.Len(t, patchActions, 1, "exactly one PatchLabels action for the label-drifted instance") + assert.Equal(t, "test-backfill-notready-0", patchActions[0].Object.GetName()) + assert.False(t, patchActions[0].IsSkipped(), "PatchLabels must not be skipped by the rollout skip-loop") + + // The patched object must carry all desired labels. 
+ patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.GetName(), patched.Labels[v1alpha.WorkloadDeploymentNameLabel]) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel]) + assert.Equal(t, deployment.Spec.WorkloadRef.Name, patched.Labels[v1alpha.WorkloadNameLabel]) + assert.Equal(t, deployment.Spec.PlacementName, patched.Labels[v1alpha.PlacementNameLabel]) + + // The patched object's spec and template-hash must be unchanged. + assert.Equal(t, instancecontrol.ComputeHash(deployment.Spec.Template), patched.Spec.Controller.TemplateHash, + "template hash must be unchanged by the label backfill") + assert.Equal(t, deployment.Spec.Template.Spec.Runtime, patched.Spec.Runtime, + "spec must be unchanged by the label backfill") +} + +// TestLabelBackfill_Idempotent verifies that an instance already carrying all +// correct controller-managed labels produces no PatchLabels action. +func TestLabelBackfill_Idempotent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-idempotent", 1) + + // Instance already has all controller-managed labels set correctly. 
+ instance := getInstanceForDeployment(deployment, 0) + instance.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + for _, a := range actions { + assert.NotEqual(t, instancecontrol.ActionTypePatchLabels, a.ActionType(), + "no PatchLabels action must be produced when all labels are already correct") + } +} + +// TestLabelBackfill_ReadyInstanceCorrected verifies that a Ready instance with +// correct template hash but drifted labels receives a PatchLabels action +// without triggering a template rollout Update. +func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-ready", 1) + + // Ready instance with matching hash but missing city-code label. + instance := getInstanceForDeployment(deployment, 0) + // Remove the city-code label to simulate drift. + delete(instance.Labels, v1alpha.CityCodeLabel) + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // No template Update must be produced — template hash matches. 
+ assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash ready instance") + + // A PatchLabels action must be produced. + assert.Len(t, patchActions, 1, "PatchLabels action must be produced for the label-drifted ready instance") + patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel], + "city-code label must be corrected by the backfill") +} + +// TestLabelBackfill_DoesNotAffectRollingUpdate verifies that a genuine template +// change on a Ready instance still produces a normal ordered Update action and +// that the PatchLabels path does not interfere with or duplicate it. +func TestLabelBackfill_DoesNotAffectRollingUpdate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-rolling", 2) + + // Two ready instances with all correct labels and matching current hash. + instance0 := getInstanceForDeployment(deployment, 0) + instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + instance1 := getInstanceForDeployment(deployment, 1) + instance1.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "1", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + 
// Trigger a template change. + deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "rolling-update-image" + + currentInstances := []v1alpha.Instance{*instance0, *instance1} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // Two Update actions expected (one per instance), ordered highest-to-lowest. + assert.Len(t, updateActions, 2, "both instances must produce Update actions on template change") + assert.Equal(t, "test-backfill-rolling-1", updateActions[0].Object.GetName(), + "Update actions must be ordered highest ordinal first") + assert.Equal(t, "test-backfill-rolling-0", updateActions[1].Object.GetName()) + assert.False(t, updateActions[0].IsSkipped(), "first Update must be active") + assert.True(t, updateActions[1].IsSkipped(), "second Update must be skipped (ordered rollout)") + + // No PatchLabels — all labels are already correct. 
+ assert.Empty(t, patchActions, "no PatchLabels when all labels are already correct") +} + func getWorkloadDeployment(name string, minReplicas int32) *v1alpha.WorkloadDeployment { instance := getInstanceTemplate(name, 0) deployment := &v1alpha.WorkloadDeployment{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: "default", + UID: "test-wd-uid", }, Spec: v1alpha.WorkloadDeploymentSpec{ + WorkloadRef: v1alpha.WorkloadReference{ + Name: "test-workload", + UID: "test-workload-uid", + }, + PlacementName: "test-placement", + CityCode: "DFW", ScaleSettings: v1alpha.HorizontalScaleSettings{ MinReplicas: minReplicas, InstanceManagementPolicy: v1alpha.OrderedReadyInstanceManagementPolicyType, @@ -180,6 +573,20 @@ func getInstanceForDeployment(deployment *v1alpha.WorkloadDeployment, ordinal in TemplateHash: instancecontrol.ComputeHash(deployment.Spec.Template), } + // Stamp all controller-managed labels so that the label-backfill path is a + // no-op for instances built by this helper. Tests that specifically exercise + // label drift should manipulate the labels directly after calling this helper. 
+ if instance.Labels == nil { + instance.Labels = map[string]string{} + } + instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(ordinal) + instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) + instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + instance.Labels[v1alpha.WorkloadDeploymentNameLabel] = deployment.GetName() + instance.Labels[v1alpha.CityCodeLabel] = deployment.Spec.CityCode + instance.Labels[v1alpha.WorkloadNameLabel] = deployment.Spec.WorkloadRef.Name + instance.Labels[v1alpha.PlacementNameLabel] = deployment.Spec.PlacementName + return instance } diff --git a/internal/controller/workload_controller.go b/internal/controller/workload_controller.go index 6e907b65..6ca92e03 100644 --- a/internal/controller/workload_controller.go +++ b/internal/controller/workload_controller.go @@ -26,13 +26,17 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) -const workloadControllerFinalizer = "compute.datumapis.com/workload-controller" +const ( + workloadControllerFinalizer = "compute.datumapis.com/workload-controller" + workloadConditionTypeAvailable = "Available" +) // WorkloadReconciler reconciles a Workload object type WorkloadReconciler struct { @@ -118,7 +122,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ if len(notFoundNetworks) > 0 { missingNetworks := strings.Join(notFoundNetworks.UnsortedList(), ", ") changed := apimeta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{ - Type: "Available", + Type: workloadConditionTypeAvailable, Status: metav1.ConditionFalse, Reason: 
"NetworkNotFound", Message: fmt.Sprintf("Unable to find networks: %s", missingNetworks), @@ -383,9 +387,9 @@ func (r *WorkloadReconciler) getDeploymentsForWorkload( existingDeployments.Insert(deployment.Name) } - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := upstreamClient.List(ctx, &locations); err != nil { - return nil, nil, fmt.Errorf("failed to list locations: %w", err) + return nil, nil, fmt.Errorf("failed to list location bindings: %w", err) } if len(locations.Items) == 0 { @@ -463,7 +467,7 @@ func (r *WorkloadReconciler) SetupWithManager(mgr mcmanager.Manager) error { return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Workload{}, mcbuilder.WithEngageWithLocalCluster(false)). Owns(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Watches(&networkingv1alpha.Network{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + Watches(&networkingv1alpha.Network{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, network client.Object) []mcreconcile.Request { logger := log.FromContext(ctx) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 50e21ef0..76216cb2 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -24,6 +24,7 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" @@ -37,11 +38,23 @@ import ( type 
WorkloadDeploymentReconciler struct { mgr mcmanager.Manager finalizers finalizer.Finalizers + + // NetworkingEnabled controls whether the networking integration with + // network-services-operator is active. When false, NetworkBinding creation is + // skipped, the Network scheduling gate is never added to Instances (and is + // actively removed if present), and the networking step is treated as + // immediately ready. Defaults to true. + NetworkingEnabled bool } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=locations,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkbindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkcontexts,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnetclaims,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnets,verbs=get;list;watch func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) @@ -76,7 +89,10 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco if err = cl.GetClient().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) } - return ctrl.Result{}, nil + // The finalizer-add Update is metadata-only and may be filtered by event + // predicates or handlers, so requeue explicitly to guarantee the + // deployment is reconciled past this point. 
+ return ctrl.Result{Requeue: true}, nil } if !deployment.DeletionTimestamp.IsZero() { @@ -86,10 +102,6 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer logger.Info("reconcile complete") - if deployment.Status.Location == nil { - return ctrl.Result{}, nil - } - // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -100,7 +112,9 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, fmt.Errorf("failed listing instances: %w", err) } - instanceControl := instancecontrolstateful.New() + instanceControl := instancecontrolstateful.NewWithOptions(instancecontrolstateful.Options{ + NetworkingEnabled: r.NetworkingEnabled, + }) actions, err := instanceControl.GetActions(ctx, cl.GetScheme(), &deployment, instances.Items) if err != nil { @@ -122,9 +136,28 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco } } - networkReady, err := r.reconcileNetworks(ctx, cl.GetClient(), &deployment) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + // When networking is disabled, bypass the entire network provisioning path. + // The Network scheduling gate is treated as cleared and no NetworkBindings + // are created. This lets Instances reach the runtime on cells where + // network-services-operator (VPC) is not yet available. + var networkReady bool + locationResolved := true + if !r.NetworkingEnabled { + networkReady = true + } else { + var resolvedLocation *networkingv1alpha.LocationReference + networkReady, resolvedLocation, err = r.reconcileNetworks(ctx, cl.GetClient(), &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + } + // Persist the resolved Location to status so downstream components (e.g. 
+ // the stateful instance control strategy) can propagate it to Instances. + // When no matching Location exists, resolvedLocation is nil and + // Status.Location remains nil — instance creation is not blocked. + locationResolved = resolvedLocation != nil + if resolvedLocation != nil { + deployment.Status.Location = resolvedLocation + } } // Networks are all ready with subnets ready to use, remove any scheduling @@ -143,59 +176,66 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, err } - patchResult, err := controllerutil.CreateOrPatch(ctx, cl.GetClient(), &deployment, func() error { - deployment.Status.Replicas = int32(replicas) - deployment.Status.CurrentReplicas = int32(currentReplicas) - deployment.Status.DesiredReplicas = desiredReplicas - deployment.Status.ReadyReplicas = int32(readyReplicas) - - if quotaBlockedReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), - }) - } else { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", - Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), - }) - } - - if readyReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - }) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: 
computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", - }) - } + deployment.Status.Replicas = int32(replicas) + deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.DesiredReplicas = desiredReplicas + deployment.Status.ReadyReplicas = int32(readyReplicas) + + if quotaBlockedReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), + }) + } else { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: "ReplicasAvailable", + Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), + }) + } - return nil - }) + if readyReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: "StableInstanceFound", + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + }) + } else if !locationResolved { + // Network provisioning cannot even start without a Location, so surface + // the unresolved city rather than the generic provisioning reason — it is + // the only user-visible signal while the deployment waits for the city's + // Location to be created. 
+ apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "NoMatchingLocation", + Message: fmt.Sprintf("No Location matches city code %q", deployment.Spec.CityCode), + }) + } else if !networkReady { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningNetwork", + Message: "Network is being provisioned", + }) + } else if replicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningInstances", + Message: "Instances are being provisioned", + }) + } - if err != nil { + if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) } - logger.Info("deployment status processed", "operation_result", patchResult) + logger.Info("deployment status updated") return ctrl.Result{}, nil } @@ -213,7 +253,10 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( quotaBlockedReplicas++ } - if networkReady && len(instance.Spec.Controller.SchedulingGates) > 0 { + // Spec.Controller is a nilable pointer; guard it before dereferencing the + // scheduling gates so an instance without controller state cannot panic + // the reconcile (mirrors the Status.Controller guard below). 
+ if networkReady && instance.Spec.Controller != nil && len(instance.Spec.Controller.SchedulingGates) > 0 { newGates := slices.DeleteFunc(instance.Spec.Controller.SchedulingGates, func(gate computev1alpha.SchedulingGate) bool { return gate.Name == instancecontrol.NetworkSchedulingGate.String() }) @@ -240,13 +283,44 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, readyReplicas, quotaBlockedReplicas, nil } +// reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all +// network interfaces on the deployment. It returns (networkReady, resolvedLocation, err). +// resolvedLocation is non-nil when a Location matching the deployment's city code +// was found; nil otherwise. Instance creation is never gated on resolvedLocation +// being non-nil — callers must treat a nil location as best-effort only. func (r *WorkloadDeploymentReconciler) reconcileNetworks( ctx context.Context, c client.Client, deployment *computev1alpha.WorkloadDeployment, -) (bool, error) { +) (bool, *networkingv1alpha.LocationReference, error) { logger := log.FromContext(ctx) + // Resolve the Location for this deployment's city code. With Karmada + // propagation the WorkloadDeployment lands in the cluster that serves the + // requested city, so the Location object for that city must exist locally. + var locationList networkingv1alpha.LocationList + if err := c.List(ctx, &locationList); err != nil { + return false, nil, fmt.Errorf("failed to list locations: %w", err) + } + + var locationRef *networkingv1alpha.LocationReference + for _, loc := range locationList.Items { + if cityCode, ok := loc.Spec.Topology["topology.datum.net/city-code"]; ok && cityCode == deployment.Spec.CityCode { + locationRef = &networkingv1alpha.LocationReference{ + Name: loc.Name, + Namespace: loc.Namespace, + } + break + } + } + + if locationRef == nil { + // Surfaced to users via the Available condition (NoMatchingLocation); the + // log is debug-level detail only. 
+ logger.V(1).Info("no location found for city code, waiting", "cityCode", deployment.Spec.CityCode) + return false, nil, nil + } + // First, ensure we have a NetworkBinding for each interface, and that the // binding is ready before we move on to create SubnetClaims. @@ -260,7 +334,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkBindingObjectKey, &networkBinding); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network binding: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network binding: %w", err) } if networkBinding.CreationTimestamp.IsZero() { @@ -271,16 +345,16 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( }, Spec: networkingv1alpha.NetworkBindingSpec{ Network: networkInterface.Network, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetControllerReference(deployment, &networkBinding, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on network binding: %w", err) + return false, nil, fmt.Errorf("failed to set controller on network binding: %w", err) } if err := c.Create(ctx, &networkBinding); err != nil { - return false, fmt.Errorf("failed creating network binding: %w", err) + return false, nil, fmt.Errorf("failed creating network binding: %w", err) } } @@ -293,7 +367,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( if !allNetworkBindingsReady { logger.Info("waiting for network bindings to be ready") - return false, nil + return false, locationRef, nil } // TODO(jreese): Currently this makes a SubnetClaim that will be used by @@ -312,12 +386,12 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkContextObjectKey, &networkContext); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network context: %w", err) + return false, nil, fmt.Errorf("failed checking for 
existing network context: %w", err) } if !apimeta.IsStatusConditionTrue(networkContext.Status.Conditions, networkingv1alpha.NetworkContextReady) { logger.Info("waiting for network context to be ready", "network_context", networkContext.Name) - return false, nil + return false, locationRef, nil } var subnetClaims networkingv1alpha.SubnetClaimList @@ -326,7 +400,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.List(ctx, &subnetClaims, listOpts...); err != nil { - return false, fmt.Errorf("failed listing subnet claims: %w", err) + return false, nil, fmt.Errorf("failed listing subnet claims: %w", err) } var subnetClaim networkingv1alpha.SubnetClaim @@ -347,8 +421,8 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } // If it's not the same location, don't consider the subnet claim. - if claim.Spec.Location.Namespace != deployment.Status.Location.Namespace || - claim.Spec.Location.Name != deployment.Status.Location.Name { + if claim.Spec.Location.Namespace != locationRef.Namespace || + claim.Spec.Location.Name != locationRef.Name { continue } @@ -371,28 +445,28 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( NetworkContext: networkingv1alpha.LocalNetworkContextRef{ Name: networkContext.Name, }, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetOwnerReference(&networkContext, &subnetClaim, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on subnet claim: %w", err) + return false, nil, fmt.Errorf("failed to set controller on subnet claim: %w", err) } if err := c.Create(ctx, &subnetClaim); err != nil { - return false, fmt.Errorf("failed creating subnet claim: %w", err) + return false, nil, fmt.Errorf("failed creating subnet claim: %w", err) } logger.Info("created subnet claim", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } logger.Info("found subnet claim", "subnetClaim", subnetClaim.Name) if 
!apimeta.IsStatusConditionTrue(subnetClaim.Status.Conditions, "Ready") { logger.Info("waiting for subnet claim to be ready", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } var subnet networkingv1alpha.Subnet @@ -401,19 +475,19 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( Name: subnetClaim.Status.SubnetRef.Name, } if err := c.Get(ctx, subnetObjectKey, &subnet); err != nil { - return false, fmt.Errorf("failed fetching subnet: %w", err) + return false, nil, fmt.Errorf("failed fetching subnet: %w", err) } if !apimeta.IsStatusConditionTrue(subnet.Status.Conditions, "Ready") { logger.Info("waiting for subnet to be ready", "subnet", subnet.Name) - return false, nil + return false, locationRef, nil } logger.Info("subnet is ready", "subnet", subnet.Name) } - return true, nil + return true, locationRef, nil } var errDeploymentHasInstances = errors.New("deployment has instances") @@ -468,47 +542,86 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) e if err := r.finalizers.Register(workloadControllerFinalizer, r); err != nil { return fmt.Errorf("failed to register finalizer: %w", err) } - return mcbuilder.ControllerManagedBy(mgr). + + b := mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Owns(&computev1alpha.Instance{}). - Owns(&networkingv1alpha.NetworkBinding{}). - Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnetClaim := o.(*networkingv1alpha.SubnetClaim) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + Owns(&computev1alpha.Instance{}) + + // Only watch networking resources when the networking integration is enabled. 
+ // On cells without network-services-operator these watches would log spurious + // errors for missing CRDs. + if r.NetworkingEnabled { + b = b. + Owns(&networkingv1alpha.NetworkBinding{}). + // A deployment whose city has no Location yet waits without any other + // wake-up event: NetworkBindings/SubnetClaims/Subnets only exist after + // a Location resolved, and the reconciler does not poll. Watching + // Locations re-reconciles the waiting deployments when their city's + // Location appears (or its topology changes). + Watches(&networkingv1alpha.Location{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + location := o.(*networkingv1alpha.Location) + return enqueueWorkloadDeploymentsForLocation(ctx, cl.GetClient(), clusterName, location) + }) + }). + Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnetClaim := o.(*networkingv1alpha.SubnetClaim) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + }) + }). + Watches(&networkingv1alpha.Subnet{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnet := o.(*networkingv1alpha.Subnet) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) + }) }) - }). 
- Watches(&networkingv1alpha.Subnet{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnet := o.(*networkingv1alpha.Subnet) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) - }) - }). - Complete(r) + } + + return b.Complete(r) } -func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName string, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { +// enqueueWorkloadDeploymentByLocation maps an object that carries a +// LocationReference (SubnetClaim, Subnet) to the WorkloadDeployments targeting +// the referenced Location's city. The reference must be resolved to the Location +// object first because only its topology carries the city code. +func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName multicluster.ClusterName, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { logger := log.FromContext(ctx) - cluster, err := mgr.GetCluster(ctx, clusterName) + cl, err := mgr.GetCluster(ctx, clusterName) if err != nil { logger.Error(err, "failed to get cluster") return nil } - clusterClient := cluster.GetClient() + clusterClient := cl.GetClient() - locationName := (types.NamespacedName{ + var location networkingv1alpha.Location + if err := clusterClient.Get(ctx, types.NamespacedName{ Namespace: locationRef.Namespace, Name: locationRef.Name, - }).String() - listOpts := client.MatchingFields{ - deploymentLocationIndex: locationName, + }, &location); err != nil { + logger.Error(err, "failed to get location for enqueue", "location", locationRef) + return nil } - var workloadDeployments computev1alpha.WorkloadDeploymentList + return enqueueWorkloadDeploymentsForLocation(ctx, clusterClient, clusterName, &location) +} - if err := 
clusterClient.List(ctx, &workloadDeployments, listOpts); err != nil { - logger.Error(err, "failed to list workloads") +// enqueueWorkloadDeploymentsForLocation maps a Location to the +// WorkloadDeployments that target its city, via the deploymentCityCodeIndex. +func enqueueWorkloadDeploymentsForLocation(ctx context.Context, c client.Client, clusterName multicluster.ClusterName, location *networkingv1alpha.Location) []mcreconcile.Request { + logger := log.FromContext(ctx) + + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if !ok { + return nil + } + + var workloadDeployments computev1alpha.WorkloadDeploymentList + if err := c.List(ctx, &workloadDeployments, client.MatchingFields{ + deploymentCityCodeIndex: cityCode, + }); err != nil { + logger.Error(err, "failed to list workload deployments") return nil } diff --git a/internal/controller/workloaddeployment_location_test.go b/internal/controller/workloaddeployment_location_test.go new file mode 100644 index 00000000..60aba65a --- /dev/null +++ b/internal/controller/workloaddeployment_location_test.go @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" + + "go.datum.net/compute/internal/controller/instancecontrol" +) + +const ( + // locTestCityCode / 
locTestOtherCityCode: deployments under test target + // locTestCityCode; locTestOtherCityCode identifies a decoy Location that + // must never match. + locTestCityCode = "DFW" + locTestOtherCityCode = "ORD" + + // locTestNamespace mirrors where Location objects live in real clusters. + locTestNamespace = "networking-system" + + // locTestWDNamespace is the namespace of the deployments under test. + locTestWDNamespace = "default" + + // locTestTopologyKey is the production topology key that carries a + // Location's city code. + locTestTopologyKey = "topology.datum.net/city-code" +) + +// newNetworkingScheme returns a scheme with compute + networkingv1alpha types. +func newNetworkingScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = computev1alpha.AddToScheme(s) + _ = networkingv1alpha.AddToScheme(s) + return s +} + +// newTestLocation builds a Location fixture shaped like production: the city +// code is carried in Spec.Topology under the topology.datum.net/city-code key. +func newTestLocation(name, cityCode string) *networkingv1alpha.Location { + return &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: locTestNamespace}, + Spec: networkingv1alpha.LocationSpec{ + Topology: map[string]string{locTestTopologyKey: cityCode}, + }, + } +} + +// TestReconcileNetworks_PersistsLocation_WhenLocationFound verifies that when a +// Location object matching the deployment's city code exists in the cluster, the +// resolved LocationReference is returned by reconcileNetworks and can be persisted +// to deployment.Status.Location. The deployment in this scenario has no +// NetworkInterfaces, so there are no bindings or subnet claims to wait on; the +// networkReady result is deliberately not asserted here — only the resolved +// Location is checked.
+func TestReconcileNetworks_PersistsLocation_WhenLocationFound(t *testing.T) { + t.Parallel() + + const locationName = "loc-dfw-1" + + location := newTestLocation(locationName, locTestCityCode) + + s := newNetworkingScheme() + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(location).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: locTestCityCode, + // No NetworkInterfaces — there is nothing to bind, so the call only + // resolves the Location; networkReady is not asserted by this test. + }, + } + + r := &WorkloadDeploymentReconciler{} + _, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err) + require.NotNil(t, resolvedLocation, + "resolved location must be non-nil when a matching Location object exists") + assert.Equal(t, locationName, resolvedLocation.Name) + assert.Equal(t, locTestNamespace, resolvedLocation.Namespace) + + // Simulate what the Reconcile loop does: persist resolvedLocation to Status. + deployment.Status.Location = resolvedLocation + assert.Equal(t, locationName, deployment.Status.Location.Name, + "Status.Location.Name must match the resolved Location object name") +} + +// TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound verifies that +// when no Location object in the cluster matches the deployment's city code, +// reconcileNetworks returns (false, nil, nil) — no error and no resolved +// location. The caller must treat nil location as best-effort and must NOT block +// instance creation. +func TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound(t *testing.T) { + t.Parallel() + + s := newNetworkingScheme() + // Cluster has a Location for a DIFFERENT city code.
+ otherLocation := newTestLocation("loc-ord-1", locTestOtherCityCode) + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(otherLocation).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: locTestCityCode, // no matching Location + }, + } + + r := &WorkloadDeploymentReconciler{} + networkReady, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err, "missing location must not cause an error") + assert.False(t, networkReady, "network is not ready when no location is found") + assert.Nil(t, resolvedLocation, + "resolved location must be nil when no matching Location object exists") + + // Status.Location remains nil — callers must not update it in this case. + // Confirm the deployment's Status.Location is unaffected (nil → nil). + assert.Nil(t, deployment.Status.Location, + "Status.Location must remain nil when no Location matches the city code") +} + +// newLocationTestWDReconciler builds a WorkloadDeploymentReconciler with +// networking enabled, wired to a fake cluster, with the controller finalizer +// pre-registered the same way SetupWithManager does. Networking must be enabled +// so Reconcile exercises Location resolution. 
+func newLocationTestWDReconciler(cl client.Client) *WorkloadDeploymentReconciler { + r := &WorkloadDeploymentReconciler{ + mgr: newFakeMCManager(testCluster, newFakeCluster(cl)), + NetworkingEnabled: true, + } + feds := finalizer.NewFinalizers() + if err := feds.Register(workloadControllerFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// TestWorkloadDeploymentReconcile_NoMatchingLocation_SetsCondition verifies the +// user-visible surface while a deployment waits for its city's Location: the +// Available condition must name the unresolved city (reason NoMatchingLocation), +// and once a matching Location appears the next reconcile must replace that +// reason — the unresolved-city signal must not outlive its cause. +func TestWorkloadDeploymentReconcile_NoMatchingLocation_SetsCondition(t *testing.T) { + t.Parallel() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "location-test-wd", + Namespace: locTestWDNamespace, + UID: "location-test-wd-uid", + // Pre-set the finalizer so Reconcile proceeds past the finalizer-add + // branch. + Finalizers: []string{workloadControllerFinalizer}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: locTestCityCode, + WorkloadRef: computev1alpha.WorkloadReference{Name: "location-test-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + // Production deployments always carry the kubebuilder-defaulted + // policy; without it the instance-control strategy emits no actions. + InstanceManagementPolicy: computev1alpha.OrderedReadyInstanceManagementPolicyType, + }, + }, + } + + // An instance shaped the way the instance-control strategy creates it: + // ordinal name, controller labels, and the scheduling gates stamped at + // creation. 
Pre-seeding it (with a CreationTimestamp, which the fake client + // does not stamp on Create) keeps the strategy in its wait path so the test + // exercises only the condition transitions. + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name + "-0", + Namespace: deployment.Namespace, + CreationTimestamp: metav1.Now(), + Labels: map[string]string{ + computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.UID), + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + }, + } + + // The only Location in the cluster serves a different city. + otherLocation := newTestLocation("loc-ord-1", locTestOtherCityCode) + + cl := fake.NewClientBuilder(). + WithScheme(newNetworkingScheme()). + WithObjects(deployment, instance, otherLocation). + WithStatusSubresource(deployment). 
+ Build() + r := newLocationTestWDReconciler(cl) + + req := mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{Name: deployment.Name, Namespace: deployment.Namespace}, + }, + } + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + + var updated computev1alpha.WorkloadDeployment + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) + require.NotNil(t, cond, "Available must be set while the city has no Location") + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, "NoMatchingLocation", cond.Reason) + assert.Contains(t, cond.Message, locTestCityCode, + "the condition message must name the unresolved city code") + assert.Nil(t, updated.Status.Location) + + // Provision the city's Location; the next reconcile resolves it and must + // replace the NoMatchingLocation reason. 
+ matchingLocation := newTestLocation("loc-dfw-2", locTestCityCode) + require.NoError(t, cl.Create(context.Background(), matchingLocation)) + + _, err = r.Reconcile(context.Background(), req) + require.NoError(t, err) + + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + cond = apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) + require.NotNil(t, cond) + assert.Equal(t, "ProvisioningInstances", cond.Reason, + "the unresolved-city reason must give way once the Location resolves") + require.NotNil(t, updated.Status.Location) + assert.Equal(t, matchingLocation.Name, updated.Status.Location.Name) +} + +// TestEnqueueWorkloadDeploymentsForLocation verifies the Location watch mapping: +// a Location event must enqueue exactly the WorkloadDeployments whose CityCode +// matches the Location's topology (via deploymentCityCodeIndex), and a Location +// without a city code in its topology must map to nothing. +func TestEnqueueWorkloadDeploymentsForLocation(t *testing.T) { + t.Parallel() + + wdDFW := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "wd-dfw", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{CityCode: locTestCityCode}, + } + wdORD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "wd-ord", Namespace: locTestWDNamespace}, + Spec: computev1alpha.WorkloadDeploymentSpec{CityCode: locTestOtherCityCode}, + } + + cl := fake.NewClientBuilder(). + WithScheme(newNetworkingScheme()). + WithIndex(&computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc). + WithObjects(wdDFW, wdORD). 
+ Build() + + location := newTestLocation("loc-dfw-1", locTestCityCode) + + requests := enqueueWorkloadDeploymentsForLocation(context.Background(), cl, testCluster, location) + require.Len(t, requests, 1, "only deployments whose CityCode matches the Location must be enqueued") + assert.Equal(t, wdDFW.Name, requests[0].Name) + assert.Equal(t, locTestWDNamespace, requests[0].Namespace) + assert.Equal(t, multicluster.ClusterName(testCluster), requests[0].ClusterName) + + // A Location without a city code in its topology identifies no city, so no + // deployment can match it. + noCityLocation := &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{Name: "loc-no-city", Namespace: locTestNamespace}, + Spec: networkingv1alpha.LocationSpec{Topology: map[string]string{}}, + } + assert.Empty(t, enqueueWorkloadDeploymentsForLocation(context.Background(), cl, testCluster, noCityLocation)) +} From 84cf551ffc3acc859a5a4705c8b442d675035f8e Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:39:24 -0500 Subject: [PATCH 06/14] feat(controller): instance controller for federated scheduling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the Instance controller to compute the Ready/Available conditions and apply the per-project quota gate within a single reconcile pass, so federated placement reflects real allocatable capacity. Quota flow: the ResourceClaim is named after the Instance (unique within the project control plane, "instance-" prefixed so it cannot collide with other kinds' claims) and carries an instance-namespace label so a grant event maps back to the owning Instance for immediate re-enqueue. 
Because the grant lives on the project control plane and the watch event can be missed (informer engagement races, relist gaps), a backing-off safety-net requeue runs while QuotaGranted != True — anchored on the Instance creation time, computed up front so every return path honors it, logged for observability, and falling back to the bounded quota interval on write conflicts instead of controller-runtime's error backoff. The controller also emits Warning events explaining why an Instance is blocked (QuotaNoBudget, NetworkFailedToCreate, ...) so the signal reaches kubectl describe and the activity timeline. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/instance_controller.go | 955 ++++++++++-- .../controller/instance_controller_test.go | 1329 +++++++++++++++-- 2 files changed, 2037 insertions(+), 247 deletions(-) diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index 820609c1..5a4f53f8 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -4,53 +4,197 @@ package controller import ( "context" + "errors" "fmt" + "maps" "strings" + "time" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - ctrlsource 
"sigs.k8s.io/controller-runtime/pkg/source" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" "go.datum.net/compute/internal/controller/instancecontrol" + quotametrics "go.datum.net/compute/internal/quota" ) -const instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" +const ( + // instanceQuotaFinalizer ensures the quota ResourceClaim is deleted when + // an Instance is removed. + instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" + + // instanceControllerFinalizer is registered with the finalizer framework and + // triggers downstream write-back cleanup on deletion. + instanceControllerFinalizer = "compute.datumapis.com/instance-controller" + + // instanceQuotaClaimSourceLabel is stamped on ResourceClaim objects with the + // name of the edge cluster that created them. The claim watch predicate uses + // this label to filter out claims written by other edge controllers targeting + // the same project control planes. + instanceQuotaClaimSourceLabel = "compute.datumapis.com/source-cluster" + + // instanceQuotaClaimNamespaceLabel records the source Instance's namespace on + // the ResourceClaim. The claim lives in the project's quota namespace (not the + // Instance's namespace), so the claim watch reads this label to map a grant + // back to the owning Instance. + instanceQuotaClaimNamespaceLabel = "compute.datumapis.com/instance-namespace" + + // instanceQuotaClaimNamePrefix namespaces an Instance's ResourceClaim name by + // resource type. 
Claims for different resource kinds share the project quota + // namespace, so the Instance name alone (unique among Instances, but not + // across kinds) could collide with another kind's claim — the prefix prevents + // that. The claim watch strips it to recover the Instance name. + instanceQuotaClaimNamePrefix = "instance-" + + quotaResourceTypeInstances = "compute.datumapis.com/instances" + + miloProjectAPIGroup = "resourcemanager.miloapis.com" + + miloProjectKind = "Project" + + msgNotProgrammed = "Instance has not been programmed" + + msgInstanceReady = "Instance is ready" + + msgInstanceProgrammed = "Instance has been programmed" + + msgInstanceAvailable = "Instance is available" + + // reasonNetworkFailedToCreate is the reason code for network creation failure. + reasonNetworkFailedToCreate = "NetworkFailedToCreate" +) + +// Quota-pending requeue backoff. The instance controller is normally re-queued by +// the ResourceClaim watch when a claim is granted, but that grant event lives on +// the project control plane and can be missed (informer engagement races, watch +// relist gaps), wedging the instance at QuotaGranted!=True indefinitely. While +// quota is pending we requeue on a backing-off schedule as a safety net so a +// missed grant self-heals. The interval lengthens the longer the instance waits: +// +// elapsed < 60s : every 1s (catch a grant landing almost immediately) +// 60s – 5m : every 15s +// 5m – 10m : every 60s +// >= 10m : every 300s +const ( + quotaPendingRequeueFast = 1 * time.Second + quotaPendingRequeueMedium = 15 * time.Second + quotaPendingRequeueSlow = 60 * time.Second + quotaPendingRequeueIdle = 300 * time.Second + + quotaPendingFastWindow = 60 * time.Second + quotaPendingMediumWindow = 5 * time.Minute + quotaPendingSlowWindow = 10 * time.Minute +) // clusterGetter is the subset of mcmanager.Manager used by InstanceReconciler. // Keeping it narrow allows unit tests to substitute a minimal fake. 
type clusterGetter interface { - GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) + GetCluster(ctx context.Context, clusterName multicluster.ClusterName) (cluster.Cluster, error) } +// errProjectIdentityUnresolvable is the sentinel wrapped by project-identity +// resolvers when the edge namespace is missing one of the identity labels +// stamped by NSO's MappedNamespaceResourceStrategy. Both labels are written +// atomically at namespace creation, before any Instance can exist in the +// namespace, so absence is misconfiguration — not a propagation race — and +// retrying cannot fix it. Callers use errors.Is to distinguish this from +// transient resolution failures. +var errProjectIdentityUnresolvable = errors.New("project identity unresolvable") + +// InstanceProjectIDFunc derives the Milo project ID for a given Instance. +// In Milo mode the project ID equals the multicluster ClusterName. In +// single-cell mode it is decoded from the upstream-cluster-name namespace label. +// Returns an error wrapping errProjectIdentityUnresolvable when the identity +// label is missing (misconfiguration); transient failures return ordinary +// errors that should trigger a requeue. +type InstanceProjectIDFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + +// InstanceProjectNamespaceFunc derives the in-project namespace where +// ResourceClaims for a given Instance should be created. In Milo mode this +// equals instance.Namespace. In single-cell mode it comes from the +// upstream-namespace namespace label. +// Returns an error wrapping errProjectIdentityUnresolvable when the identity +// label is missing (misconfiguration); transient failures return ordinary +// errors that should trigger a requeue. 
+type InstanceProjectNamespaceFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + // InstanceReconciler reconciles an Instance object type InstanceReconciler struct { - mgr clusterGetter - managementCluster cluster.Cluster + mgr clusterGetter + scheme *runtime.Scheme + quotaClientManager *quotametrics.ProjectQuotaClientManager + edgeClusterName string + // recorder emits Kubernetes events on the Instance object for quota failure + // modes so operators can diagnose issues via `kubectl describe`. + recorder record.EventRecorder + // projectIDForInstance derives the Milo project ID used for quota + // ResourceClaim management. In Milo mode it returns string(clusterName); in + // single-cell mode it reads the upstream-cluster-name label from the edge + // namespace and decodes "cluster-<project-id>" → "<project-id>". + projectIDForInstance InstanceProjectIDFunc + // projectNamespaceForInstance derives the in-project namespace where + // ResourceClaims must be created. In Milo mode the ResourceClaim lives in + // instance.Namespace (the project-level namespace); in single-cell mode the + // edge namespace is ns-{uid} which does not exist in the project control + // plane — the real namespace is the upstream-namespace label value (e.g. + // "default"). When nil, falls back to instance.Namespace. + projectNamespaceForInstance InstanceProjectNamespaceFunc + // clusterNameForProject maps a Milo project ID back to the multicluster + // ClusterName that owns that project's workloads. In Milo mode the + // ClusterName equals the project ID. In single-cell mode the only registered + // cluster is "single" regardless of project ID. When nil, falls back to + // multicluster.ClusterName(projectID), which is correct for Milo mode.
+ clusterNameForProject func(projectID string) multicluster.ClusterName + // FederationClient is an optional client pointing at the upstream + // Karmada/federation control plane (configured via --federation-kubeconfig). + // When non-nil, the reconciler writes a copy of each Instance back to the + // federation control plane so that the InstanceProjector (running in the + // management cluster) can aggregate status across all POP cells. Set to nil to + // disable federation write-back (e.g. in non-federation deployments). + FederationClient client.Client + finalizers finalizer.Finalizers } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/finalizers,verbs=update // +kubebuilder:rbac:groups=quota.miloapis.com,resources=resourceclaims,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (_ ctrl.Result, err error) { logger := log.FromContext(ctx) @@ -69,29 +213,24 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } + // Run the finalizer framework first. This handles downstream write-back cleanup + // via the Finalize method registered below. 
+ finalizationResult, err := r.finalizers.Finalize(ctx, &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &instance); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + logger.Info("reconciling instance") defer logger.Info("reconcile complete") if !instance.DeletionTimestamp.IsZero() { - if controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) - var claim quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed getting resource claim for deletion: %w", err) - } - } else { - if err := r.managementCluster.GetClient().Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { - return ctrl.Result{}, fmt.Errorf("failed deleting resource claim: %w", err) - } - } - - controllerutil.RemoveFinalizer(&instance, instanceQuotaFinalizer) - if err := cl.GetClient().Update(ctx, &instance); err != nil { - return ctrl.Result{}, fmt.Errorf("failed removing quota finalizer: %w", err) - } - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.reconcileDeletion(ctx, cl.GetClient(), req.ClusterName, &instance) } if !controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { @@ -102,94 +241,530 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, nil } - grantedCondition, err := r.reconcileQuotaClaim(ctx, req.ClusterName, &instance) + statusChanged, quotaErr := r.reconcileQuotaCondition(ctx, req.ClusterName, &instance) + + // Safety-net requeue while quota is not yet granted, computed up front so + // every return path below honors it. 
A conflict during the pending window + // must not drop the instance onto controller-runtime's exponential + // error-backoff (which can stretch to minutes), which would defeat recovery + // from a missed ResourceClaim grant event. Logged so the requeue is + // observable: a re-firing requeue prints this every pass while pending. + quotaReq := quotaPendingRequeueAfter(&instance, time.Now()) + if quotaReq > 0 { + logger.Info("quota pending; scheduling safety-net requeue", + "after", quotaReq.String(), "cluster", req.ClusterName.String(), "instance", instance.Name) + } + + // Transient errors from the quota and Ready-condition reconciles are + // returned only after any condition change has been persisted, so the + // failure reason is visible on the Instance while controller-runtime + // requeues with backoff. + readyChanged, readyErr := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) + + if statusChanged || readyChanged { + if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { + if quotaReq > 0 && apierrors.IsConflict(err) { + logger.Info("status update conflicted while quota pending; requeuing instead of error-backoff", + "after", quotaReq.String(), "instance", instance.Name) + return ctrl.Result{RequeueAfter: quotaReq}, nil + } + return ctrl.Result{}, err + } + if readyErr != nil { + return ctrl.Result{}, readyErr + } + // Return with the quota error (nil or transient) so controller-runtime + // requeues with backoff on failures. On the success path (quotaErr==nil) + // we fall through to removeQuotaSchedulingGate below instead of returning + // early, so the gate is cleared in the same reconcile pass rather than + // waiting for a requeue that may never come (ResourceClaim is immutable + // and local Instances are not watched). 
+ if quotaErr != nil { + if err := r.writeBackToUpstream(ctx, &instance); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, quotaErr + } + } else if readyErr != nil { + return ctrl.Result{}, readyErr + } else if quotaErr != nil { + // No status change but quota evaluation failed — return error to requeue. + return ctrl.Result{}, quotaErr + } + + if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + return ctrl.Result{}, err + } + + if err := r.writeBackToUpstream(ctx, &instance); err != nil { + if quotaReq > 0 && apierrors.IsConflict(err) { + logger.Info("upstream writeback conflicted while quota pending; requeuing instead of error-backoff", + "after", quotaReq.String(), "instance", instance.Name) + return ctrl.Result{RequeueAfter: quotaReq}, nil + } + return ctrl.Result{}, err + } + + if quotaReq > 0 { + logger.Info("requeuing instance", "after", quotaReq.String(), + "cluster", req.ClusterName.String(), "instance", instance.Name) + } + + return ctrl.Result{RequeueAfter: quotaReq}, nil +} + +// reconcileDeletion handles quota-claim cleanup when an Instance is being +// deleted. It removes the quota finalizer once the ResourceClaim is gone. +func (r *InstanceReconciler) reconcileDeletion(ctx context.Context, cl client.Client, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if !controllerutil.ContainsFinalizer(instance, instanceQuotaFinalizer) { + return nil + } + + if r.quotaClientManager != nil { + if err := r.cleanupQuotaClaim(ctx, clusterName, instance); err != nil { + if !errors.Is(err, errProjectIdentityUnresolvable) { + // Transient failure (API unreachable, quota client errors) — + // retry with backoff rather than risking an orphaned claim. + return err + } + // Unresolvable project identity must not wedge deletion: the identity + // labels are stamped at namespace creation, so absence is + // misconfiguration that no retry fixes. 
Log at ERROR and emit an event + // so the operator is aware, then fall through to finalizer removal so + // the Instance is not permanently stuck in Terminating. The orphaned + // claim will count against project budget until Milo's TTL/GC removes it. + log.FromContext(ctx).Error(err, "project identity unresolvable during deletion; ResourceClaim may be orphaned — budget leak possible", + "instance", instance.Name, "namespace", instance.Namespace) + r.recorder.Event(instance, corev1.EventTypeWarning, + "QuotaClaimOrphaned", + "Skipping ResourceClaim cleanup: project identity could not be resolved; claim may be orphaned in Milo project control plane") + quotametrics.ClaimOrphanedTotal.Inc() + } + } + + controllerutil.RemoveFinalizer(instance, instanceQuotaFinalizer) + if err := cl.Update(ctx, instance); err != nil { + return fmt.Errorf("failed removing quota finalizer: %w", err) + } + return nil +} + +// cleanupQuotaClaim deletes the ResourceClaim backing an Instance from the +// project control plane. Errors wrapping errProjectIdentityUnresolvable mean +// the claim cannot even be located; the caller decides whether deletion +// proceeds without cleanup. 
+func (r *InstanceReconciler) cleanupQuotaClaim(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + projectID, err := r.resolveProjectID(ctx, clusterName, instance) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling quota claim: %w", err) + return fmt.Errorf("resolving project ID during deletion: %w", err) } - statusChanged := false + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + return fmt.Errorf("failed getting quota client for deletion: %w", err) + } + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project namespace during deletion: %w", err) + } + claimName := quotaClaimName(instance) + var claim quotav1alpha1.ResourceClaim + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: claimNamespace, Name: claimName}, &claim); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed getting resource claim for deletion: %w", err) + } + return nil + } + if err := projectClient.Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed deleting resource claim: %w", err) + } + return nil +} + +// quotaClaimName returns the name of the ResourceClaim backing an Instance's +// quota: the Instance name (unique among Instances within the project control +// plane) prefixed by instanceQuotaClaimNamePrefix to avoid colliding with other +// resource kinds' claims in the shared quota namespace. The owning Instance's +// namespace is preserved on the claim via instanceQuotaClaimNamespaceLabel so +// the claim watch can map a grant back to the Instance. 
+func quotaClaimName(instance *computev1alpha.Instance) string { + return instanceQuotaClaimNamePrefix + instance.Name +} + +// quotaPendingRequeueAfter returns a safety-net requeue interval while the +// instance's quota is not yet granted, backing off the longer it has waited (see +// the quotaPendingRequeue* constants). It returns 0 when quota is already granted +// (QuotaGranted=True) or the condition is absent, so a granted/normal instance is +// not needlessly requeued. +// +// Elapsed time is anchored on the instance's creation timestamp, NOT the +// QuotaGranted condition's LastTransitionTime: while quota is pending the +// condition stays Unknown (PendingEvaluation and NoBudget are both Unknown), so +// SetStatusCondition never bumps LastTransitionTime off its 1970-01-01 CRD +// default — which would peg every pending instance to the slowest tier. +func quotaPendingRequeueAfter(instance *computev1alpha.Instance, now time.Time) time.Duration { + cond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if cond == nil || cond.Status == metav1.ConditionTrue { + return 0 + } + elapsed := now.Sub(instance.CreationTimestamp.Time) switch { - case grantedCondition == nil || (grantedCondition.Status == metav1.ConditionFalse && grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason): - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case elapsed < quotaPendingFastWindow: + return quotaPendingRequeueFast + case elapsed < quotaPendingMediumWindow: + return quotaPendingRequeueMedium + case elapsed < quotaPendingSlowWindow: + return quotaPendingRequeueSlow + default: + return quotaPendingRequeueIdle + } +} + +// reconcileQuotaCondition reconciles the ResourceClaim and updates the +// InstanceQuotaGranted status condition. 
It returns (changed, err) where +// changed=true means a status update is required, and err non-nil means the +// reconciler should requeue (with backoff) in addition to writing the condition. +func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (bool, error) { + grantedCondition, claimErr := r.reconcileQuotaClaim(ctx, clusterName, instance) + + // reconcileQuotaClaim returns (condition, err). A non-nil error signals a + // transient infrastructure failure; a non-nil condition carries the reason to + // write. Both can be non-nil: write the condition AND requeue with backoff. + switch { + case grantedCondition == nil && claimErr == nil: + // No grant decision yet: the claim was just created or carries no + // Granted condition. Stay PendingEvaluation until the claim watch or + // the safety-net requeue observes the decision. + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionUnknown, Reason: computev1alpha.InstanceQuotaGrantedReasonPendingEvaluation, Message: "Waiting for quota evaluation", ObservedGeneration: instance.Generation, + }), nil + + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionFalse && + grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason: + // Claim exists but pending — no AllowanceBucket. Distinct from "evaluating". 
+ changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceQuotaGrantedReasonNoBudget, + Message: "ResourceClaim is pending: no AllowanceBucket configured for this project", + ObservedGeneration: instance.Generation, + }) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonNoBudget, + "ResourceClaim pending: no AllowanceBucket configured for this project") + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonNoBudget).Inc() + return changed, claimErr + + case grantedCondition != nil && grantedCondition.Type == computev1alpha.InstanceQuotaGranted: + // reconcileQuotaClaim populated a structured failure condition. + changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: grantedCondition.Status, + Reason: grantedCondition.Reason, + Message: grantedCondition.Message, + ObservedGeneration: instance.Generation, }) + return changed, claimErr - case grantedCondition.Status == metav1.ConditionTrue: - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionTrue: + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr - case grantedCondition.Status == metav1.ConditionFalse: + case grantedCondition != nil: // False, non-pending reason from ResourceClaim reason := computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded if grantedCondition.Reason == quotav1alpha1.ResourceClaimValidationFailedReason { reason = 
computev1alpha.InstanceQuotaGrantedReasonValidationFailed } - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionFalse, Reason: reason, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr + + default: // grantedCondition == nil && claimErr != nil — should not reach here + return false, claimErr } +} - readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) +// removeQuotaSchedulingGate removes the quota scheduling gate from the +// Instance spec once QuotaGranted=True has been persisted to status. +// It guards on ObservedGeneration to prevent a stale True condition from +// generation N unblocking a generation N+1 instance before quota for the +// new spec has been evaluated. +func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { + return nil + } + // Stale condition guard: only remove the gate if the condition reflects the + // current spec generation. A condition from an older generation means quota + // has not yet been evaluated for the current spec. 
+ if quotaGrantedCond.ObservedGeneration != instance.Generation { + return nil + } + if instance.Spec.Controller == nil { + return nil + } + + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gateRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + if gate.Name == instancecontrol.QuotaSchedulingGate.String() { + gateRemoved = true + continue + } + newGates = append(newGates, gate) + } + if !gateRemoved { + return nil + } + + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching quota scheduling gate: %w", err) + } + return nil +} + +// Finalize removes the downstream write-back Instance when the local Instance is +// deleted. It is a no-op when downstream federation is disabled. +func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + instance := obj.(*computev1alpha.Instance) + + downstreamInstance := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(instance), downstreamInstance) + if apierrors.IsNotFound(err) { + return finalizer.Result{}, nil + } if err != nil { - return ctrl.Result{}, err + return finalizer.Result{}, fmt.Errorf("failed getting downstream instance for deletion: %w", err) } - if statusChanged || readyChanged { - if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { - return ctrl.Result{}, err + if err := r.FederationClient.Delete(ctx, downstreamInstance); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed deleting downstream write-back instance: %w", err) + } + + return finalizer.Result{}, nil +} + +// writeBackToUpstream copies the Instance spec and status to the upstream +// Karmada/federation control plane so 
that the InstanceProjector can aggregate +// state from all POP cells. It is a no-op when FederationClient is nil (federation disabled). +func (r *InstanceReconciler) writeBackToUpstream(ctx context.Context, instance *computev1alpha.Instance) error { + if r.FederationClient == nil { + return nil + } + + // Read the upstream project namespace name and encoded cluster name from + // the federation-plane namespace. The federator stamps both labels + // atomically when it creates the namespace, before any cell Instance can + // exist in it, so they are the sole source of write-back identity: a + // failed read or a missing label is corruption, never a propagation race. + // Deriving substitute values here would write WRONG identity upstream, + // where the InstanceProjector could mislink the projection — erroring + // retries with backoff instead. + var downstreamNS corev1.Namespace + if err := r.FederationClient.Get(ctx, client.ObjectKey{Name: instance.Namespace}, &downstreamNS); err != nil { + return fmt.Errorf("failed getting federation namespace %q for write-back identity: %w", instance.Namespace, err) + } + upstreamNamespace := downstreamNS.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if upstreamNamespace == "" { + return fmt.Errorf("federation namespace %q is missing the %s label required for write-back identity", + instance.Namespace, downstreamclient.UpstreamOwnerNamespaceLabel) + } + encodedClusterName := downstreamNS.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encodedClusterName == "" { + return fmt.Errorf("federation namespace %q is missing the %s label required for write-back identity", + instance.Namespace, downstreamclient.UpstreamOwnerClusterNameLabel) + } + + // The write-back copy must carry every label the stateful control strategy + // stamps atomically at Instance creation (with a backfill pass converging + // live instances), so absence of any of them is transient. 
Erroring retries + // with backoff until the labels land, instead of propagating incomplete + // identity upstream where the projection could never be linked back to its + // owners or routed by city/placement. + var missingLabels []string + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + computev1alpha.WorkloadDeploymentNameLabel, + computev1alpha.CityCodeLabel, + computev1alpha.WorkloadNameLabel, + computev1alpha.PlacementNameLabel, + } { + if instance.Labels[key] == "" { + missingLabels = append(missingLabels, key) } - // Return after the status update so that the next reconcile sees the - // updated QuotaGranted condition before attempting spec changes. - return ctrl.Result{}, nil + } + if len(missingLabels) > 0 { + return fmt.Errorf("instance %s/%s is missing linking labels required for write-back: %s", + instance.Namespace, instance.Name, strings.Join(missingLabels, ", ")) } - // Remove the quota scheduling gate once QuotaGranted=True is persisted. 
- quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue { - if instance.Spec.Controller != nil { - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if gateRemoved { - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.GetClient().Patch(ctx, &instance, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - } + writeBack := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name, + Namespace: instance.Namespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedClusterName, + downstreamclient.UpstreamOwnerNamespaceLabel: upstreamNamespace, + computev1alpha.WorkloadUIDLabel: instance.Labels[computev1alpha.WorkloadUIDLabel], + computev1alpha.WorkloadDeploymentUIDLabel: instance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + computev1alpha.InstanceIndexLabel: instance.Labels[computev1alpha.InstanceIndexLabel], + computev1alpha.WorkloadDeploymentNameLabel: instance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + computev1alpha.CityCodeLabel: instance.Labels[computev1alpha.CityCodeLabel], + computev1alpha.WorkloadNameLabel: instance.Labels[computev1alpha.WorkloadNameLabel], + computev1alpha.PlacementNameLabel: instance.Labels[computev1alpha.PlacementNameLabel], + }, + }, + Spec: instance.Spec, + } + + existing := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(writeBack), existing) + if apierrors.IsNotFound(err) { + // The 
federation namespace already exists: the identity Get above read + // it, and the federator guarantees a labeled namespace before any cell + // Instance can exist. If it disappears between the Get and this Create, + // the Create fails NotFound and retries via backoff until the federator + // restores it — creating an unlabeled namespace here would manufacture + // the very corruption the identity checks reject. + if err := r.FederationClient.Create(ctx, writeBack); err != nil { + return fmt.Errorf("failed creating downstream write-back instance: %w", err) + } + writeBack.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, writeBack); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status after create: %w", err) + } + return nil + } + if err != nil { + return fmt.Errorf("failed getting downstream instance: %w", err) + } + + // Build a comparable map containing only the keys this function owns so that + // Karmada-managed labels on the existing object do not cause spurious updates. + ownedLabels := make(map[string]string, len(writeBack.Labels)) + for k := range writeBack.Labels { + ownedLabels[k] = existing.Labels[k] + } + + if !apiequality.Semantic.DeepEqual(existing.Spec, instance.Spec) || + !apiequality.Semantic.DeepEqual(ownedLabels, writeBack.Labels) { + existing.Spec = instance.Spec + // Merge writeBack.Labels into existing.Labels. Only keys owned by + // writeBackToUpstream are written; any labels Karmada or other actors + // have placed on the downstream object are preserved. 
+ if existing.Labels == nil { + existing.Labels = make(map[string]string) + } + maps.Copy(existing.Labels, writeBack.Labels) + if err := r.FederationClient.Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance: %w", err) + } + } + + if !apiequality.Semantic.DeepEqual(existing.Status, instance.Status) { + existing.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status: %w", err) } } - return ctrl.Result{}, nil + return nil } -func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (*metav1.Condition, error) { +// reconcileQuotaClaim attempts to create or observe a ResourceClaim for the +// given instance. It returns: +// - (nil, nil) — no grant decision yet: the claim was just created or +// carries no Granted condition; caller sets PendingEvaluation +// - (condition, nil) — terminal condition (True/False/Unknown from claim or failure) +// - (condition, err) — condition to write + transient error to requeue with backoff +// +// The condition's Type field is always InstanceQuotaGranted when set by this function +// to distinguish it from ResourceClaim conditions returned directly. 
+func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (*metav1.Condition, error) { + if r.quotaClientManager == nil { + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, + Message: "Quota enforcement disabled: no credential configured", + }, nil + } + logger := log.FromContext(ctx) - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + // Transient (namespace API unreachable) or permanent (identity labels + // missing — misconfiguration). Either way the failure is surfaced: + // structured condition + warning event + error return, instead of + // silently parking the instance at PendingEvaluation. + msg := fmt.Sprintf("Could not resolve project ID: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, fmt.Errorf("resolving project ID for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + msg := fmt.Sprintf("Failed to build quota client for project %q: %v", projectID, err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: 
metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting quota client for project %q: %w", projectID, err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + msg := fmt.Sprintf("Could not resolve project namespace: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, fmt.Errorf("resolving project namespace for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + + claimName := quotaClaimName(instance) requests := []quotav1alpha1.ResourceRequest{ { - ResourceType: "compute.datumapis.com/instances", + ResourceType: quotaResourceTypeInstances, Amount: 1, }, } @@ -213,39 +788,99 @@ func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterNam desired := &quotav1alpha1.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{ Name: claimName, - Namespace: instance.Namespace, + Namespace: claimNamespace, + Labels: map[string]string{ + instanceQuotaClaimSourceLabel: r.edgeClusterName, + instanceQuotaClaimNamespaceLabel: instance.Namespace, + }, }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", - Name: clusterName, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instance.Name, - Namespace: instance.Namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, Requests:
requests, }, } var existing quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { - if !apierrors.IsNotFound(err) { - return nil, fmt.Errorf("failed getting resource claim: %w", err) - } - if err := r.managementCluster.GetClient().Create(ctx, desired); err != nil { - return nil, fmt.Errorf("failed creating resource claim: %w", err) + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { + if apierrors.IsNotFound(err) { + // Claim doesn't exist yet — attempt to create it. + createErr := projectClient.Create(ctx, desired) + if createErr == nil { + return nil, nil + } + return r.classifyCreateError(instance, projectID, claimNamespace, createErr) } - return nil, nil + // GET itself failed — treat as backend unavailable. + msg := fmt.Sprintf("Quota backend unreachable getting ResourceClaim: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting resource claim: %w", err) } grantedCondition := apimeta.FindStatusCondition(existing.Status.Conditions, quotav1alpha1.ResourceClaimGranted) return grantedCondition, nil } +// classifyCreateError maps a ResourceClaim creation error to a structured +// QuotaGranted condition with a specific reason, emits a Kubernetes event, and +// increments the appropriate metric counter. 
+func (r *InstanceReconciler) classifyCreateError( + instance *computev1alpha.Instance, + projectID, claimNamespace string, + err error, +) (*metav1.Condition, error) { + var reason, metricLabel, msg string + + switch { + case apierrors.IsNotFound(err): + // 404 on Create: either the project control plane path doesn't exist + // (project deleted) or the namespace doesn't exist yet. + if claimNamespace != "" { + reason = computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound + metricLabel = quotametrics.ReasonNamespaceNotFound + msg = fmt.Sprintf("Quota claim namespace %q not found on project %q control plane", claimNamespace, projectID) + } else { + reason = computev1alpha.InstanceQuotaGrantedReasonProjectNotFound + metricLabel = quotametrics.ReasonProjectNotFound + msg = fmt.Sprintf("Milo project %q not found", projectID) + } + case apierrors.IsForbidden(err) || apierrors.IsInvalid(err): + // 403/422: quota admission plugin rejected the claim. + reason = computev1alpha.InstanceQuotaGrantedReasonMisconfigured + metricLabel = quotametrics.ReasonMisconfigured + msg = fmt.Sprintf("Quota admission rejected ResourceClaim for project %q: %v", projectID, err) + default: + // Connectivity or server error — treat as backend unavailable. 
+ reason = computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable + metricLabel = quotametrics.ReasonBackendUnavailable + msg = fmt.Sprintf("Quota backend unreachable creating ResourceClaim: %v", err) + } + + r.recorder.Event(instance, corev1.EventTypeWarning, reason, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(metricLabel).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: reason, + Message: msg, + }, fmt.Errorf("failed creating resource claim: %w", err) +} + func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores int64, memMiB int64, resolved bool) { rt := instance.Spec.Runtime if rt.Sandbox != nil { @@ -327,7 +962,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, ObservedGeneration: instance.Generation, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, } } else { readyCondition = readyCondition.DeepCopy() @@ -344,8 +979,9 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( return false, fmt.Errorf("failed checking for network creation failure: %w", err) } + readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { - readyCondition.Reason = "NetworkFailedToCreate" + readyCondition.Reason = reasonNetworkFailedToCreate readyCondition.Message = networkCreationFailureMessage } else { readyCondition.Reason = computev1alpha.InstanceReadyReasonSchedulingGatesPresent @@ -360,12 +996,13 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not programmed", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming if programmedCondition != nil && programmedCondition.Reason != 
pendingReason { readyCondition.Reason = programmedCondition.Reason } - readyCondition.Message = "Instance has not been programmed" + readyCondition.Message = msgNotProgrammed if programmedCondition != nil && programmedCondition.Status != metav1.ConditionUnknown { readyCondition.Message = programmedCondition.Message } @@ -375,18 +1012,19 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( logger.Info("instance is programmed", "instance", instance.Name) - runningCondition := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceAvailable) - if runningCondition == nil || runningCondition.Status != metav1.ConditionTrue { - logger.Info("instance is not running", "instance", instance.Name) + availableCondition := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceAvailable) + if availableCondition == nil || availableCondition.Status != metav1.ConditionTrue { + logger.Info("instance is not available", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = pendingReason - if runningCondition != nil && runningCondition.Reason != pendingReason { - readyCondition.Reason = runningCondition.Reason + if availableCondition != nil && availableCondition.Reason != pendingReason { + readyCondition.Reason = availableCondition.Reason } - readyCondition.Message = "Instance is not running" - if runningCondition != nil && runningCondition.Status != metav1.ConditionUnknown { - readyCondition.Message = runningCondition.Message + readyCondition.Message = "Instance is not available" + if availableCondition != nil && availableCondition.Status != metav1.ConditionUnknown { + readyCondition.Message = availableCondition.Message } return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil @@ -394,7 +1032,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( readyCondition.Status = metav1.ConditionTrue readyCondition.Reason = 
computev1alpha.InstanceReadyReasonAvailable - readyCondition.Message = "Instance is ready" + readyCondition.Message = msgInstanceReady return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } @@ -436,38 +1074,111 @@ func (r *InstanceReconciler) checkForNetworkCreationFailure(ctx context.Context, return false, "", nil } +// resolveProjectID delegates to projectIDForInstance; when nil it falls back +// to string(clusterName) (Milo mode). +func (r *InstanceReconciler) resolveProjectID(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectIDForInstance != nil { + return r.projectIDForInstance(ctx, clusterName, instance) + } + return string(clusterName), nil +} + +// resolveProjectNamespace delegates to projectNamespaceForInstance; when nil +// it falls back to instance.Namespace (Milo mode). +func (r *InstanceReconciler) resolveProjectNamespace(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectNamespaceForInstance != nil { + return r.projectNamespaceForInstance(ctx, clusterName, instance) + } + return instance.Namespace, nil +} + +// resolveClusterNameForProject delegates to clusterNameForProject; when nil it +// falls back to multicluster.ClusterName(projectID) (Milo mode). +func (r *InstanceReconciler) resolveClusterNameForProject(projectID string) multicluster.ClusterName { + if r.clusterNameForProject != nil { + return r.clusterNameForProject(projectID) + } + return multicluster.ClusterName(projectID) +} + // SetupWithManager sets up the controller with the Manager. -func (r *InstanceReconciler) SetupWithManager(mgr mcmanager.Manager, managementCluster cluster.Cluster) error { +// +// quotaRestConfig is the REST config used to reach Milo project control planes +// for ResourceClaim management. Pass nil to disable quota accounting. 
+// +// projectIDForInstance derives the Milo project ID for each reconcile request. +// In Milo mode pass nil (falls back to using ClusterName). In single-cell mode +// pass a function that decodes the project ID from the edge namespace's +// upstream-cluster-name label. +// +// clusterNameForProject maps a project ID back to the multicluster ClusterName. +// In Milo mode pass nil (falls back to ClusterName(projectID)). In single-cell +// mode pass a function that always returns "single". +func (r *InstanceReconciler) SetupWithManager( + mgr mcmanager.Manager, + quotaRestConfig *rest.Config, + projectIDForInstance InstanceProjectIDFunc, + projectNamespaceForInstance InstanceProjectNamespaceFunc, + edgeClusterName string, + clusterNameForProject func(projectID string) multicluster.ClusterName, +) error { r.mgr = mgr - r.managementCluster = managementCluster - - // Watch ResourceClaim objects on the management cluster directly, bypassing - // the multicluster clusterInjectingQueue which would overwrite ClusterName. - // Using ctrlsource.TypedKind lets the handler produce mcreconcile.Request - // values with the correct ClusterName taken from claim.Spec.ConsumerRef.Name. 
- claimSource := ctrlsource.TypedKind( - managementCluster.GetCache(), - &quotav1alpha1.ResourceClaim{}, - handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, claim *quotav1alpha1.ResourceClaim) []mcreconcile.Request { - if claim.Spec.ResourceRef.Kind != "Instance" || claim.Spec.ResourceRef.APIGroup != "compute.datumapis.com" { - return nil - } - return []mcreconcile.Request{ - { - Request: reconcile.Request{ - NamespacedName: types.NamespacedName{ - Name: claim.Spec.ResourceRef.Name, - Namespace: claim.Spec.ResourceRef.Namespace, - }, - }, - ClusterName: claim.Spec.ConsumerRef.Name, - }, - } - }), - ) + r.scheme = mgr.GetLocalManager().GetScheme() + //nolint:staticcheck // GetEventRecorder (new events API) has an incompatible Eventf + // signature (requires related object + action args) that would require migrating + // all emit sites. GetEventRecorderFor remains correct; migration is deferred. + r.recorder = mgr.GetLocalManager().GetEventRecorderFor("instance-controller") + r.edgeClusterName = edgeClusterName + r.projectIDForInstance = projectIDForInstance + r.projectNamespaceForInstance = projectNamespaceForInstance + r.clusterNameForProject = clusterNameForProject + if quotaRestConfig != nil { + if edgeClusterName == "" { + return fmt.Errorf("edgeClusterName must be set when quota enforcement is enabled; set discovery.clusterName in the server config") + } + r.quotaClientManager = quotametrics.New(quotaRestConfig) + } + + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(instanceControllerFinalizer, r); err != nil { + return fmt.Errorf("failed to register finalizer: %w", err) + } + + edgeClusterNameVal := r.edgeClusterName return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Instance{}, mcbuilder.WithEngageWithLocalCluster(false)). - WatchesRawSource(claimSource).
+ Watches( + &quotav1alpha1.ResourceClaim{}, + func(_ multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc( + func(ctx context.Context, obj client.Object) []mcreconcile.Request { + claim := obj.(*quotav1alpha1.ResourceClaim) + // Map the claim back to its owning Instance. The Instance + // namespace is carried on a label (the claim itself lives in + // the project's quota namespace) and the Instance name is the + // claim name with the resource-kind prefix stripped. + instanceNamespace := claim.GetLabels()[instanceQuotaClaimNamespaceLabel] + if instanceNamespace == "" { + return nil + } + return []mcreconcile.Request{ + { + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: instanceNamespace, + Name: strings.TrimPrefix(claim.Name, instanceQuotaClaimNamePrefix), + }, + }, + ClusterName: r.resolveClusterNameForProject(claim.Spec.ConsumerRef.Name), + }, + } + }, + ) + }, + mcbuilder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { + return obj.GetLabels()[instanceQuotaClaimSourceLabel] == edgeClusterNameVal + })), + ).
Complete(r) } diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index b356d433..202b58b1 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -3,8 +3,8 @@ package controller import ( "context" "fmt" - "net/http" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -12,50 +12,40 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" + "go.datum.net/compute/internal/quota" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" ) -// fakeCluster implements cluster.Cluster for testing using a fake client. 
-type fakeCluster struct { - client client.Client - scheme *runtime.Scheme -} - -func (f *fakeCluster) GetHTTPClient() *http.Client { return nil } -func (f *fakeCluster) GetConfig() *rest.Config { return nil } -func (f *fakeCluster) GetCache() cache.Cache { return nil } -func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.scheme } -func (f *fakeCluster) GetClient() client.Client { return f.client } -func (f *fakeCluster) GetFieldIndexer() client.FieldIndexer { return nil } -func (f *fakeCluster) GetEventRecorderFor(string) record.EventRecorder { return nil } -func (f *fakeCluster) GetRESTMapper() apimeta.RESTMapper { return nil } -func (f *fakeCluster) GetAPIReader() client.Reader { return f.client } -func (f *fakeCluster) Start(context.Context) error { return nil } - -// fakeMCManager is a minimal multicluster manager that returns a single cluster. -type fakeMCManager struct { - clusters map[string]cluster.Cluster -} - -func (m *fakeMCManager) GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) { - cl, ok := m.clusters[clusterName] - if !ok { - return nil, fmt.Errorf("cluster %q not found", clusterName) - } - return cl, nil -} +// Test constants for repeated string literals across controller package tests. +const ( + testInstanceName = "test-instance" + testReasonString = "TestReason" + testMessageString = "Test message" + testUIDString = "test-uid" + testInstanceType = "d1-standard-2" + testDefaultPlacement = "default" + testDefaultNamespace = "default" + testEdgeClusterName = "test-edge" + testComputeAPIVersion = "compute.datumapis.com/v1alpha" + testQuotaAPIGroup = "quota.miloapis.com" + testQuotaResource = "resourceclaims" + kindWorkloadDeploymentTest = "WorkloadDeployment" // mirrors kindWorkloadDeployment +) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. 
func newTestScheme(t *testing.T) *runtime.Scheme { @@ -79,8 +69,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance without ready condition should create default", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, }, @@ -89,7 +79,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, ObservedGeneration: 1, }, }, @@ -97,8 +87,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates should set scheduling gates present", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -114,7 +104,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -134,8 +124,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates and network failure should set network failed", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -153,7 +143,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: 
metav1.ConditionFalse, - Reason: "NetworkFailedToCreate", + Reason: reasonNetworkFailedToCreate, Message: "Network creation failed: timeout", ObservedGeneration: 1, }, @@ -162,8 +152,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance not programmed should set pending programming", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -171,8 +161,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -181,17 +171,17 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, { - name: "instance programmed but not running should wait for running", + name: "instance programmed but not available should wait for available", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -200,13 +190,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ 
-215,8 +205,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -224,8 +214,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance fully ready should set ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -234,13 +224,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, }, }, }, @@ -250,7 +240,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonAvailable, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -258,8 +248,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "no change when condition already matches", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -268,7 +258,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: 
computev1alpha.InstanceReadyReasonAvailable, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -276,13 +266,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, }, }, }, @@ -292,7 +282,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonAvailable, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -343,8 +333,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota denied blocks ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -360,14 +350,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, LastTransitionTime: metav1.Now(), }, }, @@ -385,8 +375,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota available does not 
block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -402,14 +392,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceAvailable, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceAvailableReasonAvailable, - Message: "Instance is running", + Message: msgInstanceAvailable, LastTransitionTime: metav1.Now(), }, }, @@ -420,15 +410,15 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonAvailable, - Message: "Instance is ready", + Message: msgInstanceReady, }, }, { name: "quota pending unknown does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -448,7 +438,7 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, }, }, } @@ -491,7 +481,7 @@ func TestReconcileQuota(t *testing.T) { instanceName = "my-instance" ) - claimName := namespace + "--" + instanceName + claimName := instanceQuotaClaimNamePrefix + instanceName const deploymentName = "my-deployment" @@ -501,25 +491,28 @@ func TestReconcileQuota(t *testing.T) { 
ObjectMeta: metav1.ObjectMeta{ Name: deploymentName, Namespace: namespace, - UID: "test-uid", + UID: testUIDString, }, } } // makeInstance creates a test Instance with an owner reference to the // deployment so that checkForNetworkCreationFailure can look it up. + // Both finalizers are pre-populated so that the finalizer framework does + // not need to add instanceControllerFinalizer on the first reconcile, + // which would cause an early return before quota logic runs. makeInstance := func(_ *runtime.Scheme, gates ...computev1alpha.SchedulingGate) *computev1alpha.Instance { return &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ Name: instanceName, - Namespace: namespace, - Finalizers: []string{instanceQuotaFinalizer}, + Namespace: testDefaultNamespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, OwnerReferences: []metav1.OwnerReference{ { - APIVersion: "compute.datumapis.com/v1alpha", - Kind: "WorkloadDeployment", + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, Name: deploymentName, - UID: "test-uid", + UID: testUIDString, Controller: func() *bool { b := true; return &b }(), }, }, @@ -529,7 +522,7 @@ func TestReconcileQuota(t *testing.T) { SchedulingGates: gates, }, Runtime: computev1alpha.InstanceRuntimeSpec{ - Resources: computev1alpha.InstanceRuntimeResources{InstanceType: "d1-standard-2"}, + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, }, NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, }, @@ -544,18 +537,21 @@ func TestReconcileQuota(t *testing.T) { }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, Name: clusterName, }, + // ResourceRef points at the Project resource (cluster-scoped), not the + // Instance. 
The quota admission plugin validates against the + // ResourceRegistration's claimingResources, which only allows + // resourcemanager.miloapis.com/Project. ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instanceName, - Namespace: namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: clusterName, }, Requests: []quotav1alpha1.ResourceRequest{ - {ResourceType: "compute.datumapis.com/instances", Amount: 1}, + {ResourceType: quotaResourceTypeInstances, Amount: 1}, }, }, Status: quotav1alpha1.ResourceClaimStatus{ @@ -572,7 +568,7 @@ func TestReconcileQuota(t *testing.T) { } } - newReconciler := func(t *testing.T, projectObjs []client.Object, mgmtObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { + newReconciler := func(t *testing.T, projectObjs []client.Object, quotaObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { t.Helper() s := newTestScheme(t) @@ -582,26 +578,44 @@ func TestReconcileQuota(t *testing.T) { WithStatusSubresource(&computev1alpha.Instance{}). Build() - mgmtClient := fake.NewClientBuilder(). + quotaClient := fake.NewClientBuilder(). WithScheme(s). - WithObjects(mgmtObjs...). + WithObjects(quotaObjs...). WithStatusSubresource("av1alpha1.ResourceClaim{}). Build() mgr := &fakeMCManager{ clusters: map[string]cluster.Cluster{ - clusterName: &fakeCluster{client: projectClient, scheme: s}, + clusterName: newFakeCluster(projectClient), }, } + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + r := &InstanceReconciler{ - mgr: mgr, - managementCluster: &fakeCluster{client: mgmtClient, scheme: s}, + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + // Milo mode: project ID == ClusterName; claim namespace == instance.Namespace. 
+ projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + // nil → falls back to instance.Namespace, which is correct for Milo mode. + projectNamespaceForInstance: nil, } - return r, projectClient, mgmtClient + + // Initialize the finalizer registry so that r.finalizers.Finalize is not + // a nil-pointer dereference. SetupWithManager does this in production; in + // tests we replicate the same steps manually. + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + return r, projectClient, quotaClient } - t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True", func(t *testing.T) { + t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True in single reconcile", func(t *testing.T) { s := newTestScheme(t) instance := makeInstance(s, computev1alpha.SchedulingGate{Name: instancecontrol.NetworkSchedulingGate.String()}, @@ -611,7 +625,10 @@ func TestReconcileQuota(t *testing.T) { r, projectClient, _ := newReconciler(t, []client.Object{instance, makeDeployment()}, []client.Object{claim}) - // First reconcile: sets QuotaGranted=True in status, returns early. + // Single reconcile: sets QuotaGranted=True in status AND removes the + // Quota scheduling gate in the same pass. The early-return-before-gate- + // removal bug required a second reconcile that never arrived because + // ResourceClaims are immutable and local Instances are not watched. 
_, err := r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -623,22 +640,41 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) - // Second reconcile: status is already set, so removes the scheduling gate. - _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) - hasQuotaGate := false for _, g := range updated.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate must be removed in the same reconcile pass as the status update") }) - t.Run("quota exceeded flow: conditions cascade to block Programmed/Running/Ready", func(t *testing.T) { + t.Run("ready-condition reconcile error: quota condition persisted before the error returns", func(t *testing.T) { + s := newTestScheme(t) + // A scheduling gate keeps the Ready-condition reconcile on the network + // failure checker path, and the missing owner reference makes that + // checker fail. 
+ instance := makeInstance(s, + computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, + ) + instance.OwnerReferences = nil + claim := makeClaim(s, metav1.ConditionTrue, quotav1alpha1.ResourceClaimGrantedReason) + + r, projectClient, _ := newReconciler(t, []client.Object{instance, makeDeployment()}, []client.Object{claim}) + + _, err := r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, + "QuotaGranted condition must be persisted even when the Ready-condition reconcile fails") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) + }) + + t.Run("quota exceeded flow: conditions cascade to block Programmed/Available/Ready", func(t *testing.T) { s := newTestScheme(t) instance := makeInstance(s, computev1alpha.SchedulingGate{Name: instancecontrol.NetworkSchedulingGate.String()}, @@ -664,10 +700,10 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionFalse, programmedCond.Status) assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, programmedCond.Reason) - runningCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceAvailable) - require.NotNil(t, runningCond) - assert.Equal(t, metav1.ConditionFalse, runningCond.Status) - assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, runningCond.Reason) + availableCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceAvailable) + require.NotNil(t, availableCond) + assert.Equal(t, metav1.ConditionFalse, 
availableCond.Status) + assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, availableCond.Reason) readyCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceReady) require.NotNil(t, readyCond) @@ -709,7 +745,9 @@ func TestReconcileQuota(t *testing.T) { } require.NoError(t, mgmtClient.Status().Update(context.Background(), &existingClaim)) - // Second reconcile should see granted claim and update status. + // Second reconcile should see the granted claim, update status to + // QuotaGranted=True, AND remove the gate in the same pass (no third + // reconcile required). _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -719,28 +757,41 @@ func TestReconcileQuota(t *testing.T) { require.NotNil(t, quotaCond) assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) - // Third reconcile removes the gate (status is already true, no more status write needed). 
- _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &recovered)) hasQuotaGate := false for _, g := range recovered.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed after quota granted") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate should be removed in the same reconcile pass that sets QuotaGranted=True") }) t.Run("deleted before grant: finalizer deletes claim and is removed", func(t *testing.T) { s := newTestScheme(t) now := metav1.Now() - instance := makeInstance(s, - computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, - ) - instance.DeletionTimestamp = &now + // Build the instance directly without instanceControllerFinalizer to + // represent the state after the Karmada finalizer has already been + // cleaned up; only the quota finalizer remains to be processed. 
+ instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } claim := makeClaim(s, metav1.ConditionFalse, quotav1alpha1.ResourceClaimPendingReason) @@ -766,3 +817,1031 @@ func TestReconcileQuota(t *testing.T) { } }) } + +// TestQuotaGateRemovedInSingleReconcile is a regression test for the bug where +// the Quota scheduling gate was never removed from an Instance after quota was +// granted. The root cause was an early return in the Reconcile function: when +// reconcileQuotaCondition set QuotaGranted=True (statusChanged=true), the code +// wrote the status update and returned before reaching removeQuotaSchedulingGate. +// Because ResourceClaims are immutable (no further transitions) and local +// Instances are not watched (WithEngageWithLocalCluster(false)), no requeue ever +// arrived — leaving the Quota gate stranded in spec.controller.schedulingGates +// and the projected Instance stuck "Pending (SchedulingGatesPresent)". 
+func TestQuotaGateRemovedInSingleReconcile(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "my-instance" + deploymentName = "my-deployment" + ) + + claimName := instanceQuotaClaimNamePrefix + instanceName + + tests := []struct { + name string + initialGates []computev1alpha.SchedulingGate + expectGateGone bool + }{ + { + name: "Quota gate only: removed in single reconcile when claim is granted", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "Quota gate plus Network gate: Quota removed, Network preserved", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "No gates: no-op, reconcile completes cleanly", + initialGates: []computev1alpha.SchedulingGate{}, + expectGateGone: false, // no gate to begin with + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + Generation: 1, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: tt.initialGates, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: 
metav1.ObjectMeta{Name: deploymentName, Namespace: namespace, UID: testUIDString}, + } + + // ResourceClaim already in QuotaAvailable state — simulates the state + // that triggered the bug: claim already granted but gate still present. + claim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota available", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Exactly one reconcile — must be sufficient to both set QuotaGranted=True + // and remove the Quota gate. No second reconcile should be required. + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + }) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + + // QuotaGranted condition must be set to True. + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be present") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Quota gate must be gone after the single reconcile. 
+ hasQuotaGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasQuotaGate = true + } + } + if tt.expectGateGone { + assert.False(t, hasQuotaGate, + "Quota gate must be removed in the same reconcile pass as the QuotaGranted=True status write; "+ + "a stranded gate leaves the projected Instance stuck Pending (SchedulingGatesPresent)") + } + + // Network gate (if present) must be preserved — only the Quota gate is + // cleared by InstanceReconciler; NetworkSchedulingGate is owned by + // WorkloadDeploymentReconciler. + for _, g := range updated.Spec.Controller.SchedulingGates { + assert.NotEqual(t, instancecontrol.QuotaSchedulingGate.String(), g.Name, + "Quota gate must not remain after granted claim") + } + }) + } +} + +// TestReconcileQuotaSingleMode verifies that in single-cell mode: +// - the project ID is decoded from the upstream-cluster-name label on the edge +// namespace (not taken from the always-"single" ClusterName) +// - the ResourceClaim is created in the in-project namespace (upstream-namespace +// label, e.g. "default"), not in the edge namespace (ns-abc123) +// - the ResourceRef points at resourcemanager.miloapis.com/Project, not Instance +func TestReconcileQuotaSingleMode(t *testing.T) { + const ( + instanceName = "my-instance" + edgeNS = "ns-abc123" // edge namespace (ns-{uid}) — does NOT exist in project CP + projectID = "datum-cloud" // decoded from "cluster-datum-cloud" + projectNS = "default" // upstream-namespace label value — where claims live + deploymentName = "my-deployment" + ) + + // Claim name is the instance-prefixed Instance name; the claim object itself + // lives in projectNS (the instance's edge namespace is carried on a label). 
+ claimName := instanceQuotaClaimNamePrefix + instanceName + + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: edgeNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: deploymentName, Namespace: edgeNS, UID: "test-uid"}, + } + + // ResourceClaim lives in projectNS ("default"), not edgeNS ("ns-abc123"). + // ResourceRef points at the Project resource, matching the ResourceRegistration's + // claimingResources (resourcemanager.miloapis.com/Project only). 
+ claim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: projectNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + // The quota client is keyed by projectID ("datum-cloud"), matching what + // projectIDForInstance returns after decoding "cluster-datum-cloud". + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). + Build() + + qm := quota.New(nil) + qm.StoreClient(projectID, quotaClient) + + const singleCluster = "single" + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + singleCluster: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: singleCluster, + // Single-cell mode: project ID decoded from upstream-cluster-name label. + // Simulates what cmd/main.go does for "cluster-datum-cloud" → "datum-cloud". + projectIDForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectID, nil + }, + // Single-cell mode: claim namespace comes from upstream-namespace label. 
+ // Simulates what cmd/main.go does by reading the edge namespace labels. + projectNamespaceForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectNS, nil + }, + // Single-cell mode: watch map func must always return "single". + clusterNameForProject: func(_ string) multicluster.ClusterName { + return singleCluster + }, + } + + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + req := mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: edgeNS, Name: instanceName}}, + ClusterName: singleCluster, + } + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: edgeNS, Name: instanceName}, &updated)) + + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be set") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status, "quota should be granted in single mode") + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Verify clusterNameForProject always returns "single" so the watch map func + // never enqueues an unknown cluster name. + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject(projectID)) + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject("any-other-project")) + + // Verify resolveProjectNamespace returns the in-project namespace, not the edge namespace. 
+ resolvedNS, resolveErr := r.resolveProjectNamespace(context.Background(), singleCluster, instance) + require.NoError(t, resolveErr) + assert.Equal(t, projectNS, resolvedNS, "claim namespace must be the in-project namespace, not the edge namespace") +} + +// TestReconcileQuotaFailureModes verifies that infrastructure failures in the +// quota path set specific QuotaGranted=False conditions (fail-closed) rather +// than silently allowing workloads to schedule. +func TestReconcileQuotaFailureModes(t *testing.T) { + const ( + testProject = "test-project" + testNS = "default" + testInstance = "my-instance" + testDeployment = "my-deployment" + ) + + makeInstance := func() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstance, + Namespace: testNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: testDeployment, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + } + + makeDeployment := func() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: testDeployment, Namespace: testNS, UID: testUIDString}, + } + } + + newReconcilerWithInterceptor := func( + t *testing.T, + funcs interceptor.Funcs, + fakeRecorder *record.FakeRecorder, + ) (*InstanceReconciler, client.Client) { + t.Helper() + s := newTestScheme(t) + + projectClient := 
fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithInterceptorFuncs(funcs). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient + } + + reconcileReq := func() mcreconcile.Request { + return mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: testNS, Name: testInstance}}, + ClusterName: testProject, + } + } + + t.Run("FM-2: backend unreachable sets QuotaBackendUnavailable", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return fmt.Errorf("connection refused") + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + // Reconcile returns error for transient failures. 
+ require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, cond.Reason) + + // Event should have been emitted. + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable) + default: + t.Error("expected a Warning event for backend unavailable, got none") + } + }) + + // FM-4/FM-5: 404 on Create maps to NamespaceNotFound when the claim namespace + // is known (the more common case for project-exists-but-namespace-absent), and + // to ProjectNotFound when the namespace itself is empty (project CP path missing). + t.Run("FM-5: 404 on Create with known namespace sets QuotaNamespaceNotFound", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + notFoundErr := apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return notFoundErr + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return notFoundErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, 
computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + // claimNamespace == testNS (non-empty) → NamespaceNotFound, not ProjectNotFound. + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound, cond.Reason, + "404 on Create with known namespace should map to NamespaceNotFound") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound) + default: + t.Error("expected a Warning event for namespace not found, got none") + } + }) + + t.Run("FM-6: 403 on Create sets QuotaMisconfigured", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + forbiddenErr := apierrors.NewForbidden( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim", + fmt.Errorf("ResourceRegistration not found")) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return forbiddenErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonMisconfigured, cond.Reason, + "403 on Create should map to Misconfigured") + + select { + case event := <-fakeRecorder.Events: + 
assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonMisconfigured) + default: + t.Error("expected a Warning event for misconfigured quota, got none") + } + }) + + t.Run("FM-7: claim pending with no budget sets QuotaNoBudget", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + claimName := instanceQuotaClaimNamePrefix + testInstance + pendingClaim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionFalse, + Reason: quotav1alpha1.ResourceClaimPendingReason, + Message: "No AllowanceBucket configured", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(pendingClaim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err, "pending-no-budget is not a transient error — no requeue needed") + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionUnknown, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNoBudget, cond.Reason, + "pending claim with no budget should use NoBudget reason, not PendingEvaluation") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNoBudget) + default: + t.Error("expected a Warning event for no budget, got none") + } + }) + + t.Run("quota disabled: quotaClientManager nil sets QuotaDisabled (not QuotaAvailable)", func(t *testing.T) { + s := newTestScheme(t) + instance := makeInstance() + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: nil, // explicitly disabled + edgeClusterName: testEdgeClusterName, + recorder: record.NewFakeRecorder(10), + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, cond.Reason, + "intentionally disabled quota should use QuotaDisabled reason") + }) + + t.Run("observedGeneration guard: stale True condition does not remove gate for new generation", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + // Instance at generation 2 with a stale QuotaGranted=True from generation 1. + instance := makeInstance() + instance.Generation = 2 + instance.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, + Message: "quota granted (generation 1)", + ObservedGeneration: 1, // stale — does not match instance.Generation=2 + LastTransitionTime: metav1.Now(), + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). 
+ WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + claimName := instanceQuotaClaimNamePrefix + testInstance + grantedClaim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + ResourceRef: quotav1alpha1.UnversionedObjectReference{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + Requests: []quotav1alpha1.ResourceRequest{{ResourceType: quotaResourceTypeInstances, Amount: 1}}, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(grantedClaim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Single reconcile: reconcileQuotaCondition writes QuotaGranted=True with + // ObservedGeneration=2 into the in-memory instance, status is persisted, + // then removeQuotaSchedulingGate reads the in-memory condition (gen=2 == + // instance.Generation=2) and removes the gate — all in one pass.
+ _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + hasGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasGate = true + } + } + assert.False(t, hasGate, "gate should be removed in the same reconcile that refreshes the condition to current generation") + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, int64(2), cond.ObservedGeneration, "condition must reflect current generation") + }) + + t.Run("FM-1: missing identity label sets ProjectIDUnresolvable and errors", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, fake.NewClientBuilder().WithScheme(s).Build()) + + // Mirrors the single-mode resolver contract: the edge namespace exists + // but was never stamped with the cluster-name identity label. 
+ identityErr := fmt.Errorf("edge namespace %q is missing label %q: %w", + testNS, downstreamclient.UpstreamOwnerClusterNameLabel, errProjectIdentityUnresolvable) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return "", identityErr + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err, "unresolvable identity must surface as an error, not a silent PendingEvaluation park") + require.ErrorIs(t, err, errProjectIdentityUnresolvable) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, cond.Reason) + assert.Contains(t, cond.Message, downstreamclient.UpstreamOwnerClusterNameLabel, + "condition message must name the missing label") + assert.Contains(t, cond.Message, testNS, + "condition message must name the edge namespace") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable) + default: + t.Error("expected a Warning event for unresolvable project identity, got none") + } + }) +} + +// TestReconcileDeletionProjectIdentity verifies the deletion-path tradeoff for +// project-identity resolution: unresolvable identity (missing namespace labels, +// a misconfiguration no retry fixes) must not wedge deletion — claim cleanup is 
+// skipped and the claim may leak until Milo GC — while transient resolution +// failures retry rather than risking an orphaned claim. +func TestReconcileDeletionProjectIdentity(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "my-instance" + ) + claimName := instanceQuotaClaimNamePrefix + instanceName + + makeDeletingInstance := func() *computev1alpha.Instance { + now := metav1.Now() + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + } + + makeClaim := func() *quotav1alpha1.ResourceClaim { + return &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName}, + ResourceRef: quotav1alpha1.UnversionedObjectReference{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName}, + Requests: []quotav1alpha1.ResourceRequest{{ResourceType: quotaResourceTypeInstances, Amount: 1}}, + }, + } + } + + newReconciler := func(t *testing.T, projectIDFn InstanceProjectIDFunc, rec record.EventRecorder) (*InstanceReconciler, client.Client, client.Client) { + t.Helper() + s := newTestScheme(t) + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeDeletingInstance()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeClaim()). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}).
+ Build() + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: rec, + projectIDForInstance: projectIDFn, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient, quotaClient + } + + req := mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + } + + t.Run("unresolvable identity: deletion proceeds, claim cleanup skipped", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + identityErr := fmt.Errorf("edge namespace %q is missing label %q: %w", + namespace, downstreamclient.UpstreamOwnerClusterNameLabel, errProjectIdentityUnresolvable) + r, projectClient, quotaClient := newReconciler(t, + func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return "", identityErr + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err, "unresolvable identity must not wedge deletion") + + // Finalizer removed; the fake client garbage collects the object once the + // last finalizer clears, so accept either a clean object or NotFound. + var updated computev1alpha.Instance + getErr := projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated) + if getErr != nil { + assert.True(t, apierrors.IsNotFound(getErr), "unexpected error getting instance after finalizer removal") + } else { + assert.NotContains(t, updated.Finalizers, instanceQuotaFinalizer) + } + + // Claim cleanup skipped — the claim leaks until Milo GC removes it. 
+ var claim quotav1alpha1.ResourceClaim + require.NoError(t, quotaClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: claimName}, &claim), + "claim must be left in place when identity is unresolvable") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, "QuotaClaimOrphaned") + default: + t.Error("expected a QuotaClaimOrphaned event, got none") + } + }) + + t.Run("transient resolution failure: reconcile errors and retries", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + r, projectClient, quotaClient := newReconciler(t, + func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return "", fmt.Errorf("connection refused") + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), req) + require.Error(t, err, "transient failures must retry rather than orphan the claim") + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + assert.Contains(t, updated.Finalizers, instanceQuotaFinalizer, + "finalizer must stay until claim cleanup succeeds") + + var claim quotav1alpha1.ResourceClaim + require.NoError(t, quotaClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: claimName}, &claim)) + + select { + case event := <-fakeRecorder.Events: + t.Errorf("no orphan event expected for a transient failure, got %q", event) + default: + } + }) +} + +// TestQuotaPendingRequeueAfter verifies the backing-off safety-net requeue used +// while an instance's quota claim is still pending: 1s for the first minute, then +// 15s, then 60s after 5m, then 300s after 10m; and no requeue once granted. 
+func TestQuotaPendingRequeueAfter(t *testing.T) { + base := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) + + // created is the instance creation time; quota elapsed is measured from it + // (NOT the condition's LastTransitionTime, which stays at the 1970 default + // while quota is pending). The condition LastTransitionTime here is + // deliberately left at the 1970 zero value to mirror that production reality. + withQuota := func(s metav1.ConditionStatus, created time.Time) *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.NewTime(created), + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{{ + Type: computev1alpha.InstanceQuotaGranted, + Status: s, + Reason: "PendingEvaluation", + }}, + }, + } + } + + tests := []struct { + name string + inst *computev1alpha.Instance + now time.Time + want time.Duration + }{ + {"granted -> no requeue", withQuota(metav1.ConditionTrue, base), base.Add(time.Hour), 0}, + {"no quota condition -> no requeue", &computev1alpha.Instance{}, base, 0}, + {"just pending -> 1s", withQuota(metav1.ConditionUnknown, base), base.Add(5 * time.Second), quotaPendingRequeueFast}, + {"59s -> 1s", withQuota(metav1.ConditionUnknown, base), base.Add(59 * time.Second), quotaPendingRequeueFast}, + {"60s boundary -> 15s", withQuota(metav1.ConditionUnknown, base), base.Add(60 * time.Second), quotaPendingRequeueMedium}, + {"3m -> 15s", withQuota(metav1.ConditionUnknown, base), base.Add(3 * time.Minute), quotaPendingRequeueMedium}, + {"5m boundary -> 60s", withQuota(metav1.ConditionUnknown, base), base.Add(5 * time.Minute), quotaPendingRequeueSlow}, + {"8m -> 60s", withQuota(metav1.ConditionUnknown, base), base.Add(8 * time.Minute), quotaPendingRequeueSlow}, + {"10m boundary -> 300s", withQuota(metav1.ConditionUnknown, base), base.Add(10 * time.Minute), quotaPendingRequeueIdle}, + {"1h -> 300s", withQuota(metav1.ConditionUnknown, base), base.Add(time.Hour), 
quotaPendingRequeueIdle}, + {"denied(False) still polls", withQuota(metav1.ConditionFalse, base), base.Add(2 * time.Minute), quotaPendingRequeueMedium}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, quotaPendingRequeueAfter(tc.inst, tc.now)) + }) + } +} From 8717dad7e781251c9a27a12e009797755a293790 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:39:34 -0500 Subject: [PATCH 07/14] feat(webhook): validation updates for federation Update Workload webhook and Instance validation so the API accepts the fields federated scheduling adds and continues to reject invalid placement and runtime specs. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/validation/instance_validation.go | 29 +++++++-- .../validation/workload_validation_test.go | 64 +++++++++++-------- internal/webhook/v1alpha/workload_webhook.go | 58 ++++------------- 3 files changed, 72 insertions(+), 79 deletions(-) diff --git a/internal/validation/instance_validation.go b/internal/validation/instance_validation.go index 7f112822..59a57585 100644 --- a/internal/validation/instance_validation.go +++ b/internal/validation/instance_validation.go @@ -17,6 +17,17 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +const ( + // diskTypePDStandard is the only currently supported disk type. + diskTypePDStandard = "pd-standard" + + // defaultImageName is the only currently supported container image. + defaultImageName = "datumcloud/ubuntu-2204-lts" + + // defaultInstanceType is the only currently supported instance type. 
+ defaultInstanceType = "datumcloud/d1-standard-2" +) + func validateInstanceTemplate( template computev1alpha.InstanceTemplateSpec, fieldPath *field.Path, @@ -97,6 +108,11 @@ func validateInstanceNetworkInterfaces( allErrs = append(allErrs, field.Invalid(networkNameField, networkInterface.Network, msg)) } + extra := make(map[string]authorizationv1.ExtraValue, len(opts.AdmissionRequest.UserInfo.Extra)) + for k, v := range opts.AdmissionRequest.UserInfo.Extra { + extra[k] = authorizationv1.ExtraValue(v) + } + review := authorizationv1.SubjectAccessReview{ Spec: authorizationv1.SubjectAccessReviewSpec{ ResourceAttributes: &authorizationv1.ResourceAttributes{ @@ -110,6 +126,7 @@ func validateInstanceNetworkInterfaces( User: opts.AdmissionRequest.UserInfo.Username, Groups: opts.AdmissionRequest.UserInfo.Groups, UID: opts.AdmissionRequest.UserInfo.UID, + Extra: extra, }, } @@ -258,8 +275,8 @@ func validateDiskVolumeSource(diskSource *computev1alpha.DiskTemplateVolumeSourc diskTemplateSpecField := diskTemplateField.Child("spec") // TODO(jrese) look up valid disk types - if diskTemplate.Spec.Type != "pd-standard" { - allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{"pd-standard"})) + if diskTemplate.Spec.Type != diskTypePDStandard { + allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{diskTypePDStandard})) } populatorResourceRequests, errs := validateDiskPopulator(diskTemplate.Spec.Populator, diskTemplateField.Child("populator")) @@ -400,8 +417,8 @@ func validateDiskPopulator(populator *computev1alpha.DiskPopulator, fieldPath *f // TODO(jreese) look up image imagePopulator := populator.Image - if imagePopulator.Name != "datumcloud/ubuntu-2204-lts" { - allErrs = append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{"datumcloud/ubuntu-2204-lts"})) + if imagePopulator.Name != defaultImageName { + allErrs = 
append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{defaultImageName})) } } } @@ -657,8 +674,8 @@ func validateInstanceRuntimeResources(resources computev1alpha.InstanceRuntimeRe allErrs := field.ErrorList{} // TODO(jreese) look up available instance types - if resources.InstanceType != "datumcloud/d1-standard-2" { - allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{"datumcloud/d1-standard-2"})) + if resources.InstanceType != defaultInstanceType { + allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{defaultInstanceType})) } if resources.Requests != nil { diff --git a/internal/validation/workload_validation_test.go b/internal/validation/workload_validation_test.go index f73e4c9f..2a0324ee 100644 --- a/internal/validation/workload_validation_test.go +++ b/internal/validation/workload_validation_test.go @@ -23,6 +23,14 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +const ( + testCPUResource = "cpu" + testVolName = "vol" + testDuplicateMountPath = "duplicate-mount-path" + testDefaultNamespace = "default" + testCityCodeDFW = "DFW" +) + func TestValidateWorkloads(t *testing.T) { scenarios := map[string]struct { workload *computev1alpha.Workload @@ -157,7 +165,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(50, resource.DecimalSI), AverageValue: resource.NewQuantity(50, resource.DecimalSI), @@ -181,7 +189,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(-1, 
resource.DecimalSI), }, @@ -202,7 +210,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageValue: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -223,7 +231,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageUtilization: proto.Int32(0), }, @@ -336,16 +344,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Gi"), @@ -369,16 +377,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ 
Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Pi"), @@ -402,16 +410,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10.5Gi"), @@ -436,7 +444,7 @@ func TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -473,7 +481,7 @@ func TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -490,11 +498,11 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].VolumeAttachments = []computev1alpha.VolumeAttachment{ { - Name: 
"duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { @@ -503,7 +511,7 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, VolumeSource: volumeSource, }, } @@ -540,7 +548,7 @@ func TestValidateWorkloads(t *testing.T) { interceptorFuncs: &interceptor.Funcs{ Create: func(ctx context.Context, client client.WithWatch, obj client.Object, opts ...client.CreateOption) error { if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok { - if sar.Spec.ResourceAttributes.Name == "default" && + if sar.Spec.ResourceAttributes.Name == testDefaultNamespace && sar.Spec.ResourceAttributes.Group == networkingv1alpha.GroupVersion.Group && sar.Spec.ResourceAttributes.Version == networkingv1alpha.GroupVersion.Version && sar.Spec.ResourceAttributes.Resource == "networks" { @@ -559,8 +567,8 @@ func TestValidateWorkloads(t *testing.T) { initObjs := []client.Object{ &networkingv1alpha.Network{ ObjectMeta: metav1.ObjectMeta{ - Namespace: "default", - Name: "default", + Namespace: testDefaultNamespace, + Name: testDefaultNamespace, }, }, } @@ -606,7 +614,7 @@ func TestValidateWorkloads(t *testing.T) { ) if len(scenario.opts.ValidCityCodes) == 0 { - scenario.opts.ValidCityCodes = []string{"DFW"} + scenario.opts.ValidCityCodes = []string{testCityCodeDFW} } t.Run(name, func(t *testing.T) { @@ -645,7 +653,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, Sandbox: &computev1alpha.SandboxRuntime{ Containers: []computev1alpha.SandboxContainer{ @@ -661,7 +669,7 @@ func MakeSandboxWorkload(name string, tweaks 
...Tweak) *computev1alpha.Workload Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, @@ -702,7 +710,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, VirtualMachine: &computev1alpha.VirtualMachineRuntime{ VolumeAttachments: []computev1alpha.VolumeAttachment{ @@ -719,10 +727,10 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Populator: &computev1alpha.DiskPopulator{ Image: &computev1alpha.ImageDiskPopulator{ - Name: "datumcloud/ubuntu-2204-lts", + Name: defaultImageName, }, }, }, @@ -736,7 +744,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, diff --git a/internal/webhook/v1alpha/workload_webhook.go b/internal/webhook/v1alpha/workload_webhook.go index e3f3735c..a8b94b38 100644 --- a/internal/webhook/v1alpha/workload_webhook.go +++ b/internal/webhook/v1alpha/workload_webhook.go @@ -6,12 +6,12 @@ import ( "fmt" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + 
"sigs.k8s.io/multicluster-runtime/pkg/multicluster" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/validation" @@ -27,8 +27,7 @@ func SetupWorkloadWebhookWithManager(mgr mcmanager.Manager) error { mgr: mgr, } - return ctrl.NewWebhookManagedBy(mgr.GetLocalManager()). - For(&computev1alpha.Workload{}). + return ctrl.NewWebhookManagedBy(mgr.GetLocalManager(), &computev1alpha.Workload{}). WithDefaulter(webhook). WithValidator(webhook). Complete() @@ -40,17 +39,11 @@ type workloadWebhook struct { mgr mcmanager.Manager } -var _ admission.CustomDefaulter = &workloadWebhook{} -var _ admission.CustomValidator = &workloadWebhook{} - -// Default implements webhook.Defaulter so a webhook will be registered for the type -func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return fmt.Errorf("unexpected type %T", obj) - } - _ = workload +var _ admission.Defaulter[*computev1alpha.Workload] = &workloadWebhook{} +var _ admission.Validator[*computev1alpha.Workload] = &workloadWebhook{} +// Default implements admission.Defaulter so a mutating webhook will be registered for the type. 
+func (r *workloadWebhook) Default(_ context.Context, _ *computev1alpha.Workload) error { // // TODO(jreese) review and test gateway defaulting / logic // if gw := workload.Spec.Gateway; gw != nil { // for i, tcpRoute := range gw.TCPRoutes { @@ -75,15 +68,10 @@ func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error // +kubebuilder:webhook:path=/validate-compute-datumapis-com-v1alpha-workload,mutating=false,failurePolicy=fail,sideEffects=None,groups=compute.datumapis.com,resources=workloads,verbs=create;update,versions=v1alpha,name=vworkload.kb.io,admissionReviewVersions=v1 -func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - +func (r *workloadWebhook) ValidateCreate(ctx context.Context, workload *computev1alpha.Workload) (admission.Warnings, error) { clusterName := computewebhook.ClusterNameFromContext(ctx) - cluster, err := r.mgr.GetCluster(ctx, clusterName) + cluster, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)) if err != nil { return nil, err } @@ -101,9 +89,9 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object // that means for the scheduling phase, since there would not currently be // sufficient context to know who created the workload and what locations // are valid candidates based on that. Maybe an annotation, or spec field? 
- var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := clusterClient.List(ctx, &locations); err != nil { - return nil, fmt.Errorf("failed to list locations: %w", err) + return nil, fmt.Errorf("failed to list location bindings: %w", err) } validCityCodes := sets.Set[string]{} @@ -123,38 +111,18 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object } if errs := validation.ValidateWorkloadCreate(workload, opts); len(errs) > 0 { - return nil, errors.NewInvalid(obj.GetObjectKind().GroupVersionKind().GroupKind(), workload.Name, errs) + return nil, errors.NewInvalid(workload.GroupVersionKind().GroupKind(), workload.Name, errs) } return nil, nil } -func (r *workloadWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - oldworkload, ok := oldObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", oldObj) - } - - _ = oldworkload - - newworkload, ok := newObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", newObj) - } - - _ = newworkload - +func (r *workloadWebhook) ValidateUpdate(_ context.Context, _, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object update. return nil, nil } -func (r *workloadWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - _ = workload - +func (r *workloadWebhook) ValidateDelete(_ context.Context, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object deletion. 
return nil, nil } From 9138353785a3e3fdbf051d743cb34e0e4936d68b Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:40:02 -0500 Subject: [PATCH 08/14] feat(cmd): cell and management-plane wiring with feature gates Wire the manager to run in either cell or management-plane mode, gating the federator, projector, and per-cell controllers behind feature flags. Add the feature-gate registry and extend configuration to carry the downstream kubeconfig and discovery settings each mode needs. Single-mode project resolution (decoding edge namespace labels into project identity) lives in the controller package as NewSingleModeProjectID/NewSingleModeProjectNamespace constructors; main.go keeps only the wiring. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/main.go | 276 +++++++++++++++++++------ internal/config/config.go | 43 ++++ internal/config/config_test.go | 66 ++++++ internal/controller/singlemode.go | 88 ++++++++ internal/controller/singlemode_test.go | 127 ++++++++++++ internal/features/features.go | 60 ++++++ internal/features/features_test.go | 43 ++++ 7 files changed, 645 insertions(+), 58 deletions(-) create mode 100644 internal/controller/singlemode.go create mode 100644 internal/controller/singlemode_test.go create mode 100644 internal/features/features.go create mode 100644 internal/features/features_test.go diff --git a/cmd/main.go b/cmd/main.go index 7d6a682d..4358a087 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -18,20 +18,27 @@ import ( "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver 
"sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcsingle "sigs.k8s.io/multicluster-runtime/providers/single" + karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" + "go.datum.net/compute/internal/features" + quotametrics "go.datum.net/compute/internal/quota" computewebhook "go.datum.net/compute/internal/webhook" computev1alphawebhooks "go.datum.net/compute/internal/webhook/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" @@ -41,6 +48,10 @@ import ( // +kubebuilder:scaffold:imports ) +// singleClusterName is the fixed cluster name that mcsingle.New registers. +// All single-mode wiring that references this cluster must use this constant. +const singleClusterName = "single" + var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") @@ -51,6 +62,11 @@ var ( gitCommit = "unknown" gitTreeState = "unknown" buildDate = "unknown" + + // federationRestConfig holds the REST config for the Karmada federation control + // plane. It is populated from --federation-kubeconfig when set, and is nil + // when the flag is omitted. 
+ federationRestConfig *rest.Config ) func init() { @@ -61,22 +77,45 @@ func init() { utilruntime.Must(computev1alpha.AddToScheme(scheme)) utilruntime.Must(networkingv1alpha.AddToScheme(scheme)) utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) + utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) + utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) // +kubebuilder:scaffold:scheme } +//nolint:gocyclo // main wires all controller paths; complexity is inherent to startup sequencing func main() { var enableLeaderElection bool var leaderElectionNamespace string var probeAddr string var serverConfigFile string + var federationKubeconfig string + var federationContext string + var enableManagementControllers bool + var enableCellControllers bool flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.") + flag.StringVar(&federationKubeconfig, "federation-kubeconfig", "", + "Path to the kubeconfig file for the Karmada federation control plane. "+ + "Required when --enable-management-controllers is set. "+ + "When omitted, federation features are disabled.") + flag.StringVar(&federationContext, "federation-context", "", + "Context to use from the federation kubeconfig. 
When omitted, the current context is used.") + flag.BoolVar(&enableManagementControllers, "enable-management-controllers", false, + "Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector).") + flag.BoolVar(&enableCellControllers, "enable-cell-controllers", false, + "Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler).") + + var featureGatesFlag string + flag.StringVar(&featureGatesFlag, "feature-gates", "", + "A set of key=value pairs that describe feature gates for the compute operator. "+ + "Example: --feature-gates=NetworkingIntegration=false. "+ + "Available features: NetworkingIntegration (default=true).") opts := zap.Options{ Development: true, @@ -87,8 +126,40 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + if featureGatesFlag != "" { + if err := features.MutableFeatureGate.Set(featureGatesFlag); err != nil { + setupLog.Error(err, "unable to parse feature gates", "feature-gates", featureGatesFlag) + os.Exit(1) + } + } + setupLog.Info("feature gates", "NetworkingIntegration", features.FeatureGate.Enabled(features.NetworkingIntegration)) + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + if federationKubeconfig != "" { + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: federationKubeconfig}, + &clientcmd.ConfigOverrides{CurrentContext: federationContext}, + ) + var err error + federationRestConfig, err = loader.ClientConfig() + if err != nil { + setupLog.Error(err, "unable to load federation kubeconfig", "path", federationKubeconfig) + os.Exit(1) + } + setupLog.Info("federation kubeconfig loaded", "path", federationKubeconfig) + } + + // Fail loud: management controllers require a federation kubeconfig. Silently + // skipping them when --enable-management-controllers is set would leave + // federation and instance projection broken with no visible signal. 
+ if enableManagementControllers && federationRestConfig == nil { + setupLog.Error(nil, + "management controllers enabled but no federation kubeconfig configured", + "hint", "set --federation-kubeconfig") + os.Exit(1) + } + setupLog.Info("starting compute", "version", version, "gitCommit", gitCommit, @@ -96,24 +167,28 @@ func main() { "buildDate", buildDate, ) - var serverConfig config.WorkloadOperator - var configData []byte - if len(serverConfigFile) > 0 { - var err error - configData, err = os.ReadFile(serverConfigFile) - if err != nil { - setupLog.Error(fmt.Errorf("unable to read server config from %q", serverConfigFile), "") - os.Exit(1) - } - } - - if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { - setupLog.Error(err, "unable to decode server config") + serverConfig, err := loadServerConfig(serverConfigFile) + if err != nil { + setupLog.Error(err, "unable to load server config") os.Exit(1) } setupLog.Info("server config", "config", serverConfig) + quotaRestConfig, err := serverConfig.Discovery.QuotaRestConfig() + if err != nil { + setupLog.Error(err, "unable to load quota REST config") + os.Exit(1) + } + if quotaRestConfig != nil { + setupLog.Info("quota REST config loaded", "path", serverConfig.Discovery.QuotaKubeconfigPath) + quotametrics.EnforcementEnabled.Set(1) + } else { + setupLog.Error(nil, "quota enforcement is DISABLED — workloads will schedule without quota accounting; "+ + "set quotaKubeconfigPath in server config to enable enforcement") + quotametrics.EnforcementEnabled.Set(0) + } + cfg := ctrl.GetConfigOrDie() deploymentCluster, err := cluster.New(cfg, func(o *cluster.Options) { @@ -124,7 +199,9 @@ func main() { os.Exit(1) } - runnables, provider, err := initializeClusterDiscovery(serverConfig, deploymentCluster, scheme) + runnables, provider, edgeClusterName, err := initializeClusterDiscovery( + serverConfig, deploymentCluster, scheme, + ) if err != nil { setupLog.Error(err, "unable to initialize 
cluster discovery") os.Exit(1) @@ -176,17 +253,62 @@ func main() { os.Exit(1) } - if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Workload") - os.Exit(1) + if enableManagementControllers { + if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Workload") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") - os.Exit(1) + + // Build a single federation client shared across all controllers that need to + // read or write to the Karmada federation control plane. This is the hub that + // the management controllers federate through and that edge cells write back to. + var federationClient client.Client + if federationRestConfig != nil { + federationClient, err = client.New(federationRestConfig, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create federation client") + os.Exit(1) + } } - if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Instance") - os.Exit(1) + + if enableCellControllers { + if err = (&controller.WorkloadDeploymentReconciler{ + NetworkingEnabled: features.FeatureGate.Enabled(features.NetworkingIntegration), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") + os.Exit(1) + } + } + + if enableCellControllers { + clusterNameForProject := func(_ string) multicluster.ClusterName { + return multicluster.ClusterName(singleClusterName) + } + instanceReconciler := &controller.InstanceReconciler{FederationClient: federationClient} + err = instanceReconciler.SetupWithManager( + mgr, + 
quotaRestConfig, + controller.NewSingleModeProjectID(mgr), + controller.NewSingleModeProjectNamespace(mgr), + edgeClusterName, + clusterNameForProject, + ) + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Instance") + os.Exit(1) + } + } + + // The fail-loud guard above ensures federationRestConfig is non-nil when + // management controllers are enabled; the nil check here is defensive. + if enableManagementControllers && federationRestConfig != nil { + extra, err := setupManagementControllers(mgr, federationClient) + if err != nil { + setupLog.Error(err, "unable to set up management controllers") + os.Exit(1) + } + runnables = append(runnables, extra...) } if serverConfig.WebhookServer != nil { @@ -219,11 +341,6 @@ func main() { }) } - setupLog.Info("starting cluster discovery provider") - g.Go(func() error { - return ignoreCanceled(provider.Run(ctx, mgr)) - }) - setupLog.Info("starting multicluster manager") g.Go(func() error { return ignoreCanceled(mgr.Start(ctx)) @@ -235,51 +352,33 @@ func main() { } } -type runnableProvider interface { - multicluster.Provider - Run(context.Context, mcmanager.Manager) error -} - -// Needed until we contribute the patch in the following PR again (need to sign CLA): -// -// See: https://github.com/kubernetes-sigs/multicluster-runtime/pull/18 -type wrappedSingleClusterProvider struct { - multicluster.Provider - cluster cluster.Cluster -} - -func (p *wrappedSingleClusterProvider) Run(ctx context.Context, mgr mcmanager.Manager) error { - if err := mgr.Engage(ctx, "single", p.cluster); err != nil { - return err - } - return p.Provider.(runnableProvider).Run(ctx, mgr) -} - func initializeClusterDiscovery( serverConfig config.WorkloadOperator, deploymentCluster cluster.Cluster, scheme *runtime.Scheme, -) (runnables []manager.Runnable, provider runnableProvider, err error) { +) (runnables []manager.Runnable, provider multicluster.Provider, edgeClusterName string, err error) { runnables = 
append(runnables, deploymentCluster) switch serverConfig.Discovery.Mode { case multiclusterproviders.ProviderSingle: - provider = &wrappedSingleClusterProvider{ - Provider: mcsingle.New("single", deploymentCluster), - cluster: deploymentCluster, + provider = mcsingle.New(multicluster.ClusterName(singleClusterName), deploymentCluster) + edgeClusterName = serverConfig.Discovery.ClusterName + if edgeClusterName == "" { + edgeClusterName = singleClusterName } case multiclusterproviders.ProviderMilo: discoveryRestConfig, err := serverConfig.Discovery.DiscoveryRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get discovery rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get discovery rest config: %w", err) } projectRestConfig, err := serverConfig.Discovery.ProjectRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get project rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get project rest config: %w", err) } discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{ + Metrics: metricsserver.Options{BindAddress: "0"}, Client: client.Options{ Cache: &client.CacheOptions{ Unstructured: true, @@ -287,7 +386,7 @@ func initializeClusterDiscovery( }, }) if err != nil { - return nil, nil, fmt.Errorf("unable to set up overall controller manager: %w", err) + return nil, nil, "", fmt.Errorf("unable to set up overall controller manager: %w", err) } provider, err = milomulticluster.New(discoveryManager, milomulticluster.Options{ @@ -300,10 +399,11 @@ func initializeClusterDiscovery( ProjectRestConfig: projectRestConfig, }) if err != nil { - return nil, nil, fmt.Errorf("unable to create datum project provider: %w", err) + return nil, nil, "", fmt.Errorf("unable to create datum project provider: %w", err) } runnables = append(runnables, discoveryManager) + edgeClusterName = serverConfig.Discovery.ClusterName // case providers.ProviderKind: // provider = mckind.New(mckind.Options{ @@ -315,13 
+415,29 @@ func initializeClusterDiscovery( // }) default: - return nil, nil, fmt.Errorf( + return nil, nil, "", fmt.Errorf( "unsupported cluster discovery mode %s", serverConfig.Discovery.Mode, ) } - return runnables, provider, nil + return runnables, provider, edgeClusterName, nil +} + +func loadServerConfig(path string) (config.WorkloadOperator, error) { + var serverConfig config.WorkloadOperator + var configData []byte + if len(path) > 0 { + var err error + configData, err = os.ReadFile(path) + if err != nil { + return serverConfig, fmt.Errorf("unable to read server config from %q: %w", path, err) + } + } + if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { + return serverConfig, fmt.Errorf("unable to decode server config: %w", err) + } + return serverConfig, nil } func ignoreCanceled(err error) error { @@ -330,3 +446,47 @@ func ignoreCanceled(err error) error { } return err } + +// setupManagementControllers wires the WorkloadDeploymentFederator and +// InstanceProjector onto mgr. It returns any additional Runnable objects that +// must be started alongside the main manager (the federation manager used by +// InstanceProjector). Called only when management controllers are enabled and +// a federation REST config is available. +func setupManagementControllers(mgr mcmanager.Manager, federationClient client.Client) ([]manager.Runnable, error) { + // The federation manager provides a cached, watchable handle to the Karmada + // federation control plane. It backs the InstanceProjector's Instance watch + // and the WorkloadDeploymentFederator's downstream WorkloadDeployment status + // watch. A manager.Manager embeds a cluster.Cluster, so it can be passed + // directly anywhere a watchable federation cluster source is required. 
+ federationMgr, err := manager.New(federationRestConfig, manager.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + if err != nil { + return nil, fmt.Errorf("federation manager: %w", err) + } + + // The federator watches both the project WD (via the multicluster manager) + // and the downstream Karmada WD (via the federation cluster) so that status + // aggregated downstream by Karmada is mirrored back to the project WD + // immediately instead of on the next informer resync. + federator := &controller.WorkloadDeploymentFederator{ + FederationClient: federationClient, + FederationCluster: federationMgr, + } + if err := federator.SetupWithManager(mgr); err != nil { + return nil, fmt.Errorf("WorkloadDeploymentFederator: %w", err) + } + + // InstanceProjector runs in the management plane, watches Instances written + // back by POP-cell operators to the Karmada federation control plane, and + // projects them into the corresponding project namespaces via the multicluster manager. + if err = (&controller.InstanceProjector{ + FederationClient: federationClient, + MCManager: mgr, + }).SetupWithManager(federationMgr); err != nil { + return nil, fmt.Errorf("InstanceProjector: %w", err) + } + + return []manager.Runnable{federationMgr}, nil +} diff --git a/internal/config/config.go b/internal/config/config.go index dddb7926..df4419b6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -229,6 +229,23 @@ type DiscoveryConfig struct { // template when connecting to project control planes. When not provided, // the operator will use the in-cluster config. ProjectKubeconfigPath string `json:"projectKubeconfigPath"` + + // ClusterName is the stable, unique name for this edge cluster. It is + // stamped onto ResourceClaim objects so that each edge controller can + // distinguish its own claims from those created by other edge controllers + // in the same project control planes. + // + // Required when Mode is "milo". 
Optional in single mode; defaults to "single". + ClusterName string `json:"clusterName"` + + // QuotaKubeconfigPath is the path to the kubeconfig file used when creating + // ResourceClaim objects against Milo project control planes. When set it + // takes precedence over ProjectKubeconfigPath for quota calls. When both are + // unset, quota accounting is disabled. + // + // Use this field in deployments (mode: single or mode: milo) that need to + // talk to api.datum.net for quota enforcement. + QuotaKubeconfigPath string `json:"quotaKubeconfigPath"` } func SetDefaults_DiscoveryConfig(obj *DiscoveryConfig) { @@ -253,6 +270,32 @@ func (c *DiscoveryConfig) ProjectRestConfig() (*rest.Config, error) { return clientcmd.BuildConfigFromFlags("", c.ProjectKubeconfigPath) } +// QuotaRestConfig returns the REST config for quota ResourceClaim management +// against Milo project control planes. QuotaKubeconfigPath is preferred; if +// unset, ProjectKubeconfigPath is used as a fallback. +// +// Returns (nil, nil) when no credential path is configured at all — this is +// the intentional opt-out case and the caller should disable quota enforcement. +// +// Returns (nil, error) when a credential path IS configured but the file does +// not exist on disk. This is a misconfiguration (Secret not mounted, wrong +// path) that must not silently disable enforcement; callers should treat this +// as a fatal startup error. 
+func (c *DiscoveryConfig) QuotaRestConfig() (*rest.Config, error) { + path := c.QuotaKubeconfigPath + if path == "" { + path = c.ProjectKubeconfigPath + } + if path == "" { + return nil, nil + } + if _, err := os.Stat(path); os.IsNotExist(err) { + return nil, fmt.Errorf("quota kubeconfig path %q is configured but file does not exist: "+ + "ensure the quota credential Secret is mounted correctly", path) + } + return clientcmd.BuildConfigFromFlags("", path) +} + func init() { SchemeBuilder.Register(&WorkloadOperator{}) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 5f586932..5a7a3cee 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -1,6 +1,8 @@ package config import ( + "os" + "path/filepath" "testing" "k8s.io/apimachinery/pkg/runtime" @@ -56,3 +58,67 @@ webhookServer: t.Error("TLS.CertDir was not defaulted") } } + +// TestQuotaRestConfig_NilWhenNoPath verifies that omitting quotaKubeconfigPath +// returns (nil, nil) — the intentional opt-out / enforcement-disabled case. +func TestQuotaRestConfig_NilWhenNoPath(t *testing.T) { + cfg := &DiscoveryConfig{} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() = non-nil, want nil (no path configured)") + } +} + +// TestQuotaRestConfig_ErrorWhenPathMissing verifies that explicitly setting a +// kubeconfig path that does not exist on disk returns a non-nil error (fail-loud). 
+func TestQuotaRestConfig_ErrorWhenPathMissing(t *testing.T) { + cfg := &DiscoveryConfig{ + QuotaKubeconfigPath: "/nonexistent/path/quota.kubeconfig", + } + restCfg, err := cfg.QuotaRestConfig() + if err == nil { + t.Fatal("QuotaRestConfig() error = nil, want non-nil error when path is configured but file absent") + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() returned non-nil config alongside error") + } +} + +// TestQuotaRestConfig_SuccessWhenFileExists verifies that a configured path +// pointing to an existing (though minimal) kubeconfig file succeeds. +func TestQuotaRestConfig_SuccessWhenFileExists(t *testing.T) { + // Write a minimal kubeconfig that clientcmd can parse. + dir := t.TempDir() + kubeconfigPath := filepath.Join(dir, "quota.kubeconfig") + minimalKubeconfig := []byte(`apiVersion: v1 +kind: Config +clusters: +- cluster: + server: https://localhost:1234 + name: test +contexts: +- context: + cluster: test + user: test + name: test +current-context: test +users: +- name: test + user: {} +`) + if err := os.WriteFile(kubeconfigPath, minimalKubeconfig, 0600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + cfg := &DiscoveryConfig{QuotaKubeconfigPath: kubeconfigPath} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg == nil { + t.Error("QuotaRestConfig() = nil, want non-nil when file exists") + } +} diff --git a/internal/controller/singlemode.go b/internal/controller/singlemode.go new file mode 100644 index 00000000..46a2aa95 --- /dev/null +++ b/internal/controller/singlemode.go @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + 
"go.miloapis.com/milo/pkg/downstreamclient" +) + +// NewSingleModeProjectID returns an InstanceProjectIDFunc for single-cell mode. +// It reads the upstream-cluster-name label on the edge namespace (e.g. +// "cluster-datum-cloud") and decodes it to the project ID ("datum-cloud"). +// This is the inverse of the "cluster-" encoding used by NSO's +// MappedNamespaceResourceStrategy when stamping cluster-scoped namespace labels. +// The label is stamped atomically at namespace creation, before any Instance +// can exist in the namespace, so an absent label is misconfiguration: the +// returned error wraps errProjectIdentityUnresolvable and names the namespace +// and the missing label. Transient API failures return ordinary errors +// (requeue with backoff). +func NewSingleModeProjectID(mgr mcmanager.Manager) InstanceProjectIDFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + encoded := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encoded == "" { + return "", fmt.Errorf("edge namespace %q is missing label %q: %w", + inst.Namespace, downstreamclient.UpstreamOwnerClusterNameLabel, errProjectIdentityUnresolvable) + } + return DecodeClusterName(encoded), nil + } +} + +// NewSingleModeProjectNamespace returns an InstanceProjectNamespaceFunc for +// single-cell mode. It reads the upstream-namespace label on the edge namespace +// (e.g. "ns-efdf8ca1-...") to find the in-project namespace ("default") where +// ResourceClaims must be created in the project control plane. +// The label is stamped atomically at namespace creation, before any Instance +// can exist in the namespace, so an absent label is misconfiguration: the +// returned error wraps errProjectIdentityUnresolvable and names the namespace +// and the missing label. 
Transient API failures return ordinary errors +// (requeue with backoff). +func NewSingleModeProjectNamespace(mgr mcmanager.Manager) InstanceProjectNamespaceFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + projectNS := ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if projectNS == "" { + return "", fmt.Errorf("edge namespace %q is missing label %q: %w", + inst.Namespace, downstreamclient.UpstreamOwnerNamespaceLabel, errProjectIdentityUnresolvable) + } + return projectNS, nil + } +} + +// readEdgeNamespace reads the edge namespace object via the uncached APIReader +// (no informer started, no cache sync required) with a short deadline. +// Returns a transient error on API failures so callers can requeue with backoff. +func readEdgeNamespace( + ctx context.Context, + mgr mcmanager.Manager, + clusterName multicluster.ClusterName, + namespace string, +) (corev1.Namespace, error) { + cl, err := mgr.GetCluster(ctx, clusterName) + if err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: getting cluster %q: %w", clusterName, err) + } + var ns corev1.Namespace + getCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + if err := cl.GetAPIReader().Get(getCtx, client.ObjectKey{Name: namespace}, &ns); err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: reading namespace %q: %w", namespace, err) + } + return ns, nil +} diff --git a/internal/controller/singlemode_test.go b/internal/controller/singlemode_test.go new file mode 100644 index 00000000..99b94f6b --- /dev/null +++ b/internal/controller/singlemode_test.go @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 
"k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +const ( + // smTestEdgeNS mirrors the ns-{uid} edge namespaces NSO creates. + smTestEdgeNS = "ns-efdf8ca1-7b6e-4a30-9b1c-0d6f55555555" + // smTestEncodedCluster mirrors the "cluster-" encoding stamped by + // NSO's MappedNamespaceResourceStrategy. + smTestEncodedCluster = "cluster-datum-cloud" + smTestProjectID = "datum-cloud" + smTestProjectNS = "default" + smTestCluster = "single" +) + +// smEdgeNamespace builds an edge namespace shaped like production: both +// identity labels are stamped together at creation. Passing nil labels models +// convention drift where the stamping never happened. +func smEdgeNamespace(labels map[string]string) *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: smTestEdgeNS, + Labels: labels, + }, + } +} + +func smInstance() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-instance", + Namespace: smTestEdgeNS, + }, + } +} + +func TestNewSingleModeProjectID(t *testing.T) { + t.Run("label present: decodes cluster- to the project ID", func(t *testing.T) { + ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: smTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: smTestProjectNS, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + projectID, err := NewSingleModeProjectID(mgr)(context.Background(), smTestCluster, smInstance()) + require.NoError(t, err) + assert.Equal(t, smTestProjectID, projectID) + }) + + t.Run("label absent: returns errProjectIdentityUnresolvable naming the label", func(t *testing.T) { + // Only the namespace label is present — the cluster-name label was never + // stamped (convention drift, not a propagation race). 
+ ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: smTestProjectNS, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + _, err := NewSingleModeProjectID(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.ErrorIs(t, err, errProjectIdentityUnresolvable) + assert.Contains(t, err.Error(), smTestEdgeNS, + "error must name the edge namespace") + assert.Contains(t, err.Error(), downstreamclient.UpstreamOwnerClusterNameLabel, + "error must name the missing label") + }) + + t.Run("namespace read failure: transient error, not the sentinel", func(t *testing.T) { + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient())) + + _, err := NewSingleModeProjectID(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.False(t, errors.Is(err, errProjectIdentityUnresolvable), + "a failed namespace read is transient and must not be classified as unresolvable identity") + }) +} + +func TestNewSingleModeProjectNamespace(t *testing.T) { + t.Run("label present: returns the in-project namespace", func(t *testing.T) { + ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: smTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: smTestProjectNS, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + projectNS, err := NewSingleModeProjectNamespace(mgr)(context.Background(), smTestCluster, smInstance()) + require.NoError(t, err) + assert.Equal(t, smTestProjectNS, projectNS) + }) + + t.Run("label absent: returns errProjectIdentityUnresolvable naming the label", func(t *testing.T) { + ns := smEdgeNamespace(map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: smTestEncodedCluster, + }) + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient(ns))) + + _, err := 
NewSingleModeProjectNamespace(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.ErrorIs(t, err, errProjectIdentityUnresolvable) + assert.Contains(t, err.Error(), smTestEdgeNS, + "error must name the edge namespace") + assert.Contains(t, err.Error(), downstreamclient.UpstreamOwnerNamespaceLabel, + "error must name the missing label") + }) + + t.Run("namespace read failure: transient error, not the sentinel", func(t *testing.T) { + mgr := newFakeMCManager(smTestCluster, newFakeCluster(newProjectFakeClient())) + + _, err := NewSingleModeProjectNamespace(mgr)(context.Background(), smTestCluster, smInstance()) + require.Error(t, err) + assert.False(t, errors.Is(err, errProjectIdentityUnresolvable), + "a failed namespace read is transient and must not be classified as unresolvable identity") + }) +} diff --git a/internal/features/features.go b/internal/features/features.go new file mode 100644 index 00000000..c44de349 --- /dev/null +++ b/internal/features/features.go @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +// Package features defines the feature gates for the compute operator. Feature +// gates follow the Kubernetes component-base convention: each feature is +// declared as a Feature constant, registered with a FeatureSpec that includes +// its default enablement state, and toggled at runtime via the --feature-gates +// flag exposed by the binary. +// +// cmd/main.go defines the --feature-gates string flag itself and applies its +// value with: +// +// features.MutableFeatureGate.Set(featureGatesFlag) +// +// Enablement is read through the read-only view: +// +// if features.FeatureGate.Enabled(features.NetworkingIntegration) { ... 
} +package features + +import ( + "k8s.io/component-base/featuregate" +) + +const ( + // NetworkingIntegration controls whether the compute operator integrates with + // the network-services-operator (VPC) for NetworkBinding provisioning and the + // Network scheduling gate on Instances. + // + // When disabled: + // - No NetworkBinding objects are created. + // - The Network scheduling gate is not added to newly created Instances. + // - Any existing Network scheduling gate is actively removed. + // - The networking step is treated as immediately ready so Instances + // proceed to the runtime without a NetworkBinding. + // + // This flag exists so operators can run compute on edge/lab cells where + // VPC/NSO is not yet functional. The default is true (enabled) so that + // existing production deployments are unaffected. + // + // alpha: v0.1 + NetworkingIntegration featuregate.Feature = "NetworkingIntegration" +) + +// MutableFeatureGate is the mutable feature gate for the compute operator. +// cmd/main.go applies the --feature-gates flag value via MutableFeatureGate.Set +// at startup. Enablement should be read from FeatureGate (the read-only view) +// after startup. +var MutableFeatureGate featuregate.MutableFeatureGate = featuregate.NewFeatureGate() + +// FeatureGate is the read-only view of the compute operator feature gate. +// Use this for enablement checks rather than MutableFeatureGate to avoid +// accidental mutations after startup. 
+var FeatureGate featuregate.FeatureGate = MutableFeatureGate + +func init() { + if err := MutableFeatureGate.Add(map[featuregate.Feature]featuregate.FeatureSpec{ + NetworkingIntegration: {Default: true, PreRelease: featuregate.Alpha}, + }); err != nil { + panic(err) + } +} diff --git a/internal/features/features_test.go b/internal/features/features_test.go new file mode 100644 index 00000000..61687064 --- /dev/null +++ b/internal/features/features_test.go @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package features + +import ( + "testing" +) + +// TestNetworkingIntegration_DefaultEnabled verifies that the NetworkingIntegration +// feature gate defaults to enabled so that existing production deployments are +// unaffected when the flag is not set. +func TestNetworkingIntegration_DefaultEnabled(t *testing.T) { + // Work on a copy so the test never touches the shared gate; the copy starts from the global gate's current state. + gate := MutableFeatureGate.DeepCopy() + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration default = false, want true") + } +} + +// TestNetworkingIntegration_CanBeDisabled verifies that setting +// NetworkingIntegration=false via the feature gate string disables the +// integration, allowing operators to run compute without VPC/NSO. +func TestNetworkingIntegration_CanBeDisabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=false"); err != nil { + t.Fatalf("Set(NetworkingIntegration=false): %v", err) + } + if gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = true after Set=false, want false") + } +} + +// TestNetworkingIntegration_ExplicitlyEnabled verifies that the gate can be +// explicitly set to true (round-trip).
+func TestNetworkingIntegration_ExplicitlyEnabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=true"); err != nil { + t.Fatalf("Set(NetworkingIntegration=true): %v", err) + } + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = false after Set=true, want true") + } +} From 94d4d9537df43bc5a4f6156d40c9994f06515ad0 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 3 Jun 2026 19:03:49 -0400 Subject: [PATCH 09/14] fix(controller): roll instances by recreate so restart actually rolls them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A template-hash change (an image update, or a restartedAt annotation from `datumctl compute restart`) previously resolved to an in-place Update of the Instance. The unikraft provider bakes the pod at creation time and never recomputes an existing pod's spec, so the in-place update silently failed to roll the running workload — instances kept their old pod. Emit a delete (recreate) for drifted Ready instances instead. The next reconcile refills the slot via the create path with the new template, and the provider's finalizer-gated teardown plus create-on-new-Instance roll the pod with no provider changes. Ordered one-at-a-time pacing is preserved by the existing descending-ordinal sort, skip-all-but-first, and the DeletionTimestamp WaitAction. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../stateful/stateful_control.go | 54 ++++++----- .../stateful/stateful_control_test.go | 90 +++++++++++-------- 2 files changed, 83 insertions(+), 61 deletions(-) diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index de79bd2d..34e5966e 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -53,8 +53,10 @@ func (c *statefulControl) GetActions( var createActions []instancecontrol.Action var waitActions []instancecontrol.Action - // highest -> lowest - var updateActions []instancecontrol.Action + // highest -> lowest. Instances whose template hash has drifted from the + // desired template are deleted and recreated (not updated in place) so the + // change actually rolls the backing pod — see the recreate branch below. + var recreateActions []instancecontrol.Action // highest -> lowest var deleteActions []instancecontrol.Action @@ -129,24 +131,31 @@ func (c *statefulControl) GetActions( if !apimeta.IsStatusConditionTrue(instance.Status.Conditions, v1alpha.InstanceReady) { waitActions = append(waitActions, instancecontrol.NewWaitAction(instance)) } else if needsUpdate(instance, instanceTemplateHash) { - updatedInstance := instance.DeepCopy() - updatedInstance.Annotations = deployment.Spec.Template.Annotations - updatedInstance.Labels = deployment.Spec.Template.Labels - - addInstanceControllerLabels(updatedInstance, getInstanceOrdinal(updatedInstance.Name), deployment) - - updatedInstance.Spec = deployment.Spec.Template.Spec - updateActions = append(updateActions, instancecontrol.NewUpdateAction(updatedInstance)) + // The instance's template hash no longer matches the desired + // template — e.g. an image change, or a restart requested via the + // RestartedAtAnnotation, which is part of the template hash. 
The + // unikraft provider bakes the pod's runtime, rootfs, and file + // mounts at pod-creation time and never reconciles an existing + // pod's spec, so an in-place Instance update would silently fail to + // roll the running workload. Delete the instance instead; the next + // reconcile recreates it from the current template via the create + // path above, and the provider tears down the old pod + // (finalizer-gated) and boots a fresh one. Ordered, one-at-a-time + // pacing is preserved by the descending-ordinal sort, the + // skip-all-but-first logic, and the DeletionTimestamp WaitAction. + recreateActions = append(recreateActions, instancecontrol.NewDeleteAction(instance)) } } } - // Backfill controller-managed labels on every existing instance, regardless - // of Ready state or template hash. This ensures newly-introduced labels - // (e.g. city-code, workload-name) are applied to pre-existing instances that - // were never touched by a rolling update. The patch is metadata-only and is - // emitted outside the ordered rollout decision so it never gates or reorders - // instance creation/updates. + // Converge controller-managed labels on every existing instance, regardless + // of Ready state or template hash. Labels are stamped only at instance + // creation and rollout is recreate-only, so when the label schema evolves — + // a label is added or its value derivation changes — this pass is the only + // mechanism that updates live instances; without it, any instance alive at + // the time of the change would never receive it. The patch is metadata-only + // and is emitted outside the ordered rollout decision so it never gates or + // reorders instance creation/updates. 
var patchLabelActions []instancecontrol.Action for _, instance := range desiredInstances { if instance.CreationTimestamp.IsZero() || !instance.DeletionTimestamp.IsZero() { @@ -168,10 +177,10 @@ func (c *statefulControl) GetActions( } } - slices.SortFunc(updateActions, descendingOrdinal) + slices.SortFunc(recreateActions, descendingOrdinal) slices.SortFunc(deleteActions, descendingOrdinal) - actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)+len(patchLabelActions)) + actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(recreateActions)+len(deleteActions)+len(patchLabelActions)) switch deployment.Spec.ScaleSettings.InstanceManagementPolicy { case v1alpha.OrderedReadyInstanceManagementPolicyType: @@ -186,7 +195,7 @@ func (c *statefulControl) GetActions( slices.SortFunc(actions, ascendingOrdinal) - actions = append(actions, updateActions...) + actions = append(actions, recreateActions...) actions = append(actions, deleteActions...) // Skip all actions except the first one. @@ -214,17 +223,14 @@ func addInstanceControllerLabels(instance *v1alpha.Instance, index int, deployme } // desiredControllerLabels returns the full set of controller-managed labels -// that every instance should carry. Used both when stamping a new/updated -// instance and when checking whether an existing instance needs a backfill -// patch. +// that every instance should carry. Used both when stamping a new instance +// and when checking whether an existing instance needs a backfill patch. func desiredControllerLabels(index int, deployment *v1alpha.WorkloadDeployment) map[string]string { return map[string]string{ v1alpha.InstanceIndexLabel: strconv.Itoa(index), v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), // Self-describing labels for routing, filtering, and observability. 
- // Backfilled on every reconcile so they stay accurate even for instances - // that pre-date the labels or that were not reached by a rolling update. v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), v1alpha.CityCodeLabel: deployment.Spec.CityCode, v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, diff --git a/internal/controller/instancecontrol/stateful/stateful_control_test.go b/internal/controller/instancecontrol/stateful/stateful_control_test.go index 229a224b..d9133efa 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control_test.go +++ b/internal/controller/instancecontrol/stateful/stateful_control_test.go @@ -49,6 +49,11 @@ func TestFreshDeployment(t *testing.T) { assert.True(t, actions[1].IsSkipped()) } +// TestUpdateWithAllReadyInstances verifies that a template change on Ready +// instances rolls them by delete+recreate (not an in-place update), ordered +// highest-ordinal-first with only the first action active. An in-place update +// would never roll the backing pod, since the unikraft provider bakes the pod +// at creation time and ignores spec changes on an existing pod. 
func TestUpdateWithAllReadyInstances(t *testing.T) { ctx := context.Background() control := New() @@ -67,11 +72,11 @@ func TestUpdateWithAllReadyInstances(t *testing.T) { assert.Len(t, actions, 2) assert.Equal(t, "test-deploy-1", actions[0].Object.GetName()) - assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[0].ActionType()) + assert.Equal(t, instancecontrol.ActionTypeDelete, actions[0].ActionType()) assert.False(t, actions[0].IsSkipped()) assert.Equal(t, "test-deploy-0", actions[1].Object.GetName()) - assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[1].ActionType()) + assert.Equal(t, instancecontrol.ActionTypeDelete, actions[1].ActionType()) assert.True(t, actions[1].IsSkipped()) } @@ -244,38 +249,48 @@ func TestInstanceLabels_FourNewLabelsStamped(t *testing.T) { "PlacementNameLabel must equal deployment.Spec.PlacementName") } -// TestInstanceLabels_PropagatedOnUpdate verifies that when an existing instance -// is updated (rolling update path), the four new labels are refreshed from the -// deployment so they remain accurate after spec changes. -func TestInstanceLabels_PropagatedOnUpdate(t *testing.T) { +// TestInstanceLabels_RefreshedOnRecreate verifies that when a template change +// rolls an instance, the recreated instance carries the four self-describing +// labels sourced from the WorkloadDeployment. A template change deletes the +// drifted instance and recreates it via the create path on the following +// reconcile, which stamps the labels. +func TestInstanceLabels_RefreshedOnRecreate(t *testing.T) { ctx := context.Background() control := New() deployment := getWorkloadDeployment("test-labels-update", 1) - // Build a ready existing instance. + // A ready existing instance on the old template hash. currentInstances := []v1alpha.Instance{*getInstanceForDeployment(deployment, 0)} - // Trigger a rolling update by changing the image. + // Trigger a roll by changing the image. 
deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "updated-image" + // First reconcile: the drifted instance is deleted (recreate), not updated. actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeDelete, actions[0].ActionType()) + assert.Equal(t, "test-labels-update-0", actions[0].Object.GetName()) + // Next reconcile, after the old instance has been fully deleted and is gone: + // the empty slot is refilled by the create path, which stamps the labels. + actions, err = control.GetActions(ctx, scheme, deployment, nil) assert.NoError(t, err) assert.Len(t, actions, 1) - assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[0].ActionType()) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) instance, ok := actions[0].Object.(*v1alpha.Instance) assert.True(t, ok) assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel], - "WorkloadDeploymentNameLabel must be refreshed on update") + "WorkloadDeploymentNameLabel must be set on the recreated instance") assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel], - "CityCodeLabel must be refreshed on update") + "CityCodeLabel must be set on the recreated instance") assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel], - "WorkloadNameLabel must be refreshed on update") + "WorkloadNameLabel must be set on the recreated instance") assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel], - "PlacementNameLabel must be refreshed on update") + "PlacementNameLabel must be set on the recreated instance") } // TestInstanceLocation_SetWhenDeploymentStatusLocationPresent verifies that when @@ -331,7 +346,7 @@ func TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent(t *testing.T) { // TestLabelBackfill_NotReadyMatchingHash verifies 
that a not-Ready instance // with an unchanged template hash receives a PatchLabels action when it is -// missing controller-managed labels. The action must not be a rollout Update, +// missing controller-managed labels. The action must not be a rollout recreate, // must not alter spec/template, and must not block subsequent instances. func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { ctx := context.Background() @@ -361,15 +376,15 @@ func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { assert.NoError(t, err) // Collect actions by type. - var waitActions, createActions, updateActions, patchActions []instancecontrol.Action + var waitActions, createActions, recreateActions, patchActions []instancecontrol.Action for _, a := range actions { switch a.ActionType() { case instancecontrol.ActionTypeWait: waitActions = append(waitActions, a) case instancecontrol.ActionTypeCreate: createActions = append(createActions, a) - case instancecontrol.ActionTypeUpdate: - updateActions = append(updateActions, a) + case instancecontrol.ActionTypeDelete: + recreateActions = append(recreateActions, a) case instancecontrol.ActionTypePatchLabels: patchActions = append(patchActions, a) } @@ -383,8 +398,8 @@ func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { assert.Len(t, createActions, 1, "instance-1 create action must be present") assert.True(t, createActions[0].IsSkipped(), "create for instance-1 must be skipped while instance-0 is waiting") - // No template Update actions must be produced. - assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash instance") + // No rollout recreate actions must be produced. + assert.Empty(t, recreateActions, "no rollout recreate must be produced for a matching-hash instance") // A PatchLabels action must be produced for instance-0. 
assert.Len(t, patchActions, 1, "exactly one PatchLabels action for the label-drifted instance") @@ -439,7 +454,7 @@ func TestLabelBackfill_Idempotent(t *testing.T) { // TestLabelBackfill_ReadyInstanceCorrected verifies that a Ready instance with // correct template hash but drifted labels receives a PatchLabels action -// without triggering a template rollout Update. +// without triggering a rollout recreate. func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { ctx := context.Background() control := New() @@ -456,18 +471,18 @@ func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { assert.NoError(t, err) - var updateActions, patchActions []instancecontrol.Action + var recreateActions, patchActions []instancecontrol.Action for _, a := range actions { switch a.ActionType() { - case instancecontrol.ActionTypeUpdate: - updateActions = append(updateActions, a) + case instancecontrol.ActionTypeDelete: + recreateActions = append(recreateActions, a) case instancecontrol.ActionTypePatchLabels: patchActions = append(patchActions, a) } } - // No template Update must be produced — template hash matches. - assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash ready instance") + // No rollout recreate must be produced — template hash matches. + assert.Empty(t, recreateActions, "no rollout recreate must be produced for a matching-hash ready instance") // A PatchLabels action must be produced. assert.Len(t, patchActions, 1, "PatchLabels action must be produced for the label-drifted ready instance") @@ -478,8 +493,9 @@ func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { } // TestLabelBackfill_DoesNotAffectRollingUpdate verifies that a genuine template -// change on a Ready instance still produces a normal ordered Update action and -// that the PatchLabels path does not interfere with or duplicate it. 
+// change on a Ready instance still produces the normal ordered roll (a recreate +// Delete per instance) and that the PatchLabels path does not interfere with or +// duplicate it. func TestLabelBackfill_DoesNotAffectRollingUpdate(t *testing.T) { ctx := context.Background() control := New() @@ -516,23 +532,23 @@ func TestLabelBackfill_DoesNotAffectRollingUpdate(t *testing.T) { assert.NoError(t, err) - var updateActions, patchActions []instancecontrol.Action + var recreateActions, patchActions []instancecontrol.Action for _, a := range actions { switch a.ActionType() { - case instancecontrol.ActionTypeUpdate: - updateActions = append(updateActions, a) + case instancecontrol.ActionTypeDelete: + recreateActions = append(recreateActions, a) case instancecontrol.ActionTypePatchLabels: patchActions = append(patchActions, a) } } - // Two Update actions expected (one per instance), ordered highest-to-lowest. - assert.Len(t, updateActions, 2, "both instances must produce Update actions on template change") - assert.Equal(t, "test-backfill-rolling-1", updateActions[0].Object.GetName(), - "Update actions must be ordered highest ordinal first") - assert.Equal(t, "test-backfill-rolling-0", updateActions[1].Object.GetName()) - assert.False(t, updateActions[0].IsSkipped(), "first Update must be active") - assert.True(t, updateActions[1].IsSkipped(), "second Update must be skipped (ordered rollout)") + // Two recreate (Delete) actions expected (one per instance), ordered highest-to-lowest. 
+ assert.Len(t, recreateActions, 2, "both instances must produce recreate actions on template change") + assert.Equal(t, "test-backfill-rolling-1", recreateActions[0].Object.GetName(), + "recreate actions must be ordered highest ordinal first") + assert.Equal(t, "test-backfill-rolling-0", recreateActions[1].Object.GetName()) + assert.False(t, recreateActions[0].IsSkipped(), "first recreate must be active") + assert.True(t, recreateActions[1].IsSkipped(), "second recreate must be skipped (ordered rollout)") // No PatchLabels — all labels are already correct. assert.Empty(t, patchActions, "no PatchLabels when all labels are already correct") From 2b33fb5d657254b3b9c986c7a8e7a6bda40d3b9e Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 3 Jun 2026 19:48:57 -0400 Subject: [PATCH 10/14] feat(controller): surface rollout progress via UpdatedReplicas + ObservedGeneration A restart/rolling update was invisible from the project plane: there was no status field representing how many instances are on the new template revision. Add UpdatedReplicas (instances whose observed template hash matches the desired template, regardless of readiness) and ObservedGeneration to both WorkloadDeployment and Workload (plus placement) status. UpdatedReplicas is computed on the cell WD reconcile alongside CurrentReplicas (which is now its Programmed subset), aggregated up into the Workload, and rides the existing status sync to the project plane. Repoint the "Up-to-date" printcolumn to .status.updatedReplicas to match `kubectl get deployment` semantics, so a roll is visible as the count dips below Replicas and recovers. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/workload_controller.go | 7 +++++ .../workloaddeployment_controller.go | 30 ++++++++++++++----- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/internal/controller/workload_controller.go b/internal/controller/workload_controller.go index 6ca92e03..34f55def 100644 --- a/internal/controller/workload_controller.go +++ b/internal/controller/workload_controller.go @@ -220,6 +220,7 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( newWorkloadStatus := workload.Status.DeepCopy() totalReplicas := int32(0) totalCurrentReplicas := int32(0) + totalUpdatedReplicas := int32(0) totalDesiredReplicas := int32(0) totalReadyReplicas := int32(0) totalDeployments := int32(0) @@ -251,12 +252,14 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( foundAvailableDeployment := false replicas := int32(0) currentReplicas := int32(0) + updatedReplicas := int32(0) desiredReplicas := int32(0) readyReplicas := int32(0) totalDeployments += int32(len(placementDeployments)) for _, deployment := range placementDeployments { replicas += deployment.Status.Replicas currentReplicas += deployment.Status.CurrentReplicas + updatedReplicas += deployment.Status.UpdatedReplicas desiredReplicas += deployment.Status.DesiredReplicas readyReplicas += deployment.Status.ReadyReplicas @@ -266,11 +269,13 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( } totalReplicas += replicas totalCurrentReplicas += currentReplicas + totalUpdatedReplicas += updatedReplicas totalDesiredReplicas += desiredReplicas totalReadyReplicas += readyReplicas placementStatus.Replicas = replicas placementStatus.CurrentReplicas = currentReplicas + placementStatus.UpdatedReplicas = updatedReplicas placementStatus.DesiredReplicas = desiredReplicas placementStatus.ReadyReplicas = readyReplicas @@ -304,8 +309,10 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( newWorkloadStatus.Deployments = totalDeployments newWorkloadStatus.Replicas = 
totalReplicas newWorkloadStatus.CurrentReplicas = totalCurrentReplicas + newWorkloadStatus.UpdatedReplicas = totalUpdatedReplicas newWorkloadStatus.DesiredReplicas = totalDesiredReplicas newWorkloadStatus.ReadyReplicas = totalReadyReplicas + newWorkloadStatus.ObservedGeneration = workload.Generation if equality.Semantic.DeepEqual(workload.Status, newWorkloadStatus) { return nil diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 76216cb2..f810e53f 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -171,15 +171,17 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco desiredReplicas = 0 } - currentReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady) + currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady) if err != nil { return ctrl.Result{}, err } deployment.Status.Replicas = int32(replicas) deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.UpdatedReplicas = int32(updatedReplicas) deployment.Status.DesiredReplicas = desiredReplicas deployment.Status.ReadyReplicas = int32(readyReplicas) + deployment.Status.ObservedGeneration = deployment.Generation if quotaBlockedReplicas > 0 { apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ @@ -246,7 +248,7 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( deployment *computev1alpha.WorkloadDeployment, instances []computev1alpha.Instance, networkReady bool, -) (currentReplicas, readyReplicas, quotaBlockedReplicas int, err error) { +) (currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas int, err error) { templateHash := 
instancecontrol.ComputeHash(deployment.Spec.Template) for _, instance := range instances { if apimeta.IsStatusConditionPresentAndEqual(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted, metav1.ConditionFalse) { @@ -265,22 +267,34 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( instance.Spec.Controller.SchedulingGates = newGates return nil }); patchErr != nil { - return 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr) + return 0, 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr) } } } - if apimeta.IsStatusConditionTrue(instance.Status.Conditions, computev1alpha.InstanceProgrammed) { - if instance.Status.Controller.ObservedTemplateHash == templateHash { - currentReplicas++ - } + // An instance is "updated" once it has observed the desired template + // revision, regardless of readiness. Counting these (even before they are + // Programmed) makes a rolling update / restart observable: UpdatedReplicas + // dips below Replicas while the recreated instance comes up, then recovers. + // Status.Controller is a pointer the infra provider may not have populated + // yet; guard the deref to avoid a panic that would abort the reconcile. + onLatestRevision := instance.Status.Controller != nil && + instance.Status.Controller.ObservedTemplateHash == templateHash + if onLatestRevision { + updatedReplicas++ + } + + // CurrentReplicas is the Programmed subset of UpdatedReplicas — updated + // instances that are ready to serve. 
+ if onLatestRevision && apimeta.IsStatusConditionTrue(instance.Status.Conditions, computev1alpha.InstanceProgrammed) { + currentReplicas++ } if apimeta.IsStatusConditionTrue(instance.Status.Conditions, computev1alpha.InstanceReady) { readyReplicas++ } } - return currentReplicas, readyReplicas, quotaBlockedReplicas, nil + return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, nil } // reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all From 98190b09fba858b0c296b6ead21f1eee7ee17f60 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:40:50 -0500 Subject: [PATCH 11/14] feat: surface instance blocking reasons and claim instanceType vCPU/memory Two Instance-controller correctness changes: - Blocking-reason rollup: surface the most specific provider sub-condition (ImageUnavailable, InstanceCrashing, ConfigurationError, Provisioning) and its message onto the Instance Ready condition instead of a generic "Instance has not been programmed", so e.g. an image-pull failure reads as ImageUnavailable with the real message. Ranks the API reason constants in the blocking-reason priority. - Quota sizing: resolve vCPU/memory for instanceType-sized instances from a new instanceTypeCatalog (datumcloud/d1-standard-2 = 1 vCPU / 2 GiB) so the quota ResourceClaim requests vcpus + memory, not just instance count. Explicit container limits / instance requests still take precedence. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/instance_controller.go | 203 +++++- .../controller/instance_controller_test.go | 599 ++++++++++++++++++ 2 files changed, 782 insertions(+), 20 deletions(-) diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index 5a4f53f8..2ea23780 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -90,6 +90,34 @@ const ( reasonNetworkFailedToCreate = "NetworkFailedToCreate" ) +// instanceTypeD1Standard2 is the platform instance type name for the +// 1 vCPU / 2 GiB size used as the catalog baseline for quota accounting. +const instanceTypeD1Standard2 = "datumcloud/d1-standard-2" + +// instanceTypeResources holds the vCPU and memory for a named instance type. +type instanceTypeResources struct { + // CPUMillicores is the number of CPU millicores (1000 = 1 vCPU). + CPUMillicores int64 + // MemoryMiB is the amount of RAM in mebibytes. + MemoryMiB int64 +} + +// instanceTypeCatalog maps platform instance type names to their resource +// dimensions used for quota accounting when the instance spec carries only an +// instanceType and no explicit container Limits or instance-level Requests. +// +// These are the platform-declared quota sizes for the instance type, not a +// derivation of any infra provider's machine type. (infra-provider-gcp separately +// maps datumcloud/d1-standard-2 to the GCP n2-standard-2 machine type for VM +// provisioning; that mapping does not define the quota size here.) When new +// instance types are added, add them here with their vCPU/memory values. +var instanceTypeCatalog = map[string]instanceTypeResources{ + instanceTypeD1Standard2: { + CPUMillicores: 1000, // 1 vCPU + MemoryMiB: 2048, // 2 GiB + }, +} + // Quota-pending requeue backoff. 
The instance controller is normally re-queued by // the ResourceClaim watch when a claim is granted, but that grant event lives on // the project control plane and can be missed (informer engagement races, watch @@ -881,8 +909,24 @@ func (r *InstanceReconciler) classifyCreateError( }, fmt.Errorf("failed creating resource claim: %w", err) } +// resolveInstanceResources determines the vCPU and memory amounts to claim +// for an instance. Explicit sizing always takes precedence over the instance +// type catalog, so a workload that overrides container limits is accounted at +// its actual resource footprint rather than the catalog baseline. +// +// Precedence order: +// 1. Sandbox container Limits (sum across all containers) — all containers +// must have both cpu and memory Limits for this path to succeed. +// 2. Instance-level Resources.Requests — both cpu and memory must be present. +// 3. instanceTypeCatalog lookup by instanceType — used for the common case +// where a workload is sized only by instanceType with no explicit limits. +// +// Returns (0, 0, false) when none of the above yield a complete sizing, so +// the caller falls back to claiming only the instance count. func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores int64, memMiB int64, resolved bool) { rt := instance.Spec.Runtime + + // Path 1: explicit per-container Limits — most specific, wins if fully set. 
if rt.Sandbox != nil { var totalCPU resource.Quantity var totalMem resource.Quantity @@ -901,18 +945,60 @@ func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores totalCPU.Add(cpu) totalMem.Add(mem) } - if !allSet || len(rt.Sandbox.Containers) == 0 { - return 0, 0, false + if allSet && len(rt.Sandbox.Containers) > 0 { + return totalCPU.MilliValue(), totalMem.Value() / (1024 * 1024), true } - return totalCPU.MilliValue(), totalMem.Value() / (1024 * 1024), true + // Containers exist but limits are incomplete — fall through so the + // instance-level Requests and instanceType catalog paths can still + // yield a sizing. } + // Path 2: instance-level resource requests. cpu, hasCPU := rt.Resources.Requests[corev1.ResourceCPU] mem, hasMem := rt.Resources.Requests[corev1.ResourceMemory] - if !hasCPU || !hasMem { - return 0, 0, false + if hasCPU && hasMem { + return cpu.MilliValue(), mem.Value() / (1024 * 1024), true + } + + // Path 3: instanceType catalog — handles the typical production case where + // instanceType is the only sizing signal and no explicit limits are set. + if rt.Resources.InstanceType != "" { + if spec, ok := instanceTypeCatalog[rt.Resources.InstanceType]; ok { + return spec.CPUMillicores, spec.MemoryMiB, true + } + } + + return 0, 0, false +} + +// instanceBlockingReasonPriority ranks Instance blocking reasons so the most +// specific, user-actionable cause wins when several conditions are unsatisfied. +// Higher numbers are more specific. Reasons absent from the table rank 0. 
+// +// 0 - unknown/default +// 1 - Provisioning (transient runtime startup) +// 3 - PendingQuota (operator action may be needed) +// 5 - ImageUnavailable / InstanceCrashing / ConfigurationError +// (hard runtime error, user-actionable) +// 7 - NetworkFailedToCreate (hard infra error) +func instanceBlockingReasonPriority(reason string) int { + switch reason { + case computev1alpha.InstanceReadyReasonProvisioning: + return 1 + case computev1alpha.InstanceProgrammedReasonPendingQuota: + return 3 + case computev1alpha.InstanceReadyReasonImageUnavailable, + computev1alpha.InstanceReadyReasonInstanceCrashing, + computev1alpha.InstanceReadyReasonConfigurationError: + // Hard runtime errors are user-actionable (wrong image, crashing app, bad + // config) and rank highest among non-infra reasons so they are not buried + // under transient startup/quota reasons. + return 5 + case reasonNetworkFailedToCreate: + return 7 + default: + return 0 } - return cpu.MilliValue(), mem.Value() / (1024 * 1024), true } // networkFailureChecker is a function that checks if a network creation failure @@ -996,16 +1082,88 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not programmed", "instance", instance.Name) - readyCondition.Status = metav1.ConditionFalse - readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming - if programmedCondition != nil && programmedCondition.Reason != pendingReason { - readyCondition.Reason = programmedCondition.Reason + // Surface the most specific provider sub-condition rather than a generic + // "Instance has not been programmed". A provider reason like + // ImageUnavailable (set on the Available condition while Programmed is + // still Unknown) must surface on Ready with its actionable message. + // + // Two tiers are tracked: + // - bestKnown: the best candidate from the priority table (ranked 1-7). 
+ // - fallback: the Programmed condition's own reason/message when it has + // one but it is not in the priority table (e.g. a provider + // writes a custom Programmed reason otherwise unknown to + // this controller). Preserves Programmed.Reason → Ready.Reason + // pass-through behavior. + type candidate struct { + status metav1.ConditionStatus + reason string + message string + priority int + } + + // Generic default — used only when nothing better is found. + fallbackCandidate := candidate{ + status: metav1.ConditionFalse, + reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, + message: msgNotProgrammed, + priority: -1, + } + // Promote the Programmed condition's own reason as a fallback when it is + // more specific than PendingProgramming/Pending but not in the priority + // table. Preserves pass-through for provider-written Programmed reasons. + if programmedCondition != nil && programmedCondition.Reason != pendingReason && + programmedCondition.Reason != computev1alpha.InstanceProgrammedReasonPendingProgramming { + fallbackCandidate = candidate{ + status: programmedCondition.Status, + reason: programmedCondition.Reason, + message: programmedCondition.Message, + priority: 0, + } + } + + best := fallbackCandidate + consider := func(status metav1.ConditionStatus, reason, message string) { + // A generic "Pending" reason carries no actionable signal; skip it so + // it cannot displace an already-set specific reason from the provider. + if reason == pendingReason { + return + } + p := instanceBlockingReasonPriority(reason) + if p > best.priority { + best = candidate{status: status, reason: reason, message: message, priority: p} + } } - readyCondition.Message = msgNotProgrammed - if programmedCondition != nil && programmedCondition.Status != metav1.ConditionUnknown { - readyCondition.Message = programmedCondition.Message + // Sub-conditions set by the provider (e.g. 
Available=Unknown/ImageUnavailable) + // may be more specific than the Programmed condition. Consult each one so + // the highest-priority reason wins, regardless of which condition carries it. + for _, cond := range instance.Status.Conditions { + if cond.Status == metav1.ConditionTrue { + // Satisfied conditions are not blocking; skip them. + continue + } + switch cond.Type { + case computev1alpha.InstanceProgrammed, + computev1alpha.InstanceReady, + computev1alpha.InstanceQuotaGranted: + // InstanceProgrammed is handled below; InstanceReady is being set + // now. InstanceQuotaGranted is a gate-level signal evaluated before + // this branch is reached — including it here would let a transient + // PendingEvaluation reason displace the generic not-programmed + // fallback when no provider sub-condition is set yet. + continue + } + consider(cond.Status, cond.Reason, cond.Message) } + // Also let the Programmed condition itself compete through the priority table + // in case it carries a known reason (e.g. PendingQuota). + if programmedCondition != nil { + consider(programmedCondition.Status, programmedCondition.Reason, programmedCondition.Message) + } + + readyCondition.Status = best.status + readyCondition.Reason = best.reason + readyCondition.Message = best.message return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } @@ -1016,16 +1174,21 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if availableCondition == nil || availableCondition.Status != metav1.ConditionTrue { logger.Info("instance is not available", "instance", instance.Name) - readyCondition.Status = metav1.ConditionFalse - readyCondition.Reason = pendingReason + // Propagate the Available condition's reason and message directly — + // including when the status is Unknown — so provider-set reasons like + // ImageUnavailable surface on Ready rather than a generic message. 
+ readyStatus := metav1.ConditionFalse + readyReason := pendingReason + readyMessage := "Instance is not available" if availableCondition != nil && availableCondition.Reason != pendingReason { - readyCondition.Reason = availableCondition.Reason + readyStatus = availableCondition.Status + readyReason = availableCondition.Reason + readyMessage = availableCondition.Message } - readyCondition.Message = "Instance is not available" - if availableCondition != nil && availableCondition.Status != metav1.ConditionUnknown { - readyCondition.Message = availableCondition.Message - } + readyCondition.Status = readyStatus + readyCondition.Reason = readyReason + readyCondition.Message = readyMessage return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 202b58b1..1445ff96 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -8,8 +8,10 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -1845,3 +1847,600 @@ func TestQuotaPendingRequeueAfter(t *testing.T) { }) } } + +// Shared literals for the instance-sizing / blocking-reason tests below. +const ( + testContainerName = "app" + testContainerImage = "test/image:latest" +) + +// TestReconcileInstanceReadyCondition_ProviderSubConditionSurfacing verifies +// that provider-set sub-condition reasons (e.g. 
ImageUnavailable written by the +// unikraft provider onto the Available condition) surface on Ready with both the +// reason AND the message preserved — even when the sub-condition status is +// Unknown (the normal state for a retriable image-pull failure). +// +// This guards against Ready carrying a generic message that discards the +// actionable provider reason. +func TestReconcileInstanceReadyCondition_ProviderSubConditionSurfacing(t *testing.T) { + // These messages mirror the exact strings that translateWaitingReason in the + // unikraft provider writes. Both the reason AND the message must reach Ready. + const ( + msgImageUnavailable = "The instance image could not be pulled" + msgInstanceCrashing = "The instance is repeatedly failing to start" + msgConfigError = "The instance could not be started due to a configuration error" + msgProvisioning = "Instance is provisioning" + msgProgrammingInProgress = "Instance is being programmed" + ) + + noGates := func(inst *computev1alpha.Instance) *computev1alpha.Instance { return inst } + withQuotaGranted := func(inst *computev1alpha.Instance) *computev1alpha.Instance { + inst.Status.Conditions = append(inst.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, + Message: "Quota allocated", + }) + return inst + } + + tests := []struct { + name string + instance *computev1alpha.Instance + wantStatus metav1.ConditionStatus + wantReason string + wantMessage string + }{ + { + // The key scenario from the design: provider writes Available=Unknown/ + // ImageUnavailable while Programmed is still Unknown/ProgrammingInProgress. + // Ready must carry ImageUnavailable + the actionable message, NOT the + // generic "Instance has not been programmed". 
+ name: "image_pull_failure_surfaces_on_ready", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + // Provider sets Available=Unknown/ImageUnavailable when the + // container enters an image-pull waiting state. + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + { + // Even while Programmed is Unknown, Ready must surface the provider + // sub-condition's reason and message; the generic PendingProgramming/ + // msgNotProgrammed pair is reserved for instances with no more + // specific signal. 
+ name: "provider_reason_wins_over_generic_message_while_programmed_unknown", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + { + // When both a transient Provisioning and ImageUnavailable are present, + // ImageUnavailable (priority 5) must win over Provisioning (priority 1). + name: "image_unavailable_beats_transient_provisioning", + instance: noGates(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonProvisioning, + Message: msgProvisioning, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + { + // When no specific provider sub-condition exists but Programmed carries + // a specific reason (ProgrammingInProgress), that reason should + // pass-through to Ready. 
The generic msgNotProgrammed fallback is only + // used when Programmed is absent or carries only a generic "Pending" reason. + name: "programmed_in_progress_passes_through_when_no_provider_sub_condition", + instance: noGates(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + }, + }, + }), + // ProgrammingInProgress is more specific than PendingProgramming and + // passes through from Programmed → Ready. + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + wantMessage: msgProgrammingInProgress, + }, + { + // True generic fallback: no Programmed condition at all. The default + // PendingProgramming/msgNotProgrammed must be emitted. + name: "generic_fallback_when_programmed_condition_absent", + instance: noGates(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + }, + }), + wantStatus: metav1.ConditionFalse, + wantReason: computev1alpha.InstanceProgrammedReasonPendingProgramming, + wantMessage: msgNotProgrammed, + }, + { + // InstanceCrashing: terminal-ish (not retried indefinitely by the user, + // they must fix the app). Status=Unknown from provider → Ready=Unknown. 
+ name: "instance_crashing_surfaces_on_ready", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonInstanceCrashing, + Message: msgInstanceCrashing, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonInstanceCrashing, + wantMessage: msgInstanceCrashing, + }, + { + // ConfigurationError: provider could not start the container due to a + // spec/config issue. User must correct the workload. + name: "configuration_error_surfaces_on_ready", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceProgrammedReasonProgrammingInProgress, + Message: msgProgrammingInProgress, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonConfigurationError, + Message: msgConfigError, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonConfigurationError, + wantMessage: msgConfigError, + }, + { + // When Programmed=True but Available=Unknown/ImageUnavailable, the + // available-not-true branch must also propagate the provider reason+message. 
+ name: "image_unavailable_on_available_condition_programmed_true", + instance: withQuotaGranted(&computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceProgrammedReasonProgrammed, + Message: msgInstanceProgrammed, + }, + { + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceReadyReasonImageUnavailable, + Message: msgImageUnavailable, + }, + }, + }, + }), + wantStatus: metav1.ConditionUnknown, + wantReason: computev1alpha.InstanceReadyReasonImageUnavailable, + wantMessage: msgImageUnavailable, + }, + } + + noNetworkFailure := func(_ context.Context, _ client.Client, _ *computev1alpha.Instance) (bool, string, error) { + return false, "", nil + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := &InstanceReconciler{} + _, err := r.reconcileInstanceReadyCondition(context.Background(), nil, tt.instance, noNetworkFailure) + require.NoError(t, err) + + ready := apimeta.FindStatusCondition(tt.instance.Status.Conditions, computev1alpha.InstanceReady) + require.NotNil(t, ready, "Ready condition must be set") + assert.Equal(t, tt.wantStatus, ready.Status, "Ready.Status mismatch") + assert.Equal(t, tt.wantReason, ready.Reason, "Ready.Reason mismatch") + assert.Equal(t, tt.wantMessage, ready.Message, "Ready.Message mismatch") + }) + } +} + +// TestResolveInstanceResources verifies the three-tier sizing precedence: +// explicit container Limits > instance-level Requests > instanceType catalog. 
+func TestResolveInstanceResources(t *testing.T) { + // d1Standard2 is the canonical catalog entry for datumcloud/d1-standard-2 + // (1 vCPU = 1000 millicores, 2 GiB = 2048 MiB) — the platform-declared quota + // size for the instance type. + const ( + d1CPUMillicores = int64(1000) + d1MemMiB = int64(2048) + ) + + cpu500m := resource.MustParse("500m") + cpu1 := resource.MustParse("1") + mem256Mi := resource.MustParse("256Mi") + mem512Mi := resource.MustParse("512Mi") + + makeContainerResources := func(cpu, mem resource.Quantity) *computev1alpha.ContainerResourceRequirements { + return &computev1alpha.ContainerResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: cpu, + corev1.ResourceMemory: mem, + }, + } + } + + tests := []struct { + name string + instance *computev1alpha.Instance + wantCPU int64 + wantMem int64 + wantResolved bool + }{ + { + // Common production case: instanceType only, no explicit limits. + // resolveInstanceResources must consult the catalog and return the + // d1-standard-2 values so vcpus + memory are included in the claim. + name: "instanceType only: d1-standard-2 resolves from catalog", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: instanceTypeD1Standard2, + }, + }, + }, + }, + wantCPU: d1CPUMillicores, + wantMem: d1MemMiB, + wantResolved: true, + }, + { + // Explicit container Limits take precedence over the catalog so that + // a workload with custom sizing is accounted at its actual footprint. 
+ name: "explicit container limits override catalog", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: instanceTypeD1Standard2, + }, + Sandbox: &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: testContainerName, + Image: testContainerImage, + Resources: makeContainerResources(cpu500m, mem256Mi), + }, + { + Name: "sidecar", + Image: "test/sidecar:latest", + Resources: makeContainerResources(cpu500m, mem256Mi), + }, + }, + }, + }, + }, + }, + // Two containers each contributing 500m CPU + 256 MiB → 1000m + 512 MiB. + wantCPU: 1000, + wantMem: 512, + wantResolved: true, + }, + { + // A single container with full cpu+memory Limits; no instanceType needed. + name: "single container limits, no instanceType", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Sandbox: &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: testContainerName, + Image: testContainerImage, + Resources: makeContainerResources(cpu1, mem512Mi), + }, + }, + }, + }, + }, + }, + wantCPU: 1000, + wantMem: 512, + wantResolved: true, + }, + { + // Instance-level Requests (no sandbox, no instanceType) use path 2. + name: "instance-level resources.requests resolve correctly", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: cpu1, + corev1.ResourceMemory: mem512Mi, + }, + }, + }, + }, + }, + wantCPU: 1000, + wantMem: 512, + wantResolved: true, + }, + { + // An unknown instanceType with no explicit sizing must not fabricate + // values; the caller falls back to claiming instance count only. 
+ name: "unknown instanceType, no explicit limits: unresolved", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: "datumcloud/unknown-type-99", + }, + }, + }, + }, + wantCPU: 0, + wantMem: 0, + wantResolved: false, + }, + { + // Empty instanceType and no explicit sizing: unresolved. + name: "empty instanceType, nothing explicit: unresolved", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{}, + }, + }, + }, + wantCPU: 0, + wantMem: 0, + wantResolved: false, + }, + { + // Sandbox containers without any Limits fall through to the catalog + // when an instanceType is set — partial container specs must not block + // catalog resolution. + name: "sandbox containers without limits fall through to catalog", + instance: &computev1alpha.Instance{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + InstanceType: instanceTypeD1Standard2, + }, + Sandbox: &computev1alpha.SandboxRuntime{ + Containers: []computev1alpha.SandboxContainer{ + { + Name: testContainerName, + Image: testContainerImage, + // No Resources.Limits set — common for UKC workloads. 
+ }, + }, + }, + }, + }, + }, + wantCPU: d1CPUMillicores, + wantMem: d1MemMiB, + wantResolved: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cpu, mem, resolved := resolveInstanceResources(tt.instance) + assert.Equal(t, tt.wantResolved, resolved, "resolved mismatch") + assert.Equal(t, tt.wantCPU, cpu, "cpuMillicores mismatch") + assert.Equal(t, tt.wantMem, mem, "memMiB mismatch") + }) + } +} + +// TestReconcileQuotaClaim_RequestsIncludeVCPUsAndMemory confirms that when an +// instance is sized by instanceType alone (the typical production shape), the +// ResourceClaim created by reconcileQuotaClaim includes vcpus and memory +// requests in addition to the instance count, so the AllowanceBuckets are fed. +func TestReconcileQuotaClaim_RequestsIncludeVCPUsAndMemory(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "claim-resources-test" + ) + + claimName := instanceQuotaClaimNamePrefix + instanceName + + s := newTestScheme(t) + + // Instance sized by instanceType only — no container limits, no explicit + // instance-level requests. This is the common production workload shape. + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: "owner-deployment", + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{ + // No Requests, no container Limits — catalog must supply the values. 
+ InstanceType: instanceTypeD1Standard2, + }, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "owner-deployment", + Namespace: namespace, + UID: testUIDString, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}). + Build() + + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + + r := &InstanceReconciler{ + mgr: &fakeMCManager{clusters: map[string]cluster.Cluster{clusterName: newFakeCluster(projectClient)}}, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + recorder: &record.FakeRecorder{}, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + }) + require.NoError(t, err) + + // Verify the created ResourceClaim carries vcpus and memory requests.
+ var createdClaim quotav1alpha1.ResourceClaim + require.NoError(t, quotaClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: claimName}, &createdClaim)) + + byType := make(map[string]int64, len(createdClaim.Spec.Requests)) + for _, req := range createdClaim.Spec.Requests { + byType[req.ResourceType] = req.Amount + } + + assert.Equal(t, int64(1), byType[quotaResourceTypeInstances], "instance count must be 1") + assert.Equal(t, int64(1000), byType["compute.datumapis.com/vcpus"], + "d1-standard-2 must claim 1000 millicores (1 vCPU)") + assert.Equal(t, int64(2048), byType["compute.datumapis.com/memory"], + "d1-standard-2 must claim 2048 MiB (2 GiB)") +} From cf31ee5ea1446f121db6d2ffebac72dc8e64e5a3 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 13:41:10 -0500 Subject: [PATCH 12/14] feat(config): CRDs, RBAC, and kustomize overlays for federation Regenerate the Instance, Workload, and WorkloadDeployment CRDs for the new API fields and add the kustomize structure that deploys the manager in cell or management-plane mode: federation and downstream RBAC bases, cell/management/quota-credentials components, the WorkloadDeployment status interpreter, and the matching overlays. The regenerated controller role also grants the event writes the instance controller performs when surfacing blocking reasons (QuotaNoBudget, ImageUnavailable, NetworkFailedToCreate, ...) so those signals reach kubectl describe and the activity timeline instead of being rejected by RBAC. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../base/downstream-rbac/kustomization.yaml | 5 ++ config/base/downstream-rbac/rbac.yaml | 35 ++++++++++++++ config/base/federation/kustomization.yaml | 10 ++++ config/base/manager/manager.yaml | 31 +++++++++--- config/base/manager/service_account.yaml | 2 +- .../cell-controllers/kustomization.yaml | 20 ++++++++ .../metrics_auth_role_binding.yaml | 2 +- config/components/controller_rbac/role.yaml | 47 +++++++++++++++++++ .../controller_rbac/role_binding.yaml | 2 +- .../components/federation/kustomization.yaml | 5 ++ .../workloaddeployment-interpreter.yaml | 28 +++++++++++ .../leader_election_role_binding.yaml | 2 +- .../management-controllers/kustomization.yaml | 20 ++++++++ .../quota-credentials/kustomization.yaml | 26 ++++++++++ .../service-configuration.yaml | 25 +++++++++- .../overlays/cell/disable_webhook_patch.yaml | 12 +++++ config/overlays/cell/kustomization.yaml | 17 +++++++ .../discovery_mode_patch.yaml | 13 +++++ .../downstream_kubeconfig_patch.yaml | 29 ++++++++++++ .../management-plane/kustomization.yaml | 21 +++++++++ .../single-cluster/kustomization.yaml | 2 + 21 files changed, 343 insertions(+), 11 deletions(-) create mode 100644 config/base/downstream-rbac/kustomization.yaml create mode 100644 config/base/downstream-rbac/rbac.yaml create mode 100644 config/base/federation/kustomization.yaml create mode 100644 config/components/cell-controllers/kustomization.yaml create mode 100644 config/components/federation/kustomization.yaml create mode 100644 config/components/federation/workloaddeployment-interpreter.yaml create mode 100644 config/components/management-controllers/kustomization.yaml create mode 100644 config/components/quota-credentials/kustomization.yaml create mode 100644 config/overlays/cell/disable_webhook_patch.yaml create mode 100644 config/overlays/cell/kustomization.yaml create mode 100644 config/overlays/management-plane/discovery_mode_patch.yaml create mode 100644 
config/overlays/management-plane/downstream_kubeconfig_patch.yaml create mode 100644 config/overlays/management-plane/kustomization.yaml diff --git a/config/base/downstream-rbac/kustomization.yaml b/config/base/downstream-rbac/kustomization.yaml new file mode 100644 index 00000000..4c4dbe44 --- /dev/null +++ b/config/base/downstream-rbac/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - rbac.yaml diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml new file mode 100644 index 00000000..1937ef02 --- /dev/null +++ b/config/base/downstream-rbac/rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-manager +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "watch", "create", "update", "patch"] + - apiGroups: ["compute.datumapis.com"] + resources: ["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["policy.karmada.io"] + resources: ["propagationpolicies", "clusterpropagationpolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.karmada.io"] + resources: ["clusters"] + verbs: ["get", "list", "watch"] + - apiGroups: ["work.karmada.io"] + resources: ["resourcebindings", "clusterresourcebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["config.karmada.io"] + resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: compute-manager +subjects: + - kind: User + name: system:serviceaccount:compute-system:compute-manager diff --git 
a/config/base/federation/kustomization.yaml b/config/base/federation/kustomization.yaml new file mode 100644 index 00000000..1261dac6 --- /dev/null +++ b/config/base/federation/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../crd/bases/compute.datumapis.com_instances.yaml + - ../crd/bases/compute.datumapis.com_workloaddeployments.yaml + - ../crd/bases/compute.datumapis.com_workloads.yaml + +components: + - ../../components/federation diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index e2c06e97..8ef18135 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -26,14 +26,33 @@ spec: seccompProfile: type: RuntimeDefault containers: - - command: + - name: manager + command: - /manager args: - - --leader-elect - - --health-probe-bind-address=:8081 - - --server-config=/config/config.yaml + - --leader-elect=$(LEADER_ELECT) + - --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS) + - --server-config=$(SERVER_CONFIG) + - --federation-kubeconfig=$(FEDERATION_KUBECONFIG) + - --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS) + - --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS) + - --feature-gates=$(FEATURE_GATES) + env: + - name: LEADER_ELECT + value: "true" + - name: HEALTH_PROBE_BIND_ADDRESS + value: ":8081" + - name: SERVER_CONFIG + value: /config/config.yaml + - name: FEDERATION_KUBECONFIG + value: "" + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "false" + - name: ENABLE_CELL_CONTROLLERS + value: "false" + - name: FEATURE_GATES + value: "" image: ghcr.io/datum-cloud/compute:latest - name: manager ports: - containerPort: 9443 name: webhook-server @@ -66,7 +85,7 @@ spec: volumeMounts: - name: config mountPath: /config - serviceAccountName: compute + serviceAccountName: compute-manager terminationGracePeriodSeconds: 10 volumes: - name: config diff --git a/config/base/manager/service_account.yaml 
b/config/base/manager/service_account.yaml index f8711deb..cc6bd6cc 100644 --- a/config/base/manager/service_account.yaml +++ b/config/base/manager/service_account.yaml @@ -4,4 +4,4 @@ metadata: labels: app.kubernetes.io/name: compute app.kubernetes.io/managed-by: kustomize - name: compute + name: compute-manager diff --git a/config/components/cell-controllers/kustomization.yaml b/config/components/cell-controllers/kustomization.yaml new file mode 100644 index 00000000..3f32da3b --- /dev/null +++ b/config/components/cell-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_CELL_CONTROLLERS + value: "true" diff --git a/config/components/controller_rbac/metrics_auth_role_binding.yaml b/config/components/controller_rbac/metrics_auth_role_binding.yaml index 1ea3d974..ada1a1de 100644 --- a/config/components/controller_rbac/metrics_auth_role_binding.yaml +++ b/config/components/controller_rbac/metrics_auth_role_binding.yaml @@ -8,4 +8,4 @@ roleRef: name: compute-metrics-auth-role subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/controller_rbac/role.yaml b/config/components/controller_rbac/role.yaml index 5d803d2c..a634f512 100644 --- a/config/components/controller_rbac/role.yaml +++ b/config/components/controller_rbac/role.yaml @@ -4,6 +4,20 @@ kind: ClusterRole metadata: name: compute rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list - apiGroups: - compute.datumapis.com resources: @@ -36,3 +50,36 @@ rules: - get - patch - update +- apiGroups: + - networking.datumapis.com + resources: + - locations + - networkcontexts + 
- subnets + verbs: + - get + - list + - watch +- apiGroups: + - networking.datumapis.com + resources: + - networkbindings + - subnetclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - quota.miloapis.com + resources: + - resourceclaims + verbs: + - create + - delete + - get + - list + - watch diff --git a/config/components/controller_rbac/role_binding.yaml b/config/components/controller_rbac/role_binding.yaml index 6256bf3f..2f3e2676 100644 --- a/config/components/controller_rbac/role_binding.yaml +++ b/config/components/controller_rbac/role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/federation/kustomization.yaml b/config/components/federation/kustomization.yaml new file mode 100644 index 00000000..3ba207ff --- /dev/null +++ b/config/components/federation/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - workloaddeployment-interpreter.yaml diff --git a/config/components/federation/workloaddeployment-interpreter.yaml b/config/components/federation/workloaddeployment-interpreter.yaml new file mode 100644 index 00000000..2743a63b --- /dev/null +++ b/config/components/federation/workloaddeployment-interpreter.yaml @@ -0,0 +1,28 @@ +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: workloaddeployment +spec: + target: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + customizations: + statusReflection: + luaScript: | + function ReflectStatus(observedObj) + if observedObj.status == nil then + return nil + end + return observedObj.status + end + statusAggregation: + luaScript: | + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if statusItems[1].status ~= nil then + desiredObj.status = 
statusItems[1].status + end + return desiredObj + end diff --git a/config/components/leader_election/leader_election_role_binding.yaml b/config/components/leader_election/leader_election_role_binding.yaml index a5fe9996..d6783c07 100644 --- a/config/components/leader_election/leader_election_role_binding.yaml +++ b/config/components/leader_election/leader_election_role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute-leader-election subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/management-controllers/kustomization.yaml b/config/components/management-controllers/kustomization.yaml new file mode 100644 index 00000000..d1e29e7f --- /dev/null +++ b/config/components/management-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "true" diff --git a/config/components/quota-credentials/kustomization.yaml b/config/components/quota-credentials/kustomization.yaml new file mode 100644 index 00000000..ffc9a6d8 --- /dev/null +++ b/config/components/quota-credentials/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + volumeMounts: + - name: quota-credentials + mountPath: /etc/quota-credentials + readOnly: true + volumes: + - name: quota-credentials + secret: + secretName: compute-quota-credentials + optional: true diff --git a/config/components/service-catalog/service-configuration.yaml 
b/config/components/service-catalog/service-configuration.yaml index 202ac8af..8c29a50e 100644 --- a/config/components/service-catalog/service-configuration.yaml +++ b/config/components/service-catalog/service-configuration.yaml @@ -6,6 +6,9 @@ spec: serviceRef: name: compute phase: Published + locations: + supportedClasses: + - datum-managed monitoredResourceTypes: - type: compute.datumapis.com/Instance displayName: Compute Instance @@ -44,6 +47,26 @@ spec: description: Seconds the instance has been in a running state. kind: Cumulative unit: s + - name: compute.datumapis.com/workloads + displayName: Compute Workloads + description: Number of compute workloads. + kind: Gauge + unit: '{workload}' + - name: compute.datumapis.com/instances + displayName: Compute Instances + description: Number of compute instances. + kind: Gauge + unit: '{instance}' + - name: compute.datumapis.com/vcpus + displayName: Compute vCPUs + description: Number of vCPUs allocated across all instances. + kind: Gauge + unit: '{millicore}' + - name: compute.datumapis.com/memory + displayName: Compute Memory + description: Memory allocated across all instances. 
+ kind: Gauge + unit: MiB billing: consumerDestinations: - monitoredResourceType: compute.datumapis.com/Instance @@ -53,13 +76,13 @@ spec: - compute.datumapis.com/instance/cpu-allocated - compute.datumapis.com/instance/memory-allocated - compute.datumapis.com/instance/uptime-seconds + quota: metricRules: - selector: apiGroup: compute.datumapis.com kind: Workload metricCosts: compute.datumapis.com/workloads: 1 - quota: limits: - name: compute-workloads metric: compute.datumapis.com/workloads diff --git a/config/overlays/cell/disable_webhook_patch.yaml b/config/overlays/cell/disable_webhook_patch.yaml new file mode 100644 index 00000000..85b57f09 --- /dev/null +++ b/config/overlays/cell/disable_webhook_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + discovery: + quotaKubeconfigPath: /etc/quota-credentials/kubeconfig diff --git a/config/overlays/cell/kustomization.yaml b/config/overlays/cell/kustomization.yaml new file mode 100644 index 00000000..80925ee2 --- /dev/null +++ b/config/overlays/cell/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. 
+namespace: compute-system + +resources: + - ../../base/manager +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/cell-controllers + - ../../components/quota-credentials + +patches: +- path: disable_webhook_patch.yaml diff --git a/config/overlays/management-plane/discovery_mode_patch.yaml b/config/overlays/management-plane/discovery_mode_patch.yaml new file mode 100644 index 00000000..97bf762c --- /dev/null +++ b/config/overlays/management-plane/discovery_mode_patch.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + webhookServer: {} + discovery: + mode: milo diff --git a/config/overlays/management-plane/downstream_kubeconfig_patch.yaml b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml new file mode 100644 index 00000000..7b3b764b --- /dev/null +++ b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: compute-manager +spec: + template: + spec: + containers: + - name: manager + env: + - name: FEDERATION_KUBECONFIG + value: /etc/kubernetes/downstream/auth/downstream-kubeconfig.yaml + volumeMounts: + - name: downstream-kubeconfig + mountPath: /etc/kubernetes/downstream/auth + readOnly: true + - name: karmada-token + mountPath: /etc/kubernetes/karmada-token + readOnly: true + volumes: + - name: downstream-kubeconfig + configMap: + name: compute-downstream-kubeconfig + - name: karmada-token + projected: + sources: + - serviceAccountToken: + audience: https://karmada-apiserver.karmada-system.svc.cluster.local:5443 + path: token diff --git a/config/overlays/management-plane/kustomization.yaml b/config/overlays/management-plane/kustomization.yaml new file mode 100644 index 00000000..dae13c58 --- /dev/null +++ 
b/config/overlays/management-plane/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. +namespace: compute-system + +resources: + - ../../base/manager + - ../../base/webhook +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/resource-metrics + - ../../components/high-availability + - ../../components/management-controllers + - ../../components/csi-webhook-cert + +patches: +- path: downstream_kubeconfig_patch.yaml +- path: discovery_mode_patch.yaml diff --git a/config/overlays/single-cluster/kustomization.yaml b/config/overlays/single-cluster/kustomization.yaml index 7a2d0320..4d72934e 100644 --- a/config/overlays/single-cluster/kustomization.yaml +++ b/config/overlays/single-cluster/kustomization.yaml @@ -15,3 +15,5 @@ components: - ../../components/resource-metrics - ../../components/high-availability - ../../components/csi-webhook-cert + - ../../components/management-controllers + - ../../components/cell-controllers From 1cc509ff14c79765d45a1de371569251b2a694b2 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 10 Jun 2026 09:47:43 -0500 Subject: [PATCH 13/14] feat: allow users to patch workloads --- config/components/iam/roles/compute-admin.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/components/iam/roles/compute-admin.yaml b/config/components/iam/roles/compute-admin.yaml index 4405bbd2..6bfaafa2 100644 --- a/config/components/iam/roles/compute-admin.yaml +++ b/config/components/iam/roles/compute-admin.yaml @@ -12,4 +12,5 @@ spec: includedPermissions: - compute.datumapis.com/workloads.create - compute.datumapis.com/workloads.update + - compute.datumapis.com/workloads.patch - compute.datumapis.com/workloads.delete From 7dc94a0cd266f1110cbaba98ab370b323abec1f6 Mon Sep 17 00:00:00 2001 From: Scot 
Wells Date: Wed, 10 Jun 2026 13:41:21 -0500 Subject: [PATCH 14/14] test(controller): cover reconcileInstanceGates replica counting and gate clearing Adds unit coverage for the WorkloadDeployment controller's replica bucketing (updated/current/ready/quota-blocked), the network scheduling-gate clearing path, the nil Spec.Controller and nil Status.Controller regressions, and the finalizer-add requeue with status publication (ObservedGeneration, DesiredReplicas, ReplicasReady). Co-Authored-By: Claude Fable 5 --- .../workloaddeployment_controller_test.go | 432 ++++++++++++++++++ 1 file changed, 432 insertions(+) create mode 100644 internal/controller/workloaddeployment_controller_test.go diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go new file mode 100644 index 00000000..e343a17b --- /dev/null +++ b/internal/controller/workloaddeployment_controller_test.go @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.datum.net/compute/internal/controller/instancecontrol" +) + +const ( + // wdControllerTestName / wdControllerTestNS / wdControllerTestUID are shared + // fixtures for the WorkloadDeployment controller unit tests. + wdControllerTestName = "test-wd" + wdControllerTestNS = "default" + wdControllerTestUID = "wd-uid-test" + + // wdControllerTestCityCode is the shared CityCode fixture for + // WorkloadDeployment controller tests. 
+ wdControllerTestCityCode = "DFW" + + // wdControllerTestWorkload is the shared WorkloadRef fixture. + wdControllerTestWorkload = "test-workload" + + // wdTestReasonProgrammed / wdTestReasonReady are condition Reason fixtures + // matching what the infra provider writes on Instances. + wdTestReasonProgrammed = "Programmed" + wdTestReasonReady = "Ready" +) + +// wdControllerTestDeployment builds a WorkloadDeployment fixture shaped like a +// cell-local deployment after Karmada propagation: city code, placement, and a +// minimal instance template so ComputeHash produces a stable hash. +func wdControllerTestDeployment(minReplicas int32) *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: wdControllerTestName, + Namespace: wdControllerTestNS, + UID: wdControllerTestUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: wdControllerTestCityCode, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: minReplicas, + // Always present in production: the API server defaults the policy + // via kubebuilder, and the instance-control strategy emits no + // create/wait actions without it. + InstanceManagementPolicy: computev1alpha.OrderedReadyInstanceManagementPolicyType, + }, + Template: computev1alpha.InstanceTemplateSpec{ + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{}, + }, + }, + }, + }, + } +} + +// wdControllerTestInstance builds an Instance fixture labeled the way the +// instance control strategy creates them (workload-deployment-uid label set). 
+func wdControllerTestInstance(name string) computev1alpha.Instance { + return computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: wdControllerTestNS, + Labels: map[string]string{ + computev1alpha.WorkloadDeploymentUIDLabel: wdControllerTestUID, + }, + }, + } +} + +// TestReconcileInstanceGates_NilController_DoesNotPanic is a regression test for +// the case where an Instance has Programmed=True but Status.Controller is nil. +// +// Background: Status.Controller is a nilable pointer that the infra provider +// populates independently of setting the Programmed condition. Before the guard +// was added, reconcileInstanceGates would dereference Status.Controller while +// counting currentReplicas, causing a nil pointer panic that aborted the +// reconcile loop and froze WorkloadDeployment status. +// +// This test verifies that: +// 1. The reconcile does not panic when Status.Controller is nil. +// 2. Only instances with a non-nil Status.Controller whose ObservedTemplateHash +// matches the deployment's current template hash are counted as current. +func TestReconcileInstanceGates_NilController_DoesNotPanic(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(2) + templateHash := instancecontrol.ComputeHash(deployment.Spec.Template) + + // Instance A: Programmed=True but Status.Controller is nil (the panic case). + // This instance must NOT be counted as current and must NOT cause a panic. + instanceNilController := wdControllerTestInstance("instance-nil-controller") + instanceNilController.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionTrue, + Reason: wdTestReasonProgrammed, + LastTransitionTime: metav1.Now(), + }, + }, + // Status.Controller intentionally nil — this is the regression scenario. + Controller: nil, + } + + // Instance B: Programmed=True with Status.Controller populated and matching + // hash. 
This instance MUST be counted as current (currentReplicas == 1). + instanceWithController := wdControllerTestInstance("instance-with-controller") + instanceWithController.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionTrue, + Reason: wdTestReasonProgrammed, + LastTransitionTime: metav1.Now(), + }, + }, + Controller: &computev1alpha.InstanceControllerStatus{ + ObservedTemplateHash: templateHash, + }, + } + + // Instance C: Ready=True (contributes to readyReplicas). + instanceReady := wdControllerTestInstance("instance-ready") + instanceReady.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceReady, + Status: metav1.ConditionTrue, + Reason: wdTestReasonReady, + LastTransitionTime: metav1.Now(), + }, + }, + Controller: &computev1alpha.InstanceControllerStatus{ + ObservedTemplateHash: templateHash, + }, + } + + instances := []computev1alpha.Instance{ + instanceNilController, + instanceWithController, + instanceReady, + } + + // Use a fake client. networkReady=false avoids the gate-patch path that + // would call CreateOrPatch, so the client is not exercised here. + cl := newProjectFakeClient() + r := &WorkloadDeploymentReconciler{} + + // The call must not panic — that is the primary regression assertion. + currentReplicas, _, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + instances, + false, // networkReady=false: skip gate-patch path + ) + + require.NoError(t, err) + + // Only instanceWithController has Programmed=True AND a non-nil + // Status.Controller with a matching hash — the nil-Controller instance must + // not be counted. instanceReady also has a matching hash but no Programmed + // condition, so it also does not increment currentReplicas. 
+ assert.Equal(t, 1, currentReplicas, + "only the instance with a populated, matching Status.Controller counts as current; "+ + "the nil-Controller instance must not be counted (Status.Controller nil regression guard)") + + assert.Equal(t, 1, readyReplicas, "instanceReady must be counted as ready") + assert.Equal(t, 0, quotaBlockedReplicas) +} + +// TestReconcileInstanceGates_NilSpecController_DoesNotPanic is a regression test +// for a nil-deref in reconcileInstanceGates: Spec.Controller is a nilable +// pointer, and the network gate-clearing path dereferenced +// instance.Spec.Controller.SchedulingGates without a nil guard. When +// networkReady is true and an instance has no controller spec, the unguarded +// deref panicked the reconcile. This must not panic. +func TestReconcileInstanceGates_NilSpecController_DoesNotPanic(t *testing.T) { + t.Parallel() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: wdControllerTestName, Namespace: wdControllerTestNS, UID: wdControllerTestUID}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: wdControllerTestCityCode, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: wdControllerTestWorkload}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + Template: computev1alpha.InstanceTemplateSpec{}, + }, + } + + // Spec.Controller intentionally nil — the network gate-clearing path runs + // (networkReady=true) and must skip this instance instead of panicking. 
+ instanceNilSpecController := computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{Name: "instance-nil-spec-controller", Namespace: wdControllerTestNS}, + } + + cl := newProjectFakeClient() + r := &WorkloadDeploymentReconciler{} + + require.NotPanics(t, func() { + _, _, _, _, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{instanceNilSpecController}, + true, // networkReady=true exercises the Spec.Controller deref path + ) + require.NoError(t, err) + }) +} + +// TestReconcileInstanceGates_ReplicaCounting verifies how instances are bucketed +// into the replica counters: +// +// - updatedReplicas: ObservedTemplateHash matches the desired template hash, +// regardless of readiness — a stale hash must not count. +// - currentReplicas: the Programmed=True subset of updated instances. +// - readyReplicas: Ready=True regardless of revision. +// - quotaBlockedReplicas: QuotaGranted=False. +func TestReconcileInstanceGates_ReplicaCounting(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(4) + templateHash := instancecontrol.ComputeHash(deployment.Spec.Template) + + // Updated + Programmed + Ready: counts toward updated, current, and ready. + instanceUpdatedReady := wdControllerTestInstance("instance-updated-ready") + instanceUpdatedReady.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + {Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: wdTestReasonProgrammed, LastTransitionTime: metav1.Now()}, + {Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: wdTestReasonReady, LastTransitionTime: metav1.Now()}, + }, + Controller: &computev1alpha.InstanceControllerStatus{ObservedTemplateHash: templateHash}, + } + + // Stale revision but Programmed and Ready: counts toward ready only — a + // rolling update must surface UpdatedReplicas < Replicas. 
+ instanceStale := wdControllerTestInstance("instance-stale") + instanceStale.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + {Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: wdTestReasonProgrammed, LastTransitionTime: metav1.Now()}, + {Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: wdTestReasonReady, LastTransitionTime: metav1.Now()}, + }, + Controller: &computev1alpha.InstanceControllerStatus{ObservedTemplateHash: "stale-hash"}, + } + + // Updated but not yet Programmed: counts toward updated only. + instanceUpdatedPending := wdControllerTestInstance("instance-updated-pending") + instanceUpdatedPending.Status = computev1alpha.InstanceStatus{ + Controller: &computev1alpha.InstanceControllerStatus{ObservedTemplateHash: templateHash}, + } + + // Quota-blocked: QuotaGranted=False as the instance quota controller writes it. + instanceQuotaBlocked := wdControllerTestInstance("instance-quota-blocked") + instanceQuotaBlocked.Status = computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: "quota exceeded", + LastTransitionTime: metav1.Now(), + }, + }, + } + + cl := newProjectFakeClient() + r := &WorkloadDeploymentReconciler{} + + currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{instanceUpdatedReady, instanceStale, instanceUpdatedPending, instanceQuotaBlocked}, + false, + ) + require.NoError(t, err) + + assert.Equal(t, 2, updatedReplicas, "matching-hash instances count as updated; the stale-hash instance must not") + assert.Equal(t, 1, currentReplicas, "only updated AND Programmed instances count as current") + assert.Equal(t, 2, readyReplicas, "Ready=True counts regardless of revision") + 
assert.Equal(t, 1, quotaBlockedReplicas, "QuotaGranted=False counts as quota-blocked") +} + +// TestReconcileInstanceGates_ClearsNetworkSchedulingGate verifies the network +// gate-clearing path: once networking is ready, the Network scheduling gate is +// removed from gated instances while unrelated gates are preserved. When +// networking is not ready, gates are left untouched. +func TestReconcileInstanceGates_ClearsNetworkSchedulingGate(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(1) + + newGatedInstance := func() *computev1alpha.Instance { + instance := wdControllerTestInstance("instance-gated") + // Gate order matches the stateful instance control strategy: Network + // prepended ahead of Quota. + instance.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + } + return &instance + } + + t.Run("network ready removes only the Network gate", func(t *testing.T) { + t.Parallel() + + instance := newGatedInstance() + cl := newProjectFakeClient(instance) + r := &WorkloadDeploymentReconciler{} + + _, _, _, _, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{*instance}, + true, + ) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, cl.Get(context.Background(), client.ObjectKeyFromObject(instance), &updated)) + require.NotNil(t, updated.Spec.Controller) + require.Len(t, updated.Spec.Controller.SchedulingGates, 1, + "the Network gate must be removed and the Quota gate preserved") + assert.Equal(t, instancecontrol.QuotaSchedulingGate.String(), updated.Spec.Controller.SchedulingGates[0].Name) + }) + + t.Run("network not ready leaves gates untouched", func(t *testing.T) { + t.Parallel() + + instance := newGatedInstance() + cl := newProjectFakeClient(instance) + r := 
&WorkloadDeploymentReconciler{} + + _, _, _, _, err := r.reconcileInstanceGates( + context.Background(), + cl, + deployment, + []computev1alpha.Instance{*instance}, + false, + ) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, cl.Get(context.Background(), client.ObjectKeyFromObject(instance), &updated)) + require.NotNil(t, updated.Spec.Controller) + assert.Len(t, updated.Spec.Controller.SchedulingGates, 2, + "gates must not be cleared while networking is still provisioning") + }) +} + +// newTestWDReconciler builds a WorkloadDeploymentReconciler wired to a fake +// project cluster with the controller finalizer pre-registered, mirroring +// SetupWithManager. Networking is disabled so Reconcile treats the network as +// immediately ready without touching networking CRDs. +func newTestWDReconciler(projectClient client.Client) *WorkloadDeploymentReconciler { + r := &WorkloadDeploymentReconciler{ + mgr: newFakeMCManager(testCluster, newFakeCluster(projectClient)), + NetworkingEnabled: false, + } + feds := finalizer.NewFinalizers() + if err := feds.Register(workloadControllerFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// TestWorkloadDeploymentReconcile_FinalizerAddRequeues verifies the first +// reconcile of a brand-new WorkloadDeployment: the finalizer is added and the +// reconciler requeues explicitly, since the metadata-only finalizer Update may +// be filtered by event predicates or handlers and would otherwise strand the +// deployment unreconciled. 
+func TestWorkloadDeploymentReconcile_FinalizerAddRequeues(t *testing.T) { + t.Parallel() + + deployment := wdControllerTestDeployment(1) // no finalizer yet + cl := newProjectFakeClient(deployment) + r := newTestWDReconciler(cl) + + req := mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{Name: wdControllerTestName, Namespace: wdControllerTestNS}, + }, + } + + result, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{Requeue: true}, result, + "finalizer-add reconcile must requeue explicitly; the metadata-only update may not re-enqueue via watches") + + var updated computev1alpha.WorkloadDeployment + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + assert.Contains(t, updated.Finalizers, workloadControllerFinalizer) + + // Second reconcile (post-requeue) proceeds past the finalizer branch and + // publishes status: ObservedGeneration tracks the deployment generation and + // DesiredReplicas reflects scale settings. + result, err = r.Reconcile(context.Background(), req) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) + assert.Equal(t, updated.Generation, updated.Status.ObservedGeneration) + assert.Equal(t, int32(1), updated.Status.DesiredReplicas) + assert.True(t, apimeta.IsStatusConditionTrue(updated.Status.Conditions, computev1alpha.WorkloadDeploymentReplicasReady), + "no instances are quota-blocked, so ReplicasReady must be true") +}