From 556ad2068556ff153b958f07ce9bbe455e70c9dc Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:06:54 -0500 Subject: [PATCH 01/11] chore(deps): upgrade to Go 1.25 and golangci-lint v2.12.2 Bump the toolchain to Go 1.25 and golangci-lint v2.12.2, introduce a Taskfile for the standard build/test/lint targets, and align the CI workflows and Makefile with the new versions. Remove stale RFC and enhancement docs that the federated-scheduling work supersedes. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/lint.yml | 4 +- .github/workflows/publish.yaml | 3 + .github/workflows/test-e2e.yml | 2 +- .github/workflows/test.yml | 2 +- .gitignore | 7 +- .golangci.yml | 10 + Makefile | 2 +- Taskfile.yaml | 481 ++++++++++++++++++ .../rfcs/configmap-secret-mounts.md | 300 ----------- docs/enhancements/datumctl-compute-dx.md | 330 ------------ go.mod | 117 +++-- go.sum | 309 +++++------ 12 files changed, 703 insertions(+), 864 deletions(-) create mode 100644 Taskfile.yaml delete mode 100644 docs/compute/development/rfcs/configmap-secret-mounts.md delete mode 100644 docs/enhancements/datumctl-compute-dx.md diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ddfaa170..caa00ff3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,9 +15,9 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Run linter uses: golangci/golangci-lint-action@v8 with: - version: v2.1.5 + version: v2.12.2 diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8949c76b..5dcc90bb 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit publish-kustomize-bundles: + needs: publish-container-image permissions: id-token: write contents: read @@ -26,4 +27,6 @@ jobs: with: bundle-name: ghcr.io/datum-cloud/compute-kustomize bundle-path: config + image-name: 
ghcr.io/datum-cloud/compute + image-overlays: config/base/manager secrets: inherit diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 8429bf2d..9bede775 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Install the latest version of kind run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 834d33a0..462cbf3d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Running Tests run: | diff --git a/.gitignore b/.gitignore index 2b0c6e44..d5cc564d 100644 --- a/.gitignore +++ b/.gitignore @@ -14,8 +14,8 @@ # Output of the go coverage tool, specifically when used with LiteIDE *.out -# Dependency directories (remove the comment below to include it) -# vendor/ +# Dependency directories +vendor/ # Go workspace file go.work @@ -25,3 +25,6 @@ go.work.sum .env bin/ + +# Local e2e environment artefacts (Kind kubeconfigs, etc.) +tmp/ diff --git a/.golangci.yml b/.golangci.yml index a7246fbb..f5834e3c 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -35,6 +35,16 @@ linters: - dupl - lll path: internal/* + # field.ErrorList{} is the idiomatic Kubernetes validation init pattern; + # preallocating requires knowing the error count in advance which is not + # possible in recursive validation helpers. + - linters: + - prealloc + path: internal/validation/ + # Test helpers that build slices via append are clearer without prealloc. 
+ - linters: + - prealloc + path: internal/controller/instancecontrol/ paths: - third_party$ - builtin$ diff --git a/Makefile b/Makefile index 61744a36..3d6a3e2e 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ KUSTOMIZE_VERSION ?= v5.5.0 CONTROLLER_TOOLS_VERSION ?= v0.16.4 DEFAULTER_GEN_VERSION ?= v0.32.3 ENVTEST_VERSION ?= release-0.19 -GOLANGCI_LINT_VERSION ?= v2.1.5 +GOLANGCI_LINT_VERSION ?= v2.12.2 # renovate: datasource=go depName=fybrik.io/crdoc CRDOC_VERSION ?= v0.6.4 diff --git a/Taskfile.yaml b/Taskfile.yaml new file mode 100644 index 00000000..bcfbb0f8 --- /dev/null +++ b/Taskfile.yaml @@ -0,0 +1,481 @@ +version: '3' + +# ─── Variables ────────────────────────────────────────────────────────────── + +vars: + # Karmada Helm chart version to install (karmada-charts/karmada) + KARMADA_VERSION: v1.16.0 + + # karmadactl CLI version for cluster registration + KARMADACTL_VERSION: v1.16.0 + + # Chainsaw version for e2e testing (kyverno/chainsaw) + CHAINSAW_VERSION: v0.2.15 + + # Local tool directory (mirrors Makefile convention) + LOCALBIN: '{{.ROOT_DIR}}/bin' + KARMADACTL: '{{.ROOT_DIR}}/bin/karmadactl' + CHAINSAW: '{{.ROOT_DIR}}/bin/chainsaw' + + # Kind cluster names + KIND_CONTROL_PLANE: compute-control-plane + KIND_POP_DFW: compute-pop-dfw + KIND_POP_ORD: compute-pop-ord + + # All cluster names (for CRD installation loops) + KIND_ALL_CLUSTERS: '{{.KIND_CONTROL_PLANE}} {{.KIND_POP_DFW}} {{.KIND_POP_ORD}}' + + # Working directory for e2e artefacts (gitignored) + E2E_DIR: '{{.ROOT_DIR}}/tmp/e2e' + KUBECONFIG_DIR: '{{.ROOT_DIR}}/tmp/e2e/kubeconfigs' + + # Fixed NodePort for the Karmada API server. + # The Kind management cluster is created with an extraPortMapping for this port + # so it is reachable at https://localhost:32443 from the developer's machine. 
+ KARMADA_API_NODEPORT: "32443" + +# ─── Tasks ────────────────────────────────────────────────────────────────── + +tasks: + + default: + cmds: + - task --list + silent: true + + # ════════════════════════════════════════════════════════════════════════ + # e2e environment lifecycle + # ════════════════════════════════════════════════════════════════════════ + + e2e:up: + desc: "Create the full local Kind+Karmada e2e environment (idempotent)" + cmds: + - task: e2e:tools + - task: e2e:clusters:create + - task: e2e:karmada:install + - task: e2e:karmada:configure + - task: e2e:karmada:join-clusters + - task: e2e:crds:install + - cmd: | + echo "" + echo "╔══════════════════════════════════════════════════════════╗" + echo "║ e2e environment ready ║" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Control plane: {{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "║ Karmada API: {{.KUBECONFIG_DIR}}/karmada.yaml" + echo "║ POP DFW: {{.KUBECONFIG_DIR}}/pop-dfw.yaml" + echo "║ POP ORD: {{.KUBECONFIG_DIR}}/pop-ord.yaml" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Export for kubectl: ║" + echo "║ export KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "╚══════════════════════════════════════════════════════════╝" + silent: false + + e2e:down: + desc: "Tear down the local e2e environment" + cmds: + - kind delete cluster --name {{.KIND_CONTROL_PLANE}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_DFW}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_ORD}} 2>/dev/null || true + - rm -rf {{.E2E_DIR}} + - cmd: echo "✓ e2e environment torn down" + silent: false + + e2e:test: + desc: "Run Chainsaw e2e tests against the local Kind+Karmada environment" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + test/e2e/ \ + {{.CLI_ARGS}} + + e2e:test:filter: + desc: "Run a 
subset of e2e tests by name regex (e.g. task e2e:test:filter -- --include-test-regex federation)" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + {{.CLI_ARGS}} \ + test/e2e/ + + # ════════════════════════════════════════════════════════════════════════ + # Tool installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:tools: + desc: "Install e2e-specific tooling (karmadactl, chainsaw, helm repo)" + cmds: + - task: e2e:tools:karmadactl + - task: e2e:tools:chainsaw + - task: e2e:tools:helm-repo + + e2e:tools:karmadactl: + desc: "Download karmadactl {{.KARMADACTL_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.KARMADACTL}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/karmada-io/karmada/releases/download/{{.KARMADACTL_VERSION}}/karmadactl-${OS}-${ARCH}.tgz" + echo "Downloading karmadactl {{.KARMADACTL_VERSION}} (${OS}/${ARCH}) from ${URL}..." + curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} karmadactl + chmod +x {{.KARMADACTL}} + echo "karmadactl installed → {{.KARMADACTL}}" + else + echo "karmadactl already present at {{.KARMADACTL}}" + fi + status: + - test -f {{.KARMADACTL}} + + e2e:tools:chainsaw: + desc: "Download chainsaw {{.CHAINSAW_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.CHAINSAW}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/kyverno/chainsaw/releases/download/{{.CHAINSAW_VERSION}}/chainsaw_${OS}_${ARCH}.tar.gz" + echo "Downloading chainsaw {{.CHAINSAW_VERSION}} (${OS}/${ARCH}) from ${URL}..." 
+ curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} chainsaw + chmod +x {{.CHAINSAW}} + echo "chainsaw installed → {{.CHAINSAW}}" + else + echo "chainsaw already present at {{.CHAINSAW}}" + fi + status: + - test -f {{.CHAINSAW}} + + e2e:tools:helm-repo: + desc: "Add/update karmada-charts Helm repository" + cmds: + - | + if ! helm repo list 2>/dev/null | grep -q karmada-charts; then + helm repo add karmada-charts https://raw.githubusercontent.com/karmada-io/karmada/master/charts + echo "Added karmada-charts Helm repository" + fi + helm repo update karmada-charts + status: + - helm repo list 2>/dev/null | grep -q karmada-charts + + # ════════════════════════════════════════════════════════════════════════ + # Kind cluster management + # ════════════════════════════════════════════════════════════════════════ + + e2e:clusters:create: + desc: "Create all Kind clusters (idempotent)" + cmds: + # Management / control-plane cell cluster — needs extraPortMappings for + # the Karmada API server NodePort so it is accessible at localhost:32443. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_CONTROL_PLANE}}" + KIND_CONFIG: hack/e2e/kind-control-plane.yaml + # POP cell clusters — default Kind config is sufficient. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + KIND_CONFIG: "" + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + KIND_CONFIG: "" + - mkdir -p {{.KUBECONFIG_DIR}} + - task: _e2e:kubeconfigs:export + + _e2e:cluster:create: + internal: true + cmds: + - | + if kind get clusters 2>/dev/null | grep -qx '{{.CLUSTER_NAME}}'; then + echo "Kind cluster '{{.CLUSTER_NAME}}' already exists — skipping" + else + echo "Creating Kind cluster '{{.CLUSTER_NAME}}'..." 
+ CONFIG_FLAG="" + if [ -n "{{.KIND_CONFIG}}" ]; then + CONFIG_FLAG="--config {{.KIND_CONFIG}}" + fi + kind create cluster \ + --name {{.CLUSTER_NAME}} \ + $CONFIG_FLAG \ + --wait 90s + fi + + _e2e:kubeconfigs:export: + internal: true + desc: "Export Kind kubeconfigs and create Docker-IP variants for cross-cluster use" + cmds: + # Standard kubeconfigs (localhost-based, for developer kubectl use) + - kind export kubeconfig --name {{.KIND_CONTROL_PLANE}} --kubeconfig {{.KUBECONFIG_DIR}}/control-plane.yaml + - kind export kubeconfig --name {{.KIND_POP_DFW}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-dfw.yaml + - kind export kubeconfig --name {{.KIND_POP_ORD}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-ord.yaml + # Docker-IP kubeconfigs (used by Karmada controller, running inside Docker, + # to reach POP cell API servers across the kind bridge network) + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml \ + {{.KIND_POP_DFW}} + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord-internal.yaml \ + {{.KIND_POP_ORD}} + + # ════════════════════════════════════════════════════════════════════════ + # Karmada installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:install: + desc: "Install Karmada into the management cluster via Helm (idempotent)" + cmds: + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get ns karmada-system &>/dev/null; then + echo "Karmada already installed (karmada-system namespace exists)" + else + echo "Installing Karmada {{.KARMADA_VERSION}} via Helm..." 
+ helm install karmada karmada-charts/karmada \ + --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + --namespace karmada-system \ + --create-namespace \ + --version {{.KARMADA_VERSION}} \ + --set apiServer.serviceType=NodePort \ + --set apiServer.nodePort={{.KARMADA_API_NODEPORT}} \ + --wait \ + --timeout 5m + echo "Karmada installed" + fi + - task: _e2e:karmada:build-kubeconfig + + e2e:karmada:configure: + desc: "Apply federation component config to the Karmada API server (idempotent)" + cmds: + - | + echo "Applying federation component to Karmada..." + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml apply \ + -k config/components/federation/ + echo "Federation component applied" + + _e2e:karmada:build-kubeconfig: + internal: true + desc: "Extract Karmada kubeconfig from secret and patch server to localhost:{{.KARMADA_API_NODEPORT}}" + cmds: + - | + echo "Building Karmada kubeconfig → {{.KUBECONFIG_DIR}}/karmada.yaml" + # Extract raw kubeconfig from the secret the Helm chart creates + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get secret karmada-kubeconfig \ + -n karmada-system \ + -o jsonpath='{.data.kubeconfig}' \ + | base64 -d > {{.KUBECONFIG_DIR}}/karmada-raw.yaml + # Rewrite the server address to the NodePort exposed on localhost + python3 - {{.KUBECONFIG_DIR}}/karmada-raw.yaml {{.KUBECONFIG_DIR}}/karmada.yaml 127.0.0.1 {{.KARMADA_API_NODEPORT}} << 'PYEOF' + import sys, yaml + + src, dst, host, port = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] + + with open(src) as f: + cfg = yaml.safe_load(f) + + for cluster in cfg.get('clusters', []): + old = cluster['cluster'].get('server', '') + cluster['cluster']['server'] = f'https://{host}:{port}' + # The cert is for the internal cluster IP, so skip TLS verification. + # This is a local dev-only environment. 
+ cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + print(f" karmada server: {old} → https://{host}:{port}", file=sys.stderr) + + with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) + PYEOF + rm {{.KUBECONFIG_DIR}}/karmada-raw.yaml + + # ════════════════════════════════════════════════════════════════════════ + # POP cell cluster registration + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:join-clusters: + desc: "Register POP cell clusters with Karmada and apply city-code labels" + cmds: + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + CITY_CODE: dfw + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml" + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + CITY_CODE: ord + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord-internal.yaml" + + _e2e:karmada:join-cluster: + internal: true + cmds: + # ── Register with karmadactl join ────────────────────────────────── + # We pass the EXTERNAL kubeconfig (localhost-based) here so karmadactl + # can reach the member cluster from this macOS host to set up initial + # RBAC. The stored secret is patched below to the Docker-IP variant. + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + get cluster {{.CLUSTER_NAME}} &>/dev/null; then + echo "Cluster '{{.CLUSTER_NAME}}' already registered in Karmada — skipping join" + else + echo "Joining '{{.CLUSTER_NAME}}' to Karmada..." 
+ {{.KARMADACTL}} join {{.CLUSTER_NAME}} \ + --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --cluster-kubeconfig={{.EXTERNAL_KUBECONFIG}} \ + --cluster-context=kind-{{.CLUSTER_NAME}} + echo "Cluster '{{.CLUSTER_NAME}}' registered" + fi + # ── Patch cluster secret → Docker-IP kubeconfig ─────────────────── + # The Karmada controller manager runs inside Docker; it cannot use + # localhost to reach POP cell API servers. We update the stored secret + # with a kubeconfig whose server address uses the Kind container IP so + # container-to-container communication works across the kind bridge. + - | + hack/e2e/patch-cluster-secret.sh \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.CLUSTER_NAME}} \ + {{.INTERNAL_KUBECONFIG}} + # ── Apply city-code label ────────────────────────────────────────── + - | + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + label cluster {{.CLUSTER_NAME}} \ + topology.datum.net/city-code={{.CITY_CODE}} \ + --overwrite + echo "Labeled cluster '{{.CLUSTER_NAME}}' with topology.datum.net/city-code={{.CITY_CODE}}" + + # ════════════════════════════════════════════════════════════════════════ + # CRD installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:crds:install: + desc: "Install compute + NSO CRDs to all clusters" + cmds: + - task: _e2e:crds:compute + - task: _e2e:crds:nso + + _e2e:crds:compute: + internal: true + desc: "Apply compute CRDs to all clusters and the Karmada API server" + cmds: + # All three Kind clusters + the Karmada API server get the compute CRDs. + # The Karmada API server needs them so it can store and propagate + # WorkloadDeployment objects. + - | + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing compute CRDs → $(basename $KC .yaml)..." 
+ kubectl --kubeconfig="$KC" apply -k config/base/crd --server-side + done + + _e2e:crds:nso: + internal: true + desc: "Apply NSO CRDs to control-plane and POP cell clusters" + cmds: + # NSO CRDs (NetworkBinding, SubnetClaim, etc.) are installed on the + # control-plane as well as POP cells. The control-plane operator needs them + # so that Subnet/SubnetClaim informer watches can start without cache errors, + # even though NSO controllers themselves only run on POP cells. + - | + go mod download go.datum.net/network-services-operator + NSO_VERSION=$(go list -m -json go.datum.net/network-services-operator \ + | python3 -c "import sys, json; print(json.load(sys.stdin)['Version'])") + NSO_CRD_PATH="$(go env GOMODCACHE)/go.datum.net/network-services-operator@${NSO_VERSION}/config/crd" + echo "NSO CRDs from: ${NSO_CRD_PATH}" + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing NSO CRDs → $(basename $KC .yaml)..." + kubectl --kubeconfig="$KC" apply -k "${NSO_CRD_PATH}" --server-side + done + + # ════════════════════════════════════════════════════════════════════════ + # Operator lifecycle (background processes for federation e2e) + # ════════════════════════════════════════════════════════════════════════ + + e2e:operator:start: + desc: "Start management (control-plane) and cell (pop-dfw) operator instances in the background" + cmds: + - mkdir -p {{.E2E_DIR}}/logs {{.E2E_DIR}}/pids + - | + echo "Starting management operator (control-plane)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + go run ./cmd/main.go \ + --karmada-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-cell-controllers=false \ + --leader-elect=false \ + --health-probe-bind-address=:9091 \ + > {{.E2E_DIR}}/logs/operator-management.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-management.pid + echo "Management operator PID: $!" 
+ - | + echo "Waiting for management operator health check on :9091..." + deadline=$((SECONDS + 15)) + until curl -sf http://localhost:9091/healthz >/dev/null 2>&1; do + if [ $SECONDS -ge $deadline ]; then + echo "ERROR: management operator did not become healthy within 15s" + cat {{.E2E_DIR}}/logs/operator-management.log || true + exit 1 + fi + sleep 1 + done + echo "Management operator is healthy" + - | + echo "Starting cell operator (pop-dfw)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + go run ./cmd/main.go \ + --karmada-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-management-controllers=false \ + --leader-elect=false \ + --health-probe-bind-address=:9092 \ + > {{.E2E_DIR}}/logs/operator-cell-dfw.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-cell-dfw.pid + echo "Cell operator PID: $!" + - | + echo "Waiting for cell operator health check on :9092..." + deadline=$((SECONDS + 15)) + until curl -sf http://localhost:9092/healthz >/dev/null 2>&1; do + if [ $SECONDS -ge $deadline ]; then + echo "ERROR: cell operator did not become healthy within 15s" + cat {{.E2E_DIR}}/logs/operator-cell-dfw.log || true + exit 1 + fi + sleep 1 + done + echo "Cell operator is healthy" + + e2e:operator:stop: + desc: "Stop background operator instances" + cmds: + - | + for PIDFILE in \ + {{.E2E_DIR}}/pids/operator-management.pid \ + {{.E2E_DIR}}/pids/operator-cell-dfw.pid; do + if [ -f "$PIDFILE" ]; then + PID=$(cat "$PIDFILE") + if kill -0 "$PID" 2>/dev/null; then + echo "Stopping PID $PID ($(basename $PIDFILE .pid))..." 
+ kill -TERM "$PID" || true + else + echo "Process $PID ($(basename $PIDFILE .pid)) is not running" + fi + rm -f "$PIDFILE" + else + echo "PID file not found: $PIDFILE" + fi + done diff --git a/docs/compute/development/rfcs/configmap-secret-mounts.md b/docs/compute/development/rfcs/configmap-secret-mounts.md deleted file mode 100644 index 8f1fe652..00000000 --- a/docs/compute/development/rfcs/configmap-secret-mounts.md +++ /dev/null @@ -1,300 +0,0 @@ ---- -status: proposed ---- - -# Mounting ConfigMaps and Secrets into Compute Instances (Unikraft Provider) - -> Drafted 2026-05-30, revised 2026-05-31. **This is the foundational referenced-data delivery design for compute, and it ships before [Image Pull Credentials](./image-pull-credentials.md)** — it introduces the resolver, companion delivery, and the scheduling gate; pull secrets become a later consumer of the same path. - -## Table of Contents - -- [Summary](#summary) -- [What this enables for users](#what-this-enables-for-users) -- [End-to-end flow](#end-to-end-flow) -- [The gap: cross-plane delivery](#the-gap-cross-plane-delivery) -- [Design](#design) - - [The referenced-data resolver](#the-referenced-data-resolver) - - [Consumption on the provider](#consumption-on-the-provider) - - [Scheduling gate](#scheduling-gate) - - [Rotation and restart](#rotation-and-restart) -- [Platform direction](#platform-direction) -- [Security](#security) -- [Alternatives](#alternatives) -- [Failure modes](#failure-modes) -- [What gets built](#what-gets-built) -- [Decisions](#decisions) -- [Open questions](#open-questions) - ---- - -## Summary - -A compute `Workload` can already *describe* config and secret mounts: a volume -sourced from a ConfigMap or Secret, a container attachment with a mount path, and -environment variables that reference a key. 
The runtimes can already *consume* them — -the Unikraft runtime runs instances from Pod specs through its kubelet integration, -which honors ConfigMap/Secret references as both environment variables and volume -mounts, and the GCP provider mounts them as files too. So the API is real and the -runtimes support it. - -The one thing missing is in the middle: **the referenced data never reaches the cell -where the instance runs.** It lives in the user's project; the instance runs on an -edge cell; federation propagates only the `WorkloadDeployment`. The instance comes up -referencing data that isn't there. - -This RFC closes that gap. It keeps the user contract unchanged (create a -ConfigMap/Secret, reference it by name), resolves the reference in the trusted -management plane, and delivers the data to the edge as a derived companion object — -secret bytes **never enter the Workload or Instance spec**. Both environment -variables and file mounts work, because once the data is present the runtime's -existing Pod-spec consumption handles the rest. - -## What this enables for users - -Today users can only set literal environment variables, so configuration and -credentials get baked into images or pasted in as plaintext. After this: - -- A user creates a `ConfigMap` and `Secret` in their project and references them - from the Workload; the platform delivers that data to every instance in every POP - cell, without the user ever knowing federation exists. -- **Both forms work** — keys surfaced as environment variables (the twelve-factor - case) and ConfigMaps/Secrets mounted as files at a path (config files, - certificates). -- **Secrets stay secret** — values never appear in the Workload or Instance the user - sees; they travel only as Secret objects. - -## End-to-end flow - -The decided design: management-plane resolution → companion object → federation → -cell → provider. 
The `WorkloadReconciler`, `ReferencedDataController`, and -`Federator` run in the management plane; the edge cell and the compute provider run -on the POP cell. - -```mermaid -sequenceDiagram - actor User - participant P as Project plane - participant WC as WorkloadReconciler - participant RDC as ReferencedDataController - participant F as Federator - participant K as Karmada hub - participant C as Edge cell - participant PR as Compute provider - participant U as kraftlet / UKC - - User->>P: 1. Create ConfigMap + Secret - User->>P: 2. Create Workload referencing them - Note over P: Admission check —
author may read the referenced objects - WC->>P: 3. Create a WorkloadDeployment per placement - RDC->>P: 4. Read referenced ConfigMap/Secret (scoped, trusted) - RDC->>K: 5. Materialize a companion copy in the project's federation namespace - RDC->>P: 6. Record the expected companion set on the WorkloadDeployment - F->>K: 7. Replicate the WorkloadDeployment + routing policy - K->>C: 8. Propagate the deployment + companions to each matching cell - C->>C: 9. Create the Instance, held by a referenced-data gate - C->>C: 10. Companions present? clear the gate, mark data ready - PR->>C: 11. Translate the Instance into a Pod spec referencing the companions - Note over PR,U: Kubelet integration mounts the volumes and
injects the env vars natively from the present data - PR->>U: 12. Launch the instance with config/secret applied - U-->>User: 13. Instance running with config/secret applied -``` - -## The gap: cross-plane delivery - -The referenced ConfigMap/Secret lives in the user's project namespace; the instance -runs on an edge cell, possibly thousands of miles away. The federation channel -carries only the `WorkloadDeployment`, so the data has no path to the cell. There is -also no gate guarding this — the instance isn't even held back; it simply launches -with the reference unresolved. - -Consumption is *not* a gap: the Unikraft runtime's kubelet integration already -resolves ConfigMap/Secret references in a Pod spec — env vars and volume mounts -alike — provided the referenced objects are present where it resolves them. So the -whole problem is getting the data to the cell, and faithfully carrying the -references through into the Pod spec the runtime consumes. - -## Design - -### The referenced-data resolver - -A new management-plane controller is the heart of delivery. For each -WorkloadDeployment it: - -1. **Collects** every ConfigMap/Secret the template references — environment - references and volume sources today; image pull secrets later. -2. **Reads** them with a scoped, trusted project-plane identity. The management plane - already has legitimate project access, so broad project-secret read never leaves - it. -3. **Materializes** one labeled companion per referenced object in the project's - federation namespace. -4. **Records** the expected companion set on the WorkloadDeployment, so the cell - knows exactly what to wait for rather than guessing. -5. **Routes** companions to cells by extending the existing federation routing policy - to carry the labeled companions alongside the deployment. - -One companion exists per referenced object and is replicated to each placed cell — a -single object to create, update, and delete. 
When several deployments reference the -same object the companion is shared and reference-counted, removed only when the last -reference goes away. In single-cluster mode the same resolver runs and the companion -is simply a local copy. - -### Consumption on the provider - -Once the companions are present on the cell, the provider translates the Instance -into the Pod spec the runtime consumes — carrying the volume sources, volume mounts, -and environment references through faithfully and pointing them at the delivered -companions, with the referenced data present in the namespace the runtime resolves -from. The kubelet integration then mounts the volumes and injects the environment -variables natively; there is no provider-side inlining of secret values. (The -provider does not do this faithful translation today — it drops volumes and copies -only literal env values — so this is the provider-side work this RFC covers. The GCP -provider performs the equivalent translation already, which is why the same Workload -runs on either substrate.) - -### Scheduling gate - -An instance that references any ConfigMap/Secret is held by a **referenced-data -scheduling gate**, alongside the existing network and quota gates. The cell clears it -once exactly the expected companion set is present, and surfaces a -`ReferencedDataReady` status with clear reasons — resolving, awaiting propagation, -source not found, source unauthorized, source too large, or ready — backed by events -and metrics so a held instance is diagnosable, not a silent hang. The compute -provider must respect scheduling gates so an instance is never launched with its data -missing; this RFC adds that behavior. - -### Rotation and restart - -Decided: **no automatic roll; an explicit restart instead.** When a source changes, -the resolver re-reads it and refreshes the companion, so the latest values are staged -at the edge for the next instance launch. 
Running instances are not rolled -automatically — a fleet-wide restart on every edit is surprising, and a running -instance's environment isn't mutated in place regardless. - -Compute already performs ordered, in-place rolling updates when a Workload's template -changes. The restart reuses that: a conventional restart annotation on the template -rolls the instances, which pick up the refreshed values — no new machinery. An -opt-in automatic roll on content change is a possible future addition, not part of -this RFC. - -## Platform direction - -The delivery half of this design — follow references, read them in the trusted -plane, materialize derived companions, route them to the cells where the resource is -placed, and signal readiness — is **not specific to compute**. It's a recurring -platform need: image pull credentials want the same thing next, and the network -operator already propagates derived Secrets/ConfigMaps to cells by label today. The -building blocks are already platform-level — the shared namespace-mapping and -downstream-delivery library, the label-based propagation pattern, and the -established policy-driven capabilities (quota, activity, insights) that a delivery -policy would sit naturally beside. - -We deliberately **do not** build that generic capability now. With a single consumer -in hand the abstraction's seams aren't yet known, and a cross-cutting platform -capability would slow the first ship and widen the security review. Instead, this RFC -builds toward it on purpose: - -- **Build the resolver in compute now, behind a narrow, capability-shaped - interface** — in: a subject, its set of referenced objects, and its placement - targets; out: companions delivered plus a readiness signal. It reuses the existing - platform delivery library rather than inventing its own placement and cleanup. 
-- **Keep delivery cleanly separable from consumption.** The scheduling gate and the - translation into the runtime's Pod spec stay in compute and depend only on the - readiness signal, so the delivery component carries no compute-specific knowledge. -- **Promote on the second consumer.** When a second user of this pattern appears - (image pull credentials, or another service), lift the delivery component into the - platform as a capability — most likely an admin-authored delivery policy that - declares, per resource kind, which references to follow and where to deliver them, - fitting the existing capability-policy pattern. Two real consumers is when the - abstraction can be shaped correctly. - -This keeps compute shippable and autonomous today while making the design a -deliberate step toward a shared capability, not a one-off to untangle later. A -governance benefit falls out: when the policy lands, *what may be propagated, and -where* becomes an inspectable, access-controlled object rather than logic buried in a -controller. - -## Security - -- **Bytes never in user-visible specs.** The Workload and the Instance the user sees - carry references only. Values exist as Secret objects in the project's federation - namespace and on the cell where the runtime mounts them — never in anything - projected back to the user. -- **Companion Secrets stay Secrets** end to end; ConfigMap companions carry only - non-secret config. The runtime mounts the companions directly, so the provider - never has to inline secret values itself. -- **Authorization.** Admission verifies the submitting user can read each referenced - object — the same check already used for referenced Networks. A user cannot pull in - an object they couldn't read themselves; the resolver's system identity is never - the authority. 
-- **Trust boundary at the edge.** Resolving in the management plane is deliberate, so - the shared, lower-trust edge never holds a credential that can read project - ConfigMaps/Secrets. Companions are isolated per project namespace on each cell. -- **At rest.** Companions live in storage on the project plane, the hub, and each - cell; this presumes encryption at rest on every plane. - -## Alternatives - -- **Let the provider read the originals from the edge (no companions).** The leanest - option — it removes the resolver, companions, routing changes, and the data gate, - and it is how the GCP provider already works. **Rejected for secret bytes:** it - requires the shared edge to hold a credential that reads project ConfigMaps and - Secrets, exactly the trust boundary this design keeps in the management plane. (A - config-only hybrid was considered and rejected to avoid maintaining two delivery - paths.) -- **Inline resolved values into the Instance.** Rejected — leaks secret bytes into - storage everywhere and into what the user sees. -- **Propagate the user's original objects directly.** Rejected — couples cell - contents to arbitrary project objects and loses the scoping boundary. -- **A separate controller for pull secrets.** Rejected — same machinery; pull secrets - become a thin consumer of this resolver instead. - -## Failure modes - -- **Source missing, unauthorized, or too large** → gate held, status names the - offending object; optional sources are skipped. -- **Companion not yet on the cell** → gate held (awaiting propagation); a normal - transient state during placement. -- **Source changed, instances not rolled** → stale by design until restarted; - last-synced state is surfaced so it's observable. -- **Single-cluster mode** → the local-copy path must be exercised so the absence of - federation never silently disables delivery. 
- -## What gets built - -- A **referenced-data resolver** in the management plane: collect, read, materialize, - reference-count, and clean up companions. -- A **scoped project-plane read identity** for the resolver (built here; reused later - by image pull credentials). -- **Federation routing** extended to carry companions to the same cells as the - deployment. -- A **referenced-data scheduling gate**, cell-side clearing, and the - `ReferencedDataReady` status with reasons, events, and metrics. -- **API additions**: a bulk "import all keys" env form, and completing volume - validation (secret volumes, key→path selection, file mode). -- **Provider changes**: respect scheduling gates, and faithfully translate the - Instance's volumes, mounts, and env references into the Pod spec the runtime - consumes. -- A **restart** path (a conventional template annotation) so a rotated source can be - picked up on demand. - -## Decisions - -- **Delivery:** management-plane companions (not edge-read). -- **Rotation:** no auto-roll; explicit restart. -- **Gate contract:** an explicit expected-companion set recorded on the deployment, - not guessed. -- **One resolver, not two:** pull secrets are a later consumer. -- **Platform direction:** build delivery behind a capability-shaped seam in compute - now; promote it to a platform-owned, policy-driven capability when a second - consumer appears — not before. -- **Sequencing:** ships before image pull credentials; owns the scoped read identity - and provider gate-honoring. - -## Open questions - -1. **Scoped-read granularity:** can the resolver's project read be scoped to specific - object types or labels, or is it broad config/secret read? -2. **Companion size limits** and behavior when exceeded. -3. **Bulk env import in v1**, or per-key references only for the first release? -4. **VM runtime** consumption — out of scope for Unikraft (sandbox-only); confirm - deferral. 
diff --git a/docs/enhancements/datumctl-compute-dx.md b/docs/enhancements/datumctl-compute-dx.md deleted file mode 100644 index f4b627f7..00000000 --- a/docs/enhancements/datumctl-compute-dx.md +++ /dev/null @@ -1,330 +0,0 @@ -# `datumctl compute` — Developer Experience - -**Status:** Draft - ---- - -## Summary - -This document proposes a `compute` subcommand group in `datumctl` designed around the workflows developers actually perform: deploying a workload, watching it roll out across cities, understanding why something isn't running, and inspecting instances when something goes wrong. - -The goal is to close the gap between "I have a container image" and "my workload is healthy across multiple locations" without requiring developers to understand the platform's internal resource model or write YAML to do common things. - ---- - -## The problem today - -Running a workload on Datum Cloud today requires a developer to: - -1. Write a YAML manifest with the correct `apiVersion`, `kind`, and nested spec structure. -2. Apply it with `datumctl apply -f` and wait with no visibility into what's happening. -3. Run `datumctl get workloads` to check status, and then manually interpret raw condition fields. -4. Look up individual instance names to get logs. - -Each of these steps has friction that compounds. A developer who hits a quota block on their first deploy gets a raw API condition with no explanation and no next step. Someone who wants to tail logs from their app across two cities has to discover instance names, then run multiple commands. - -This experience works. It doesn't feel like a product yet. - ---- - -## Who this is for - -The primary audience is a **backend developer** deploying a containerized service to Datum Cloud for the first time or as part of their daily workflow. They are comfortable with the terminal. They may have used Heroku, Railway, Fly.io, or GCP before. 
They should not need to know anything about how the platform's internal resource model works to deploy and operate their application. - -The secondary audience is a **platform operator** or **DevOps engineer** who needs scripting-friendly access to the full resource hierarchy for automation and debugging. - ---- - -## Workflows - -The design centers on five workflows, ordered by frequency. - -### 1. Deploy a workload - -The developer has a container image. They want it running in one or more cities. - -The fastest path requires no YAML: - -``` -$ datumctl compute deploy api \ - --image=ghcr.io/acme/api:1.4.2 \ - --instance-type=d1-standard-2 \ - --city=DFW,IAD \ - --min=2 \ - --port=8080 - -Resolving workload "api" in project acme-prod... - Workload does not exist — creating. - Placement "default": cities=[DFW, IAD], min=2 - -Applying... - workload/api created - -Waiting for rollout. Ctrl-C to detach (rollout continues in background). - - PLACEMENT CITY DESIRED READY PHASE - default DFW 2 0 Starting - default IAD 2 0 Starting - default DFW 2 2 Running - default IAD 2 2 Running - -Rollout complete in 47s. - - Instances: - DFW api-dfw-0 203.0.113.10 - api-dfw-1 203.0.113.11 - IAD api-iad-0 198.51.100.20 - api-iad-1 198.51.100.21 - -Saved workload config to ./workload.yaml — commit this file to manage deployments declaratively. -``` - -If a developer prefers an interactive walk-through: - -``` -$ datumctl compute deploy -? Workload name: api -? Container image: ghcr.io/acme/api:1.4.2 -? Instance type [d1-standard-2]: -? Cities (comma-separated) [DFW]: DFW,IAD -? Min replicas per city [1]: 2 -? Expose port (optional): 8080 - - workload: api - image: ghcr.io/acme/api:1.4.2 - instance type: d1-standard-2 - cities: DFW, IAD - replicas: min=2 - ports: 8080/tcp - -Proceed? (Y/n) -``` - -For teams managing workloads declaratively, `deploy` also accepts a manifest file. 
It shows a human-readable diff before applying, rather than applying silently: - -``` -$ datumctl compute deploy -f workload.yaml - -Changes to workload "api": - image: ghcr.io/acme/api:1.4.1 → ghcr.io/acme/api:1.4.2 - min replicas (default/DFW): 2 → 3 - -Apply? (Y/n) y -workload/api updated -``` - -All three paths — flags, interactive, manifest — converge on the same underlying representation. A developer can start with flags and graduate to a manifest when they need multi-placement topology, custom networking, or volume configuration. - -For automated pipelines, pass `-y` to skip the confirmation prompt. The CLI also suppresses the prompt automatically when stdin is not a terminal. - -### 2. Check workload health - -The developer wants to know if their workload is healthy and how many instances are running across each city. - -``` -$ datumctl compute status api - -Workload api project: acme-prod -Image ghcr.io/acme/api:1.4.2 -Updated 47s ago Revision #7 - -Health Available — all placements at desired replicas - - CITY READY DESIRED TYPE - default DFW 2/2 2 d1-standard-2 - IAD 2/2 2 d1-standard-2 -``` - -When something is wrong, the status view explains it in plain terms and tells the developer what to do next: - -``` -$ datumctl compute status api - -Workload api project: acme-prod -Image ghcr.io/acme/api:1.4.3 -Updated 1m ago Revision #8 - -Health Degraded — 2 instances blocked in IAD - - CITY READY DESIRED TYPE - default DFW 2/2 2 d1-standard-2 - IAD 2/4 4 d1-standard-2 [degraded] - - IAD: 2 instances could not start — quota exceeded - Requested 4 CPU. 2 CPU available in IAD. - - Next steps: - Reduce replicas: datumctl compute scale api --min=2 - Check quota: datumctl compute quota - View instances: datumctl compute instances --workload=api -``` - -The developer never sees raw condition names or internal state reasons. If they need that level of detail for debugging or scripting, `datumctl compute workloads describe api` exposes it. - -### 3. 
Watch a rollout - -When a developer updates their workload (new image, changed replica count, config change), they can watch the rollout progress city by city: - -``` -$ datumctl compute rollout api - -Rolling workload "api" rev #7 → #8 - - PLACEMENT CITY UPDATED READY OLD PHASE - default DFW 0 2 2 Pending - default IAD 0 2 2 Pending - default DFW 1 1 1 Updating - default DFW 2 2 0 Done - default IAD 1 1 1 Updating - default IAD 2 2 0 Done - -Rollout complete in 1m 12s. -``` - -If the rollout stalls because of a resource or scheduling issue, the output pauses on the affected row and gives an explanation: - -``` - default IAD 1 1 1 Blocked - - 2 instances waiting: quota exceeded in IAD - The rollout will resume when quota becomes available. - Ctrl-C to detach — the rollout continues in the background. -``` - -`Ctrl-C` always detaches from the watch. It never cancels the rollout itself. - -Rollout history is accessible at any time: - -``` -$ datumctl compute rollout history api - - REV WHEN IMAGE CHANGES BY STATUS - #8 2m ago ghcr.io/acme/api:1.4.3 image updated alice@acme.io active - #7 3h ago ghcr.io/acme/api:1.4.2 min replicas 2 → 3 ci-deploy — - #6 yesterday ghcr.io/acme/api:1.4.2 LOG_LEVEL info → warn bob@acme.io — -``` - -To roll back to a previous revision: - -``` -$ datumctl compute rollout undo api --to-revision=7 -Creating revision #9 (copy of #7)... -Rollout started. Run `datumctl compute rollout api` to watch progress. -``` - -Undo creates a new revision rather than rewriting history — the audit trail stays append-only. The platform retains the 20 most recent revisions per workload; revisions beyond that are no longer available for undo. - -### 4. Get logs - -`datumctl compute logs` treats the workload as the target, not the individual instance. By default it returns logs across all instances and prefixes each line with the city and instance short name: - -``` -$ datumctl compute logs api --follow - -Tailing logs for workload "api" in DFW, IAD. 
Ctrl-C to stop. - -[DFW/api-dfw-0] 10:14:02 GET /healthz 200 3ms -[IAD/api-iad-1] 10:14:02 GET /v1/users 200 18ms -[DFW/api-dfw-1] 10:14:03 POST /v1/login 401 4ms -[IAD/api-iad-0] 10:14:03 GET /healthz 200 2ms -``` - -Common filters reduce the output without requiring instance name lookup: - -``` -$ datumctl compute logs api --city=IAD --follow -$ datumctl compute logs api --since=15m -$ datumctl compute logs api -c worker --follow -``` - -All filters translate to label selectors against the platform's telemetry system. There is no per-city fan-out — the CLI queries a single endpoint and the label index handles scoping. - -### 5. Inspect and debug instances - -When something is wrong with a specific instance, `datumctl compute instances` gives a per-instance view across the whole project: - -``` -$ datumctl compute instances - - NAME WORKLOAD CITY EXTERNAL IP INTERNAL IP TYPE AGE STATUS - api-dfw-0 api DFW 203.0.113.10 10.4.1.5 d1-standard-2 2d Running - api-dfw-1 api DFW 203.0.113.11 10.4.1.6 d1-standard-2 2d Running - api-iad-0 api IAD 198.51.100.20 10.5.1.7 d1-standard-2 2d Running - api-iad-1 api IAD 198.51.100.21 10.5.1.8 d1-standard-2 2d Running - worker-dfw-0 worker DFW 203.0.113.30 10.4.1.9 d1-standard-4 6h Running - -5 instances — 5 Running, 0 Pending, 0 Failed -``` - -Pass a workload name to narrow the view: - -``` -$ datumctl compute instances --workload=api -``` - -Instances that haven't started show why, inline: - -``` - api-iad-2 api IAD — — d1-standard-2 30s Pending (quota exceeded) - api-iad-3 api IAD — — d1-standard-2 30s Pending (network provisioning) -``` - -Drilling into a single instance gives the full picture with actionable context: - -``` -$ datumctl compute instances describe api-iad-2 - -Instance api-iad-2 -Workload api / default / IAD -Type d1-standard-2 -Age 1m 12s - -Status Not running — quota exceeded - Requested 4 CPU. 2 CPU available in IAD. 
- -Runtime - Image: ghcr.io/acme/api:1.4.3 - Env: DATABASE_URL (from secret), LOG_LEVEL=info - Ports: 8080/tcp - -Network Waiting for addresses (not yet scheduled) - -Next steps - datumctl compute scale api --min=2 - datumctl compute quota -``` - ---- - -## Command reference - -### Short-form commands (the everyday interface) - -``` -datumctl compute deploy Deploy or update a workload -datumctl compute status Show health across all cities -datumctl compute instances List all instances (--workload, --city to filter) -datumctl compute logs Stream logs (--workload, --city, --instance, -c/--container) -datumctl compute rollout Watch a rollout in progress -datumctl compute rollout history List recent revisions -datumctl compute rollout undo Roll back to a previous revision -datumctl compute scale Adjust replica counts -datumctl compute restart Restart instances (rolling) -datumctl compute destroy Delete a workload -datumctl compute quota Show project quota usage -``` - -### Resource commands (for scripting and advanced use) - -``` -datumctl compute workloads [get | describe | delete | edit] -datumctl compute workloads rollout [status | history | undo] -datumctl compute workloads set image NAME CONTAINER=IMAGE - -datumctl compute instances [get | describe | logs] - -datumctl compute cities [list | describe] -datumctl compute instance-types [list | describe] -datumctl compute quota [--breakdown | --constrained | --city=CITY] -``` - diff --git a/go.mod b/go.mod index 19fc0103..48bab65b 100644 --- a/go.mod +++ b/go.mod @@ -1,31 +1,34 @@ module go.datum.net/compute -go 1.24.0 - -toolchain go1.24.2 +go 1.25.0 require ( + github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 - github.com/onsi/ginkgo/v2 v2.23.4 - github.com/onsi/gomega v1.37.0 + github.com/karmada-io/api v1.15.0 + github.com/onsi/ginkgo/v2 v2.27.2 + github.com/onsi/gomega v1.38.2 + github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 - go.datum.net/network-services-operator 
v0.1.0 - go.miloapis.com/milo v0.24.11 - golang.org/x/crypto v0.39.0 - golang.org/x/sync v0.16.0 + go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 + go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 + golang.org/x/crypto v0.45.0 + golang.org/x/sync v0.18.0 google.golang.org/protobuf v1.36.11 - k8s.io/api v0.33.1 - k8s.io/apimachinery v0.33.2 - k8s.io/client-go v0.33.1 - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 - sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/gateway-api v1.2.1 - sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 + k8s.io/api v0.35.0 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 + k8s.io/component-base v0.35.0 + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 + sigs.k8s.io/controller-runtime v0.23.3 + sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c + sigs.k8s.io/multicluster-runtime v0.23.3 ) require ( - cel.dev/expr v0.19.1 // indirect - github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + cel.dev/expr v0.24.0 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/antlr4-go/antlr/v4 v4.13.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect @@ -35,74 +38,70 @@ require ( github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect - github.com/fxamacker/cbor/v2 v2.8.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.1 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect - github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - 
github.com/google/cel-go v0.23.2 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/cel-go v0.26.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.64.0 // indirect - github.com/prometheus/procfs v0.16.1 // indirect - github.com/spf13/cobra v1.9.1 // indirect - github.com/spf13/pflag v1.0.7 // indirect - github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect + github.com/spf13/cobra v1.10.0 // indirect + github.com/spf13/pflag v1.0.9 // indirect + github.com/stoewer/go-strcase v1.3.1 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect - 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect - go.opentelemetry.io/proto/otlp v1.4.0 // indirect - go.uber.org/automaxprocs v1.6.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect + go.opentelemetry.io/otel v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.37.0 // indirect + go.opentelemetry.io/otel/sdk v1.37.0 // indirect + go.opentelemetry.io/otel/trace v1.37.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/net v0.41.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.12.0 // indirect - golang.org/x/tools v0.33.0 // indirect + golang.org/x/tools v0.38.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/grpc v1.71.1 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // 
indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/grpc v1.74.2 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect - k8s.io/apiserver v0.33.1 // indirect - k8s.io/component-base v0.33.1 // indirect + k8s.io/apiextensions-apiserver v0.35.0 // indirect + k8s.io/apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index c472bd8b..42a98554 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,9 @@ -cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= -cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= -github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= +cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= 
+github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= +github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -17,16 +19,22 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= -github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= +github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fxamacker/cbor/v2 v2.9.0 
h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -42,17 +50,16 @@ github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZ github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= 
-github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= -github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/cel-go v0.26.0 h1:DPGjXackMpJWH680oGY4lZhYjIameYmR+/6RBdDGmaI= +github.com/google/cel-go v0.26.0/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -62,18 +69,18 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+uBhcekkmy4IkffJww= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod 
h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/karmada-io/api v1.15.0 h1:6Dx+Q36LaoPqKM4gduUuhSBQ3eKjKusjkvmggLpt9xs= +github.com/karmada-io/api v1.15.0/go.mod h1:wNbBEmXYkrRLSC2VgmXizIG12FW+/sAUF7UIz5WlYAU= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -84,42 +91,43 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= 
+github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= -github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4= -github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/rogpeppe/go-internal v1.14.1 
h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= -github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= -github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= +github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0= +github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE= +github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.3.1 h1:iS0MdW+kVTxgMoE1LAZyMiYJFKlOzLooE4MxjirtkAs= +github.com/stoewer/go-strcase v1.3.1/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -129,160 +137,125 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.datum.net/network-services-operator v0.1.0 h1:PAXOZ5DdJFgRoeVBPIXhqkCm6DxbP4tVOPcr3Y7h/So= -go.datum.net/network-services-operator v0.1.0/go.mod h1:uloVfxqE+8DgSiMB651X8UC9yECpXbwp/NBstofCceE= -go.miloapis.com/milo v0.1.0 h1:AYFVz1lfta/NbWSFSSKPtnkCA2rN+iegxlfQrDgEvYY= -go.miloapis.com/milo v0.1.0/go.mod h1:X+DpWOchv/Vm63mwHnboW00KRGsODY2bUTS/bBbK1+E= -go.miloapis.com/milo v0.24.11 h1:rByXDKbP4ZEN0I/z1C2RyUCyQi0NWrITLqoQILSAn2E= -go.miloapis.com/milo v0.24.11/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= 
+go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 h1:P3dePA6cCXKimZzE6d7Xxpj2rz54BxOHI8K8ic7VQ+c= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359/go.mod h1:Nr0PsCodkTW31vWVxR9dhAP9w0y+WHUYeyrcRnchcIE= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 h1:LSHyqLt/jus6iEMvo8pc731L+PyrTHP2bqfMMtHPSWc= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42/go.mod h1:p9O2kk194mvoL8rhqjwb+LWB+GIyY4vQqiTowwibVWo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 
h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= -go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= -go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 h1:Ahq7pZmv87yiyn3jeFz/LekZmPLLdKejuO3NcK9MssM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0/go.mod h1:MJTqhM0im3mRLw1i8uGHnCvUEeS7VwRyxlLC78PA18M= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 h1:m639+BofXTvcY1q8CGs4ItwQarYtJPOWmVobfM1HpVI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0/go.mod h1:LjReUci/F4BUyv+y4dwnq3h/26iNOeC3wAIqgvTIZVo= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 
h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= 
-golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod 
h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= -golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= 
-google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= -google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 h1:0UOBWO4dC+e51ui0NFKSPbkHHiQ4TmrEfEZMLDyRmY8= +google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0/go.mod h1:8ytArBbtOy2xfht+y2fqKd5DRDJRUQhqbyEnQ4bDChs= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 h1:MAKi5q709QWfnkkpNQ0M12hYJ1+e8qYVDyowc4U1XZM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.74.2 h1:WoosgB65DlWVC9FqI82dGsZhWFNBSLjQ84bjROOpMu4= +google.golang.org/grpc v1.74.2/go.mod h1:CtQ+BGjaAIXHs/5YS3i473GqwBBa1zGQNevxdeBEXrM= google.golang.org/protobuf 
v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= -k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= -k8s.io/apiextensions-apiserver v0.33.1 h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI= -k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA= -k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= -k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.1 h1:yLgLUPDVC6tHbNcw5uE9mo1T6ELhJj7B0geifra3Qdo= -k8s.io/apiserver v0.33.1/go.mod h1:VMbE4ArWYLO01omz+k8hFjAdYfc3GVAYPrhP2tTKccs= -k8s.io/client-go v0.33.1 h1:ZZV/Ks2g92cyxWkRRnfUDsnhNn28eFpt26aGc8KbXF4= -k8s.io/client-go v0.33.1/go.mod 
h1:JAsUrl1ArO7uRVFWfcj6kOomSlCv+JpvIsp6usAGefA= -k8s.io/component-base v0.33.1 h1:EoJ0xA+wr77T+G8p6T3l4efT2oNwbqBVKR71E0tBIaI= -k8s.io/component-base v0.33.1/go.mod h1:guT/w/6piyPfTgq7gfvgetyXMIh10zuXA6cRRm3rDuY= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= +k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/apiserver v0.35.0 h1:CUGo5o+7hW9GcAEF3x3usT3fX4f9r8xmgQeCBDaOgX4= +k8s.io/apiserver v0.35.0/go.mod h1:QUy1U4+PrzbJaM3XGu2tQ7U9A4udRRo5cyxkFX0GEds= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= +k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= +k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils 
v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= -sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= -sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM= -sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 h1:Pq69tTKfN8ADw8m8A3wUtP8wJ9SPQbbOsgapm3BZEPw= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8/go.mod h1:CpBzLMLQKdm+UCchd2FiGPiDdCxM5dgCCPKuaQ6Fsv0= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= +sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c h1:GS4VnGRV90GEUjrgQ2GT5ii6yzWj3KtgUg+sVMdhs5c= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/multicluster-runtime v0.23.3 
h1:vrzlXRzHTDsjspUAfoW2rCtr0agoI4q20p9x4Fz4png= +sigs.k8s.io/multicluster-runtime v0.23.3/go.mod h1:r/UA4GHgFoXCcR4tcvlZz7SiLx3l1kJKDuBAhILNIHs= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= From c3351b0a4d1741c7edbf723a9689b44c5a464d69 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:07:08 -0500 Subject: [PATCH 02/11] refactor(controller): remove the central WorkloadDeployment scheduler Delete the central scheduler that placed WorkloadDeployments from a single control plane. Placement now happens through the distributed federator and per-cell controllers introduced in the following commits. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../workloaddeployment_scheduler.go | 153 ------------------ 1 file changed, 153 deletions(-) delete mode 100644 internal/controller/workloaddeployment_scheduler.go diff --git a/internal/controller/workloaddeployment_scheduler.go b/internal/controller/workloaddeployment_scheduler.go deleted file mode 100644 index 041b0d64..00000000 --- a/internal/controller/workloaddeployment_scheduler.go +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" -) - -// WorkloadDeploymentScheduler schedules a WorkloadDeployment -type WorkloadDeploymentScheduler struct { - mgr mcmanager.Manager -} - -func (r *WorkloadDeploymentScheduler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - - ctx = mccontext.WithCluster(ctx, req.ClusterName) - var deployment computev1alpha.WorkloadDeployment - if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, err - } - - if !deployment.DeletionTimestamp.IsZero() { - 
return ctrl.Result{}, nil - } - - logger.Info("scheduling deployment") - defer logger.Info("scheduling complete") - - // TODO(jreese) improve! - // The first iteration of this scheduler will be very simple and only look for - // the first available location that is viable for the deployment. In the - // future, we could see a more advanced system similar to the Kubernetes - // scheduler itself. - - // Step 1: Get Locations - var locations networkingv1alpha.LocationList - if err := cl.GetClient().List(ctx, &locations); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list locations: %w", err) - } - - if len(locations.Items) == 0 { - // Should only be the case in new environments if workloads are created - // prior to location registration. - - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are registered with the system.", - }) - if changed { - // TODO(jreese) investigate kubevirt / other operators for better tracking - // of updates to the status. I seem to remember a "builder" of sorts that - // looked rather nice. 
- if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil - } - - // TODO(jreese) define standard Topology keys somewhere - - var selectedLocation *networkingv1alpha.Location - for _, location := range locations.Items { - cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] - if ok && cityCode == deployment.Spec.CityCode { - selectedLocation = &location - break - } - } - - if selectedLocation == nil { - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoCandidateLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are candidates for this deployment.", - }) - if changed { - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - } else { - deployment.Status.Location = &networkingv1alpha.LocationReference{ - Name: selectedLocation.Name, - Namespace: selectedLocation.Namespace, - } - - // TODO(jreese) make sure we don't run into update conflicts with the update - // of the spec then status here. Just can't remember if it's an issue. - - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "LocationAssigned", - ObservedGeneration: deployment.Generation, - Message: "Deployment has been assigned a location.", - }) - - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. 
-func (r *WorkloadDeploymentScheduler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - return mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithPredicates( - predicate.NewPredicateFuncs(func(object client.Object) bool { - // Don't process deployments that have been scheduled - o := object.(*computev1alpha.WorkloadDeployment) - return o.Status.Location == nil - }), - )). - Named("workload-deployment-scheduler"). - Complete(r) -} From 701867455535b491fe4ce1565cc1b29e216d0b75 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:07:24 -0500 Subject: [PATCH 03/11] feat(controller): add the WorkloadDeployment federator Introduce the federator that fans a WorkloadDeployment out to the cells selected for its placement, replacing the central scheduler. Add the city-code field indexer it uses to map subnet/location events back to the deployments that depend on them. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/indexers.go | 24 +- .../workloaddeployment_federator.go | 405 ++++++++++++++++++ .../workloaddeployment_federator_test.go | 398 +++++++++++++++++ 3 files changed, 814 insertions(+), 13 deletions(-) create mode 100644 internal/controller/workloaddeployment_federator.go create mode 100644 internal/controller/workloaddeployment_federator_test.go diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index fb0ebe88..7d9e1ae1 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -15,7 +15,10 @@ import ( const ( deploymentWorkloadUIDIndex = "deploymentWorkloadUIDIndex" workloadNetworksIndex = "workloadNetworksIndex" - deploymentLocationIndex = "deploymentLocationIndex" + // deploymentCityCodeIndex indexes WorkloadDeployments by their Spec.CityCode + // so that SubnetClaim/Subnet watches can efficiently find the deployments + // that target the same city as a changed networking resource. 
+ deploymentCityCodeIndex = "deploymentCityCodeIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { @@ -30,9 +33,10 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) } - // Index workload deployments by location - if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { - return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + // Index workload deployments by city code so that SubnetClaim/Subnet watch + // handlers can efficiently find deployments targeting the same city. + if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } return nil @@ -44,18 +48,12 @@ func deploymentWorkloadUIDIndexFunc(o client.Object) []string { } } -func deploymentLocationIndexFunc(o client.Object) []string { +func deploymentCityCodeIndexFunc(o client.Object) []string { deployment := o.(*computev1alpha.WorkloadDeployment) - if deployment.Status.Location == nil { + if deployment.Spec.CityCode == "" { return nil } - - return []string{ - types.NamespacedName{ - Namespace: deployment.Status.Location.Namespace, - Name: deployment.Status.Location.Name, - }.String(), - } + return []string{deployment.Spec.CityCode} } func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go new file mode 100644 index 00000000..9c736cf0 --- /dev/null +++ b/internal/controller/workloaddeployment_federator.go @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package 
controller + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + "sigs.k8s.io/controller-runtime/pkg/log" + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +const ( + // federatorFinalizer is added to project-namespace WorkloadDeployments that + // have been federated to the downstream control plane. It ensures we clean up + // the downstream object and any orphaned PropagationPolicies before the project + // object is permanently deleted. + federatorFinalizer = "compute.datumapis.com/federator" + + // cityCodeLabel is applied to WorkloadDeployments in the downstream namespace + // and is used by PropagationPolicy selectors to route them to the correct + // POP-cell clusters. Downstream Cluster objects are expected to carry this + // label with their city-code value. + cityCodeLabel = "topology.datum.net/city-code" + + // kindWorkloadDeployment is the Kind string for WorkloadDeployment resources. + kindWorkloadDeployment = "WorkloadDeployment" +) + +// WorkloadDeploymentFederator replicates WorkloadDeployments from project +// namespaces into the downstream control plane so it can propagate them to the +// appropriate POP-cell clusters. +// +// For each WorkloadDeployment the controller: +// 1. 
Determines the downstream namespace via the ns-<namespace-UID> +// convention (matching the MappedNamespaceResourceStrategy used by +// go.datum.net/network-services-operator; this logic will migrate to Milo +// once the shared library is promoted). +// 2. Upserts a corresponding WorkloadDeployment in that downstream namespace, +// stamped with label topology.datum.net/city-code=<city-code>. +// 3. Lazily creates a PropagationPolicy per city code per downstream namespace +// that selects WorkloadDeployments by the city-code label and targets +// clusters carrying the same label. The PP is deleted once no deployments +// with that city code remain in the namespace. +// 4. Reads the aggregated status from the downstream control plane and writes +// it back to the project-namespace object. +// 5. On deletion: removes the downstream WorkloadDeployment and cleans up +// unused PropagationPolicies. +type WorkloadDeploymentFederator struct { + mgr mcmanager.Manager + // FederationClient is a client pointed at the Karmada federation control + // plane (the federation hub that the management controllers read and write + // through). The caller (cmd/main.go) constructs it from --federation-kubeconfig. 
+ FederationClient client.Client + finalizers finalizer.Finalizers +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list + +func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { + if r.FederationClient == nil { + return ctrl.Result{}, nil + } + + logger := log.FromContext(ctx) + + cl, err := r.mgr.GetCluster(ctx, req.ClusterName) + if err != nil { + return ctrl.Result{}, err + } + ctx = mccontext.WithCluster(ctx, req.ClusterName) + + var deployment computev1alpha.WorkloadDeployment + if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + finalizationResult, err := r.finalizers.Finalize(ctx, &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &deployment); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + + if !deployment.DeletionTimestamp.IsZero() { + return ctrl.Result{}, nil + } + + logger.Info("federating deployment to downstream control plane") + + // Determine the downstream namespace for this project namespace using the + // ns-<namespace-UID> convention (MappedNamespaceResourceStrategy). + // Using strategy.GetClient() for writes ensures the downstream namespace is + // created with UpstreamOwnerNamespaceLabel so the InstanceProjector can + // resolve the target project namespace without scanning all namespaces. 
+ strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(req.ClusterName), cl.GetClient(), r.FederationClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine downstream namespace: %w", err) + } + + // Ensure the downstream namespace exists and carries the upstream tracking + // labels so the InstanceProjector can resolve the project namespace by label + // lookup instead of scanning all namespaces. + if err := r.ensureDownstreamNamespace(ctx, downstreamNS, deployment.Namespace, string(req.ClusterName)); err != nil { + return ctrl.Result{}, err + } + + // Upsert the WorkloadDeployment in the downstream control plane via the + // strategy client so any future Create calls also go through + // ensureDownstreamNamespace automatically. + if err := r.upsertDownstreamDeployment(ctx, strategy.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + // Lazily create the PropagationPolicy that targets clusters with the matching + // city-code label. + if err := r.ensurePropagationPolicy(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return ctrl.Result{}, err + } + + // Pull aggregated status from the downstream control plane back into the + // project namespace. + if err := r.syncStatusFromDownstream(ctx, cl.GetClient(), &deployment, downstreamNS); err != nil { + return ctrl.Result{}, err + } + + logger.Info("federation complete") + return ctrl.Result{}, nil +} + +// Finalize removes the downstream WorkloadDeployment and, if no other +// deployments with the same city code remain in the downstream namespace, deletes +// the PropagationPolicy as well. 
+func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + deployment := obj.(*computev1alpha.WorkloadDeployment) + logger := log.FromContext(ctx).WithValues( + "deployment", deployment.Name, + "namespace", deployment.Namespace, + ) + + clusterName, ok := mccontext.ClusterFrom(ctx) + if !ok { + return finalizer.Result{}, fmt.Errorf("cluster name not found in context") + } + + cl, err := r.mgr.GetCluster(ctx, clusterName) + if err != nil { + return finalizer.Result{}, err + } + + strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(clusterName), cl.GetClient(), r.FederationClient) + downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace) + if err != nil { + return finalizer.Result{}, fmt.Errorf("failed to determine downstream namespace during finalization: %w", err) + } + + // Delete the downstream WorkloadDeployment. + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + if err := r.FederationClient.Delete(ctx, kd); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed to delete downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS) + + // Clean up the PropagationPolicy if no other deployments with the same city + // code remain in this downstream namespace. + if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil { + return finalizer.Result{}, err + } + + return finalizer.Result{}, nil +} + +// ensureDownstreamNamespace creates or updates the downstream namespace, stamping +// it with the upstream tracking labels that MappedNamespaceResourceStrategy uses. 
+// This allows the InstanceProjector to resolve the project namespace name via a +// direct label lookup rather than scanning all namespaces by UID. +func (r *WorkloadDeploymentFederator) ensureDownstreamNamespace(ctx context.Context, name, upstreamNamespace, clusterName string) error { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + _, err := controllerutil.CreateOrUpdate(ctx, r.FederationClient, ns, func() error { + if ns.Labels == nil { + ns.Labels = make(map[string]string) + } + ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] = fmt.Sprintf("cluster-%s", strings.ReplaceAll(clusterName, "/", "_")) + ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = upstreamNamespace + return nil + }) + if err != nil { + return fmt.Errorf("failed to ensure downstream namespace %q: %w", name, err) + } + return nil +} + +// upsertDownstreamDeployment creates or updates the WorkloadDeployment in the +// downstream namespace via the provided client (expected to be strategy.GetClient() +// so the downstream namespace is created with upstream tracking labels). 
+func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment( + ctx context.Context, + downstreamClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + kd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployment.Name, + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, downstreamClient, kd, func() error { + if kd.Labels == nil { + kd.Labels = make(map[string]string) + } + kd.Labels[cityCodeLabel] = deployment.Spec.CityCode + kd.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace + kd.Spec = deployment.Spec + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err) + } + + log.FromContext(ctx).Info("upserted downstream deployment", "result", result, "downstreamNamespace", downstreamNS) + return nil +} + +// ensurePropagationPolicy creates or updates a PropagationPolicy in the downstream +// namespace that selects all WorkloadDeployments with the given city-code label +// and targets clusters carrying the same label. +func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + + result, err := controllerutil.CreateOrPatch(ctx, r.FederationClient, pp, func() error { + pp.Spec = karmadapolicyv1alpha1.PropagationSpec{ + // Select all WorkloadDeployments in this namespace that carry the + // city-code label. Using a label selector (rather than individual + // resource names) means that new deployments for this city are + // automatically picked up without updating the policy. 
+ ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{ + { + APIVersion: computev1alpha.GroupVersion.String(), + Kind: kindWorkloadDeployment, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + Placement: karmadapolicyv1alpha1.Placement{ + // Route to clusters that carry the same city-code label. POP-cell + // clusters registered with the downstream control plane must be + // labeled accordingly. + ClusterAffinity: &karmadapolicyv1alpha1.ClusterAffinity{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + cityCodeLabel: cityCode, + }, + }, + }, + }, + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("upserted PropagationPolicy", "result", result, "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment +// from the downstream namespace and writes it back to the project-namespace +// object. It is a no-op when the downstream object does not yet exist. 
+func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( + ctx context.Context, + projectClient client.Client, + deployment *computev1alpha.WorkloadDeployment, + downstreamNS string, +) error { + var kd computev1alpha.WorkloadDeployment + if err := r.FederationClient.Get(ctx, types.NamespacedName{ + Name: deployment.Name, + Namespace: downstreamNS, + }, &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get downstream deployment for status sync: %w", err) + } + + if equality.Semantic.DeepEqual(deployment.Status, kd.Status) { + return nil + } + + deployment.Status = kd.Status + if err := projectClient.Status().Update(ctx, deployment); err != nil { + return fmt.Errorf("failed to write downstream status back to project deployment: %w", err) + } + return nil +} + +// cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given +// city code if no WorkloadDeployments with that city code remain in the +// downstream namespace. +func (r *WorkloadDeploymentFederator) cleanupPropagationPolicyIfUnused( + ctx context.Context, + downstreamNS string, + cityCode string, +) error { + var remaining computev1alpha.WorkloadDeploymentList + if err := r.FederationClient.List(ctx, &remaining, + client.InNamespace(downstreamNS), + client.MatchingLabels{cityCodeLabel: cityCode}, + ); err != nil { + return fmt.Errorf("failed to list remaining downstream deployments for city %q: %w", cityCode, err) + } + + if len(remaining.Items) > 0 { + // Other deployments still need this PropagationPolicy. 
+ return nil + } + + pp := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: propagationPolicyNameFor(cityCode), + Namespace: downstreamNS, + }, + } + if err := r.FederationClient.Delete(ctx, pp); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed to delete PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err) + } + + log.FromContext(ctx).Info("deleted PropagationPolicy (no more deployments for city)", "cityCode", cityCode, "downstreamNamespace", downstreamNS) + return nil +} + +// SetupWithManager registers the controller with the multicluster manager. +// It must only be called when FederationClient is non-nil. +func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error { + r.mgr = mgr + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(federatorFinalizer, r); err != nil { + return fmt.Errorf("failed to register federator finalizer: %w", err) + } + return mcbuilder.ControllerManagedBy(mgr). + For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). + Named("workload-deployment-federator"). + Complete(r) +} + +// propagationPolicyNameFor returns the PropagationPolicy name for a given city +// code. The name is stable and deterministic so that multiple reconciles of +// different deployments sharing the same city code converge on the same policy. +func propagationPolicyNameFor(cityCode string) string { + // Sanitize the city code to a valid Kubernetes name: lower-case, spaces → hyphens. 
+ sanitized := strings.ToLower(strings.ReplaceAll(cityCode, " ", "-")) + return fmt.Sprintf("city-%s", sanitized) +} diff --git a/internal/controller/workloaddeployment_federator_test.go b/internal/controller/workloaddeployment_federator_test.go new file mode 100644 index 00000000..2bd2169f --- /dev/null +++ b/internal/controller/workloaddeployment_federator_test.go @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + "time" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/finalizer" + mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Shared test constants ──────────────────────────────────────────────────── + +const ( + testCluster = "test-project-cluster" + testProjNS = "my-project" + testProjNSUID = types.UID("aabbccdd-0000-1111-2222-333344445555") + testKarmadaNSStr = "ns-aabbccdd-0000-1111-2222-333344445555" + testWDName = "my-workload-deployment" + testCityCodeLAX = "LAX" +) + +// ─── Test helpers ───────────────────────────────────────────────────────────── + +// testProjectNamespace returns a corev1.Namespace for the project cluster with a +// stable UID that matches testKarmadaNSStr. +func testProjectNamespace() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testProjNS, + UID: testProjNSUID, + }, + } +} + +// testWorkloadDeployment returns a WorkloadDeployment with the given options. 
+func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *computev1alpha.WorkloadDeployment { + wd := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testProjNS, + UID: "wd-uid-1111", + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + WorkloadRef: computev1alpha.WorkloadReference{ + Name: "test-workload", + }, + PlacementName: testDefaultPlacement, + ScaleSettings: computev1alpha.HorizontalScaleSettings{ + MinReplicas: 1, + }, + }, + } + for _, opt := range opts { + opt(wd) + } + return wd +} + +// withFinalizer adds the federator finalizer to the WorkloadDeployment. +func withFinalizer(wd *computev1alpha.WorkloadDeployment) { + wd.Finalizers = append(wd.Finalizers, federatorFinalizer) +} + +// withDeletionTimestamp sets a non-zero DeletionTimestamp on the WorkloadDeployment. +func withDeletionTimestamp(wd *computev1alpha.WorkloadDeployment) { + t := metav1.NewTime(time.Now().Add(-5 * time.Second)) + wd.DeletionTimestamp = &t +} + +// newTestFederator constructs a WorkloadDeploymentFederator wired to the given +// project client (via a fakeMCManager) and downstream client. The federator +// finalizer is pre-registered so reconcile can handle deletions. +func newTestFederator(projectClient client.Client, karmadaClient client.Client) *WorkloadDeploymentFederator { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(testCluster, projectCluster) + + r := &WorkloadDeploymentFederator{ + mgr: mgr, + FederationClient: karmadaClient, + } + + feds := finalizer.NewFinalizers() + if err := feds.Register(federatorFinalizer, r); err != nil { + panic("failed to register test finalizer: " + err.Error()) + } + r.finalizers = feds + return r +} + +// reconcileRequest builds an mcreconcile.Request for the test WorkloadDeployment. 
+func reconcileRequest() mcreconcile.Request { + return mcreconcile.Request{ + ClusterName: testCluster, + Request: ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: testWDName, + Namespace: testProjNS, + }, + }, + } +} + +// ─── Unit tests ─────────────────────────────────────────────────────────────── + +func TestPropagationPolicyNameFor(t *testing.T) { + t.Parallel() + + tests := []struct { + cityCode string + want string + }{ + {"LAX", "city-lax"}, + {"lax", "city-lax"}, + {"New York", "city-new-york"}, + {"LOS ANGELES", "city-los-angeles"}, + {"SEA", "city-sea"}, + } + + for _, tt := range tests { + t.Run(tt.cityCode, func(t *testing.T) { + t.Parallel() + got := propagationPolicyNameFor(tt.cityCode) + assert.Equal(t, tt.want, got) + }) + } +} + +// TestWorkloadDeploymentFederator_NoFederationClient verifies that the reconciler +// is a no-op when FederationClient is nil. +func TestWorkloadDeploymentFederator_NoFederationClient(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment()) + r := newTestFederator(projectClient, nil) + r.FederationClient = nil // explicitly nil + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen verifies that the +// first reconcile of a brand-new WorkloadDeployment adds the finalizer and +// returns without federating (the finalizer update triggers a re-queue). 
+func TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment() // no finalizer yet + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // The project WD should now have the finalizer persisted. + var updated computev1alpha.WorkloadDeployment + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updated)) + assert.Contains(t, updated.Finalizers, federatorFinalizer) + + // Karmada should be untouched – federation happens on the next reconcile. + var wdList computev1alpha.WorkloadDeploymentList + require.NoError(t, karmadaClient.List(context.Background(), &wdList)) + assert.Empty(t, wdList.Items, "no Karmada WD should be created on first-seen reconcile") +} + +// TestWorkloadDeploymentFederator_FederatesToKarmada verifies that a +// WorkloadDeployment with the finalizer already set is fully federated: +// the Karmada namespace, WorkloadDeployment (with city-code label), and +// PropagationPolicy are all created. +func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) { + t.Parallel() + + wd := testWorkloadDeployment(withFinalizer) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // Karmada namespace must exist. 
+ var karmadaNS corev1.Namespace + err = karmadaClient.Get(ctx, types.NamespacedName{Name: testKarmadaNSStr}, &karmadaNS) + require.NoError(t, err, "Karmada namespace %q should exist", testKarmadaNSStr) + + // Karmada WorkloadDeployment must exist with the city-code label. + var karmadaWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &karmadaWD) + require.NoError(t, err, "Karmada WorkloadDeployment should exist") + assert.Equal(t, testCityCodeLAX, karmadaWD.Labels[cityCodeLabel], + "city-code label should be set on Karmada WD") + assert.Equal(t, testCityCodeLAX, karmadaWD.Spec.CityCode, + "spec.cityCode should be copied from project WD") + + // PropagationPolicy for the city code must exist. + ppName := propagationPolicyNameFor(testCityCodeLAX) + var pp karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &pp) + require.NoError(t, err, "PropagationPolicy %q should exist", ppName) + + // The PP must select WorkloadDeployments by the city-code label. + require.Len(t, pp.Spec.ResourceSelectors, 1) + sel := pp.Spec.ResourceSelectors[0] + assert.Equal(t, computev1alpha.GroupVersion.String(), sel.APIVersion) + assert.Equal(t, "WorkloadDeployment", sel.Kind) + require.NotNil(t, sel.LabelSelector) + assert.Equal(t, testCityCodeLAX, sel.LabelSelector.MatchLabels[cityCodeLabel]) + + // The PP cluster affinity must target clusters carrying the same city-code. + require.NotNil(t, pp.Spec.Placement.ClusterAffinity) + require.NotNil(t, pp.Spec.Placement.ClusterAffinity.LabelSelector) + assert.Equal(t, testCityCodeLAX, + pp.Spec.Placement.ClusterAffinity.LabelSelector.MatchLabels[cityCodeLabel]) +} + +// TestWorkloadDeploymentFederator_Finalization covers the deletion scenarios: +// cleanup of Karmada resources and conditional PropagationPolicy removal. 
+func TestWorkloadDeploymentFederator_Finalization(t *testing.T) { + t.Parallel() + + ppName := propagationPolicyNameFor(testCityCodeLAX) + + tests := []struct { + name string + // karmadaExtra holds additional Karmada objects beyond the "own" WD and PP. + karmadaExtra []client.Object + // wantPPGone is true when the PropagationPolicy must no longer exist after the reconcile. + wantPPGone bool + }{ + { + name: "last WD for city — PropagationPolicy removed", + karmadaExtra: nil, + wantPPGone: true, + }, + { + name: "other WD for same city remains — PropagationPolicy kept", + karmadaExtra: []client.Object{ + // A sibling WD in the same Karmada namespace with the same city-code. + &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-deployment", + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: "other", + WorkloadRef: computev1alpha.WorkloadReference{Name: "other"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + }, + }, + wantPPGone: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Project cluster: namespace + WD with finalizer and deletion timestamp. + wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp) + projectClient := newProjectFakeClient(testProjectNamespace(), wd) + + // Karmada cluster: the mirrored WD + its PropagationPolicy + any extras.
+ karmadaWD := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: testWDName, + Namespace: testKarmadaNSStr, + Labels: map[string]string{cityCodeLabel: testCityCodeLAX}, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: testCityCodeLAX, + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "test-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } + karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, + } + karmadaObjs := []client.Object{ + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}}, + karmadaWD, + karmadaPP, + } + karmadaObjs = append(karmadaObjs, tt.karmadaExtra...) + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + ctx := context.Background() + + // The Karmada-side WD must be gone. + var remainingWD computev1alpha.WorkloadDeployment + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: testWDName, + Namespace: testKarmadaNSStr, + }, &remainingWD) + assert.True(t, apierrors.IsNotFound(err), + "Karmada WD %q should be deleted after finalization", testWDName) + + // PropagationPolicy presence depends on whether sibling WDs for the same city remain.
+ var remainingPP karmadapolicyv1alpha1.PropagationPolicy + err = karmadaClient.Get(ctx, types.NamespacedName{ + Name: ppName, + Namespace: testKarmadaNSStr, + }, &remainingPP) + if tt.wantPPGone { + assert.True(t, apierrors.IsNotFound(err), + "PropagationPolicy should be deleted when no city siblings remain") + } else { + assert.NoError(t, err, + "PropagationPolicy should be kept when other city siblings remain") + } + + // The project WD should be gone: once the federator finalizer is removed + // from an object that already has a DeletionTimestamp, the API server + // (and the fake client) removes the object. + var updatedWD computev1alpha.WorkloadDeployment + err = projectClient.Get(ctx, + types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updatedWD) + assert.True(t, apierrors.IsNotFound(err), + "project WD should be gone after finalizer removal (DeletionTimestamp + empty Finalizers = GC)") + }) + } +} + +// TestWorkloadDeploymentFederator_NotFound verifies that a missing +// WorkloadDeployment is handled gracefully (no error, no action). +func TestWorkloadDeploymentFederator_NotFound(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) // WD missing + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + result, err := r.Reconcile(context.Background(), reconcileRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) +} + +// TestWorkloadDeploymentFederator_Finalize_DirectCall exercises the Finalize +// method directly, ensuring the cluster name is required in context. +func TestWorkloadDeploymentFederator_Finalize_DirectCall(t *testing.T) { + t.Parallel() + + projectClient := newProjectFakeClient(testProjectNamespace()) + karmadaClient := newKarmadaFakeClient() + r := newTestFederator(projectClient, karmadaClient) + + wd := testWorkloadDeployment(withFinalizer) + + // Without cluster in context → must return an error.
+ _, err := r.Finalize(context.Background(), wd) + require.Error(t, err, "Finalize without cluster context should fail") + assert.Contains(t, err.Error(), "cluster name not found") + + // With cluster in context → must succeed (the Karmada client returns not-found, which is OK). + ctx := mccontext.WithCluster(context.Background(), testCluster) + result, err := r.Finalize(ctx, wd) + require.NoError(t, err) + assert.False(t, result.Updated) +} From c7c1ec78eeb5916e87c9df7cbe8dea8fc248383f Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:07:42 -0500 Subject: [PATCH 04/11] feat(controller): add the InstanceProjector Add the projector that mirrors cell-side Instances back to the management plane, writing their status (readiness, placement, blocking reasons) onto the project-scoped Instance so callers see a single view across cells. Include the shared controller test helpers that build the project/Karmada fake clients and multi-cluster manager used by the federation tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/instance_projector.go | 214 ++++++++ .../controller/instance_projector_test.go | 492 ++++++++++++++++++ .../controller/instance_writeback_test.go | 448 ++++++++++++++++++ internal/controller/testing_helpers_test.go | 101 ++++ 4 files changed, 1255 insertions(+) create mode 100644 internal/controller/instance_projector.go create mode 100644 internal/controller/instance_projector_test.go create mode 100644 internal/controller/instance_writeback_test.go create mode 100644 internal/controller/testing_helpers_test.go diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go new file mode 100644 index 00000000..fa0b69b6 --- /dev/null +++ b/internal/controller/instance_projector.go @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1
"k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// InstanceProjector watches Instance objects written back to the upstream +// Karmada/management control plane by POP-cell InstanceReconcilers and creates +// read-only projections in the corresponding project namespace within each +// project cluster. +// +// Namespace resolution: an upstream Instance lives in namespace +// `ns-<project-namespace-uid>`. Reconcile does not parse that name; it reads the +// target project namespace from the UpstreamOwnerNamespaceLabel on the Instance. +// +// Ownership: each projected Instance is owned by the project WorkloadDeployment +// so that it is garbage-collected via cascading deletion when the deployment is +// removed from the project cluster. +// +// The controller is registered with a standard manager.Manager pointed at the +// upstream Karmada control plane — NOT the multicluster-runtime manager — so +// informer watches are scoped to the upstream control plane. +type InstanceProjector struct { + // FederationClient reads Instance objects from the Karmada federation control + // plane (configured via --federation-kubeconfig). Must be set before + // SetupWithManager is called. + FederationClient client.Client + + // MCManager provides access to project cluster clients via GetCluster.
+ MCManager mcmanager.Manager +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch + +// Reconcile projects one upstream Instance into its project cluster. It resolves +// the project cluster and namespace from the Instance's upstream-owner labels, +// looks up the owning WorkloadDeployment by name, upserts the projection with an +// owner reference to that WorkloadDeployment, and then copies the status. +func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("instance", req.NamespacedName) + + // 1. Fetch the Instance from the upstream Karmada control plane. + // NOTE: despite its name, downstreamInstance holds the object read from the + // upstream federation control plane via FederationClient. + var downstreamInstance computev1alpha.Instance + if err := r.FederationClient.Get(ctx, req.NamespacedName, &downstreamInstance); err != nil { + if apierrors.IsNotFound(err) { + // Instance was deleted from the upstream control plane. Projections + // are owned by the project WorkloadDeployment, so cascading deletion + // handles cleanup. + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting upstream instance: %w", err) + } + + // Only project Instances that carry the upstream tracking label; others were + // not written by our InstanceReconciler write-back logic. + encodedClusterName, ok := downstreamInstance.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if !ok { + logger.V(1).Info("skipping instance without upstream cluster label") + return ctrl.Result{}, nil + } + + // 2. Resolve the project cluster name. + // The encoded form is "cluster-<name>" with "/" replaced by "_". + clusterName := strings.TrimPrefix(encodedClusterName, "cluster-") + clusterName = strings.ReplaceAll(clusterName, "_", "/") + + // 3. Obtain the project cluster client. + projectCluster, err := r.MCManager.GetCluster(ctx, multicluster.ClusterName(clusterName)) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err) + } + projectClient := projectCluster.GetClient() + + // 4. Resolve the target project namespace from the Instance label.
+ // The InstanceReconciler stamps UpstreamOwnerNamespaceLabel with the project + // namespace name (read from the upstream Karmada namespace label set by the federator), + // so we can resolve the target namespace directly without scanning. + targetNamespace := downstreamInstance.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if targetNamespace == "" { + logger.Info("Instance missing upstream-namespace label, requeueing", + "namespace", downstreamInstance.Namespace, "name", downstreamInstance.Name) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + + // 5. Resolve the owning WorkloadDeployment by NAME in the project cluster. + // + // Core invariant: the ownerReference MUST be built from a project-cluster + // object obtained via projectClient.Get — never from any edge/Karmada + // identity. The WD name is stable across all planes (project cluster, Karmada, + // edge) and is the correct cross-plane identifier. + // + // Resolution order: + // a) Read WorkloadDeploymentNameLabel from the downstream Instance (stamped by + // the edge stateful control strategy). + // b) If absent (Instances created before the label was introduced), fall back + // to stripping the trailing "-<ordinal>" suffix from the Instance name. + wdName := downstreamInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel] + if wdName == "" { + wdName = wdNameFromInstanceName(downstreamInstance.Name) + } + if wdName == "" { + logger.Info("cannot resolve WorkloadDeployment name from Instance — skipping projection", + "instance", downstreamInstance.Name) + return ctrl.Result{}, nil + } + + // Fetch the project-cluster WD directly by name. The returned object carries + // the project-cluster metadata.uid — the only UID that GC in the project + // cluster can act on.
+ var ownerWD computev1alpha.WorkloadDeployment + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: targetNamespace, Name: wdName}, &ownerWD); err != nil { + if apierrors.IsNotFound(err) { + // Either a transient ordering race (Instance projected before + // WorkloadReconciler created the project WD) or the WD has been + // deleted. In both cases, do NOT create an ownerless projection. + // Requeue so the projection is created with a correct owner + // reference once the WD exists. The 5 s interval matches the + // existing upstream-namespace label requeue above. + logger.Info("project WorkloadDeployment not found — requeueing without creating projection", + "wdName", wdName, "namespace", targetNamespace) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting WorkloadDeployment %s/%s in project cluster %s: %w", + targetNamespace, wdName, clusterName, err) + } + + // 6. Create or update the projection in the project namespace. + projection := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: downstreamInstance.Name, + Namespace: targetNamespace, + }, + } + + operationResult, err := controllerutil.CreateOrUpdate(ctx, projectClient, projection, func() error { + // Propagate upstream tracking labels so consumers can filter by origin. + // Labels are only added or overwritten here; a label dropped upstream is not + // pruned from an existing projection. + if projection.Labels == nil { + projection.Labels = make(map[string]string) + } + for k, v := range downstreamInstance.Labels { + projection.Labels[k] = v + } + + projection.Spec = downstreamInstance.Spec + + // Attach an owner reference using the live project-cluster WD object. + // controllerutil.SetOwnerReference takes the name and UID from ownerWD, which + // was fetched from projectClient — satisfying the core invariant. The GVK is + // resolved through the project cluster's scheme.
+ return controllerutil.SetOwnerReference(&ownerWD, projection, projectCluster.GetScheme()) + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed upserting Instance projection in %s/%s: %w", clusterName, targetNamespace, err) + } + + logger.Info("reconciled Instance projection", "operation", operationResult, "namespace", targetNamespace, "cluster", clusterName) + + // 7. Sync status — status is a separate subresource. A NotFound from the + // status update is ignored rather than returned as an error. + projection.Status = downstreamInstance.Status + if err := projectClient.Status().Update(ctx, projection); err != nil && !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed updating Instance projection status: %w", err) + } + + return ctrl.Result{}, nil +} + +// wdNameFromInstanceName derives the WorkloadDeployment name from an Instance +// name by stripping the trailing "-<ordinal>" suffix. Instance names follow the +// convention "<wd-name>-<ordinal>" (e.g. "my-api-default-dfw-0"), which is +// structurally enforced by the stateful control strategy. Returns empty string +// if the name does not contain a numeric suffix (unrecognised format). +// +// This is used as a fallback when the WorkloadDeploymentNameLabel is absent on +// Instances created before that label was introduced. +func wdNameFromInstanceName(name string) string { + lastDash := strings.LastIndex(name, "-") + if lastDash <= 0 { + return "" + } + suffix := name[lastDash+1:] + for _, c := range suffix { + if c < '0' || c > '9' { + return "" + } + } + if len(suffix) == 0 { + return "" + } + return name[:lastDash] +} + +// SetupWithManager registers the InstanceProjector with upstreamMgr, a standard +// manager.Manager configured against the upstream Karmada/federation control plane +// REST config. FederationClient and MCManager must be set before calling this method. +func (r *InstanceProjector) SetupWithManager(upstreamMgr manager.Manager) error { + return ctrl.NewControllerManagedBy(upstreamMgr). + For(&computev1alpha.Instance{}). + Named("instance-projector").
+ Complete(r) +} diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go new file mode 100644 index 00000000..7dcc8168 --- /dev/null +++ b/internal/controller/instance_projector_test.go @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "maps" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Test constants ─────────────────────────────────────────────────────────── + +const ( + // projTestCluster is the project cluster name used in projector tests. + projTestCluster = "project-cluster" + + // projTestProjNS is the project namespace name. + projTestProjNS = "proj-namespace" + + // projTestProjNSUID is the project namespace UID embedded in the Karmada + // namespace name below. + projTestProjNSUID = types.UID("deadbeef-1111-2222-3333-444455556666") + + // projTestKarmadaNS is the Karmada namespace derived from the UID above + // via the ns-<uid> convention. + projTestKarmadaNS = "ns-deadbeef-1111-2222-3333-444455556666" + + // projTestInstanceName is the name of the Karmada (and projected) Instance. + // Follows the "<wd-name>-<ordinal>" convention: "my-wd-0". + projTestInstanceName = "my-wd-0" + + // projTestWDUID is the UID of the owning WorkloadDeployment as it exists in + // the PROJECT cluster. This is the UID that owner references must use, since + // Kubernetes GC in the project cluster only knows this UID.
+ projTestWDUID = types.UID("project-wd-uid-9999-aaaa-bbbb-cccc") + + // projTestEdgeWDUID is the UID of the WorkloadDeployment as it exists on the + // EDGE/Karmada plane. Each plane mints its own UID, so this is intentionally + // distinct from projTestWDUID. The WorkloadDeploymentUIDLabel on downstream + // Instances carries this edge UID — NOT the project UID. + projTestEdgeWDUID = types.UID("edge-uid-0000-1111-2222-3333") + + // projTestWDName is the name of the owning WorkloadDeployment. The name is + // the same across all planes (project cluster, Karmada, edge) and is the + // correct cross-plane stable identifier. + projTestWDName = "my-wd" + + // projTestWorkloadUID is the UID of the owning Workload (carried via WorkloadUIDLabel). + projTestWorkloadUID = "wl-uid-1111-2222-3333-4444" + + // projTestInstanceIndex is the ordinal index of the instance (carried via InstanceIndexLabel). + projTestInstanceIndex = "0" +) + +// encodedCluster returns the value of the UpstreamOwnerClusterNameLabel for +// projTestCluster ("cluster-<name>"). +func encodedCluster() string { + return "cluster-" + projTestCluster +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +// projTestProjectNS builds the project cluster Namespace with the stable test UID. +func projTestProjectNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestProjNS, + UID: projTestProjNSUID, + }, + } +} + +// projTestWorkloadDeployment builds the project WorkloadDeployment that owns +// projected Instances.
+func projTestWorkloadDeployment() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestWDName, + Namespace: projTestProjNS, + UID: projTestWDUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "LAX", + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "my-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } +} + +// projTestKarmadaInstance builds a Karmada Instance with the default labels +// needed for the InstanceProjector to act on it. Optional label overrides are +// applied last; an override with an empty value blanks the label (the key stays +// in the map) rather than deleting it. +func projTestKarmadaInstance(labelOverrides map[string]string) *computev1alpha.Instance { + labels := map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + // WorkloadDeploymentUIDLabel carries the EDGE UID — intentionally distinct + // from projTestWDUID (the project-cluster WD UID). Owner references must + // never be built from this value. + computev1alpha.WorkloadDeploymentUIDLabel: string(projTestEdgeWDUID), + computev1alpha.WorkloadDeploymentNameLabel: projTestWDName, + computev1alpha.WorkloadUIDLabel: projTestWorkloadUID, + computev1alpha.InstanceIndexLabel: projTestInstanceIndex, + } + maps.Copy(labels, labelOverrides) + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + Labels: labels, + }, + Spec: computev1alpha.InstanceSpec{ + // Minimal valid spec — actual content is copied to the projection. + }, + } +} + +// newTestProjector wires an InstanceProjector with the given Karmada client (used +// as FederationClient) and a project cluster that serves the supplied project client.
+func newTestProjector(karmadaClient client.Client, projectClient client.Client) *InstanceProjector { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(projTestCluster, projectCluster) + return &InstanceProjector{ + FederationClient: karmadaClient, + MCManager: mgr, + } +} + +// projectorRequest builds a ctrl.Request for the test Instance in Karmada. +func projectorRequest() ctrl.Request { + return ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + }, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestInstanceProjector_Reconcile is the primary table-driven test. +func TestInstanceProjector_Reconcile(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + + // karmadaInstance is what exists in the Karmada API server. + // A nil value means the Instance does not exist (not-found path). + karmadaInstance *computev1alpha.Instance + + // projectObjs are pre-populated in the project cluster fake client. + projectObjs []client.Object + + // request overrides the default projectorRequest() when set. + request *ctrl.Request + + // wantProjection controls whether a projected Instance should appear. + wantProjection bool + + // wantOwnerRef controls whether the projected Instance should have an + // owner reference pointing to the project WorkloadDeployment. + wantOwnerRef bool + + // wantRequeue controls whether the reconcile result should request a requeue. + wantRequeue bool + + // wantErr controls whether the reconcile should return an error.
+ wantErr bool + }{ + { + name: "happy path — instance projected with owner reference", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // Cross-plane UID regression test: the Karmada Instance carries the EDGE + // WD UID in WorkloadDeploymentUIDLabel (projTestEdgeWDUID), which is + // intentionally different from the project-cluster WD UID (projTestWDUID). + // The owner reference on the projection must use the project-cluster UID. + // This test fails if someone reintroduces UID-based matching against the + // edge/Karmada plane. + name: "WD name label present, edge UID differs from project UID — owner ref UID equals project WD UID", + karmadaInstance: projTestKarmadaInstance(nil), // carries projTestEdgeWDUID, not projTestWDUID + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), // UID is projTestWDUID + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // Fallback: when WorkloadDeploymentNameLabel is absent (Instances created + // before the label was introduced), the projector derives the WD name from + // the Instance name by stripping the trailing "-<ordinal>" suffix. + // Instance name "my-wd-0" → WD name "my-wd". + name: "WD name label absent, fallback name extraction from instance name — owner ref attached", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Blank the name label (the override leaves the key with an empty value) + // to exercise the fallback path. + computev1alpha.WorkloadDeploymentNameLabel: "", + }), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // NotFound requeue: when the project WD does not yet exist (transient + // ordering race — Instance projected before WorkloadReconciler created + // the project WD), the projector must requeue and NOT create an ownerless + // projection.
A projection must never be created without an owner reference. + name: "project WD not found — requeue, no ownerless projection created", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + // No WorkloadDeployment — simulates the transient ordering race. + }, + wantProjection: false, + wantRequeue: true, + }, + { + // Unresolvable WD name: the label is absent and the Instance name has + // no numeric suffix to strip (unrecognised naming format). The projector + // should skip without error — no projection created, no requeue. + // The instance name "inst-no-ordinal" has no trailing numeric segment. + name: "WD name label absent and instance name yields no resolvable WD — skip, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: "inst-no-ordinal", + Namespace: projTestKarmadaNS, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + // No WorkloadDeploymentNameLabel — no label, no numeric suffix. + computev1alpha.WorkloadUIDLabel: projTestWorkloadUID, + computev1alpha.InstanceIndexLabel: projTestInstanceIndex, + }, + }, + }, + request: &ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "inst-no-ordinal", + Namespace: projTestKarmadaNS, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: false, + }, + { + name: "missing upstream-cluster-name label — skipped, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + // Intentionally no UpstreamOwnerClusterNameLabel.
+ Labels: map[string]string{ + "some-other-label": "value", + }, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + name: "missing upstream-namespace label — requeue", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: blank the upstream namespace label (empty value). + downstreamclient.UpstreamOwnerNamespaceLabel: "", + }), + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: true, + }, + { + name: "karmada instance not found — no-op", + karmadaInstance: nil, // causes Get to return NotFound + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + // Verify that all linking labels (WorkloadUID, WorkloadDeploymentUID, + // WorkloadDeploymentNameLabel, InstanceIndex) survive from the Karmada + // write-back object through to the projection. + name: "all linking labels propagated from Karmada to projection", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Build Karmada client. + var karmadaObjs []client.Object + if tt.karmadaInstance != nil { + karmadaObjs = append(karmadaObjs, tt.karmadaInstance) + } + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + // Build project client. + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(tt.projectObjs...). + WithStatusSubresource(&computev1alpha.Instance{}).
+ Build() + + r := newTestProjector(karmadaClient, projectClient) + + req := projectorRequest() + if tt.request != nil { + req = *tt.request + } + result, err := r.Reconcile(context.Background(), req) + + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + + if tt.wantRequeue { + assert.NotZero(t, result.RequeueAfter, "expected RequeueAfter to be set") + } else { + assert.Equal(t, ctrl.Result{}, result) + } + + ctx := context.Background() + + // Check whether a projected Instance exists in the project namespace. + var projection computev1alpha.Instance + err = projectClient.Get(ctx, types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestProjNS, + }, &projection) + + if !tt.wantProjection { + assert.True(t, isNotFound(err), + "expected no projection in project namespace, but found one (or unexpected error: %v)", err) + return + } + + require.NoError(t, err, "expected projection to exist in project namespace") + + // Labels should be copied from the Karmada instance. + if tt.karmadaInstance != nil { + for k, v := range tt.karmadaInstance.Labels { + assert.Equal(t, v, projection.Labels[k], + "projection label %q should match Karmada instance label", k) + } + } + + // Linking labels must survive from the Karmada instance to the projection + // so that the CLI can resolve Workload name, city, and instance ordinal.
+ if tt.wantProjection && tt.karmadaInstance != nil { + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadUIDLabel], + projection.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + projection.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.InstanceIndexLabel], + projection.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated to the projection") + } + + // Owner reference check. + if tt.wantOwnerRef { + require.NotEmpty(t, projection.OwnerReferences, + "projected instance should have an owner reference to the WorkloadDeployment") + ownerRef := projection.OwnerReferences[0] + // Core invariant: owner ref UID must be the PROJECT-cluster WD UID. + assert.Equal(t, string(projTestWDUID), string(ownerRef.UID), + "owner reference UID must match the project-cluster WorkloadDeployment UID") + // Regression guard: the edge UID must NOT appear in the owner ref. + // If this assertion fails, someone reintroduced cross-plane UID matching.
+ assert.NotEqual(t, string(projTestEdgeWDUID), string(ownerRef.UID), + "owner reference UID must NOT be the edge/Karmada WD UID") + assert.Equal(t, projTestWDName, ownerRef.Name, + "owner reference name should match the WorkloadDeployment name") + } else { + assert.Empty(t, projection.OwnerReferences, + "projected instance should have no owner reference when WD not found") + } + }) + } +} + +// TestInstanceProjector_SpecCopied verifies that the Instance spec is correctly +// propagated from the Karmada instance to the projection. +func TestInstanceProjector_SpecCopied(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + // Set a recognizable spec field we can assert against. + karmadaInst.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{{Name: "test-gate"}}, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(projTestProjectNS(), projTestWorkloadDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + _, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) + + require.NotNil(t, projection.Spec.Controller) + require.Len(t, projection.Spec.Controller.SchedulingGates, 1) + assert.Equal(t, "test-gate", projection.Spec.Controller.SchedulingGates[0].Name) +} + +// TestInstanceProjector_NamespaceResolution verifies that the projector resolves +// the target project namespace directly from the UpstreamOwnerNamespaceLabel on +// the Karmada Instance, landing the projection in the correct namespace.
+func TestInstanceProjector_NamespaceResolution(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects( + projTestProjectNS(), + projTestWorkloadDeployment(), + ). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + result, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // Projection must land in the namespace named by the label. + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) +} + +// isNotFound reports whether err is a Kubernetes not-found error. A nil err +// (the object exists) and any non-nil error other than NotFound return false. +// Used to distinguish "no projection created" from "projection exists but Get failed". +func isNotFound(err error) bool { + if err == nil { + return false // object exists — not the "not found" case + } + // With nil excluded above, IgnoreNotFound yields nil only for a NotFound error.
+ return client.IgnoreNotFound(err) == nil +} diff --git a/internal/controller/instance_writeback_test.go b/internal/controller/instance_writeback_test.go new file mode 100644 index 00000000..17c522f1 --- /dev/null +++ b/internal/controller/instance_writeback_test.go @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "sync" + "testing" + + "github.com/go-logr/logr" + "github.com/go-logr/logr/funcr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Log capture helper ─────────────────────────────────────────────────────── + +// logEntry holds a single captured log line (message + formatted key-value pairs). +type logEntry struct { + msg string + kvs string // funcr renders key-value pairs as a single string +} + +// captureLogger returns a logr.Logger backed by an in-memory sink and a pointer +// to the captured entries. Appends are mutex-guarded; reads via the pointer are not.
+func captureLogger() (logr.Logger, *[]logEntry) { + var mu sync.Mutex + var entries []logEntry + logger := funcr.New(func(prefix, args string) { + mu.Lock() + defer mu.Unlock() + entries = append(entries, logEntry{msg: prefix, kvs: args}) + }, funcr.Options{}) + return logger, &entries +} + +// ─── write-back test constants ──────────────────────────────────────────────── + +const ( + wbTestClusterName = "edge-cluster" + wbTestNamespace = "ns-proj-uid-1234" + wbTestInstanceName = "inst-0" + wbTestWorkloadUID = "wl-uid-aaaa-bbbb" + wbTestWDUID = "wd-uid-cccc-dddd" + wbTestInstanceIndex = "0" + wbTestUpstreamNS = "proj-namespace" + wbTestEncodedCluster = "cluster-" + wbTestClusterName + + // Values for the self-describing labels: deployment name, city code, workload name, placement name. + wbTestWDName = "my-workload-deployment" + wbTestCityCode = "DFW" + wbTestWorkloadName = "my-workload" + wbTestPlacement = "us-central" +) + +// wbTestCellInstance builds a cell-side Instance with all seven owned labels +// pre-populated, as addInstanceControllerLabels would produce.
+func wbTestCellInstance() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + computev1alpha.WorkloadDeploymentNameLabel: wbTestWDName, + computev1alpha.CityCodeLabel: wbTestCityCode, + computev1alpha.WorkloadNameLabel: wbTestWorkloadName, + computev1alpha.PlacementNameLabel: wbTestPlacement, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceReadyReasonRunning, + Message: "Instance is ready", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } +} + +// wbTestDownstreamNS returns a Namespace object in the downstream (Karmada) +// control plane that carries the upstream routing labels, simulating the +// namespace stamped by NSO's MappedNamespaceResourceStrategy. +func wbTestDownstreamNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + }, + }, + } +} + +// newWriteBackReconciler returns an InstanceReconciler with only FederationClient +// set (to federationClient); every other field is left at its zero value.
+func newWriteBackReconciler(federationClient client.Client) *InstanceReconciler { + return &InstanceReconciler{ + FederationClient: federationClient, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestWriteBackToUpstream_CreatePath_AllLabels (Case A) verifies that the first +// write-back to an empty Karmada control plane creates an Instance with all five +// expected labels (two routing + three linking) and also writes the cell-side +// status via Status().Update. +func TestWriteBackToUpstream_CreatePath_AllLabels(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + // Verify the created Karmada Instance carries all five expected labels. 
+ var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestEncodedCluster, created.Labels[downstreamclient.UpstreamOwnerClusterNameLabel], + "UpstreamOwnerClusterNameLabel must be set") + assert.Equal(t, wbTestUpstreamNS, created.Labels[downstreamclient.UpstreamOwnerNamespaceLabel], + "UpstreamOwnerNamespaceLabel must be set") + assert.Equal(t, wbTestWorkloadUID, created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestWDUID, created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestInstanceIndex, created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated from cell instance") + + // Status must have been written via Status().Update after Create. + require.Len(t, created.Status.Conditions, 1, + "Status().Update must be called after Create; condition should be present") + assert.Equal(t, computev1alpha.InstanceReady, created.Status.Conditions[0].Type) + assert.Equal(t, metav1.ConditionTrue, created.Status.Conditions[0].Status) +} + +// TestWriteBackToUpstream_UpdatePath_LabelMerge (Case B) verifies that an +// existing Karmada Instance with a Karmada-managed label retains that label +// after the update path runs, while all five owned labels are written correctly. +func TestWriteBackToUpstream_UpdatePath_LabelMerge(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + // Pre-populate the Karmada control plane with an Instance that has the old + // two-label map plus a simulated Karmada-managed label. 
+ existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + // All five owned labels must be present with correct values. + assert.Equal(t, wbTestEncodedCluster, updated.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]) + assert.Equal(t, wbTestUpstreamNS, updated.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]) + assert.Equal(t, wbTestWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel]) + assert.Equal(t, wbTestWDUID, updated.Labels[computev1alpha.WorkloadDeploymentUIDLabel]) + assert.Equal(t, wbTestInstanceIndex, updated.Labels[computev1alpha.InstanceIndexLabel]) + + // The Karmada-managed label must survive the merge (not be replaced/deleted). 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after merge; should not be overwritten") +} + +// TestWriteBackToUpstream_LabelChangeTriggerUpdate (Case C) verifies that +// a changed linking label on the cell instance causes the Karmada object to +// be updated with the new value. +func TestWriteBackToUpstream_LabelChangeTriggerUpdate(t *testing.T) { + t.Parallel() + + newWorkloadUID := "wl-uid-CHANGED" + + // Pre-populate with the five-label map from a previous write-back. + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Modify the WorkloadUIDLabel on the cell instance. 
+ cellInstance := wbTestCellInstance() + cellInstance.Labels[computev1alpha.WorkloadUIDLabel] = newWorkloadUID + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, newWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel change on the cell instance must be reflected in the Karmada object") +} + +// TestWriteBackToUpstream_EmptyLinkingLabels_NonFatal (Case D) verifies that +// writeBackToUpstream completes without error when the cell-side Instance has +// no linking labels (e.g. during an early reconcile before +// addInstanceControllerLabels has run). The created Karmada object will carry +// empty string values for the three linking labels, and the RC-2 warning log +// must fire listing all three missing label keys. +func TestWriteBackToUpstream_EmptyLinkingLabels_NonFatal(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Instance with nil Labels — simulates an early reconcile with no linking labels. + cellInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + // Inject a capturing logger so we can assert the RC-2 warning fires. 
+ capLogger, entries := captureLogger() + ctx := log.IntoContext(context.Background(), capLogger) + + // Must not return an error — empty labels are non-fatal. + err := r.writeBackToUpstream(ctx, multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + // The Karmada object should exist with empty string values for the linking labels. + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, "", created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel should be empty string when not set on cell instance") + assert.Equal(t, "", created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel should be empty string when not set on cell instance") + assert.Equal(t, "", created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel should be empty string when not set on cell instance") + + // Assert the RC-2 warning was emitted and named all three missing label keys. + // funcr encodes both the message and key-value pairs into the args string; + // we search across the full rendered output for each required substring. 
+ warnMsg := "instance is missing linking labels for write-back" + allRendered := func() string { + parts := make([]string, len(*entries)) + for i, e := range *entries { + parts[i] = fmt.Sprintf("%s %s", e.msg, e.kvs) + } + return strings.Join(parts, "\n") + }() + + assert.True(t, strings.Contains(allRendered, warnMsg), + "expected RC-2 warning %q to be logged; got:\n%s", warnMsg, allRendered) + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + } { + assert.True(t, strings.Contains(allRendered, key), + "expected missing label key %q to appear in warning log; got:\n%s", key, allRendered) + } +} + +// TestWriteBackToUpstream_FourNewLabels_CreatePath verifies that all four new +// self-describing labels (WorkloadDeploymentName, CityCode, WorkloadName, +// PlacementName) are written to the Karmada object on the create path. +func TestWriteBackToUpstream_FourNewLabels_CreatePath(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestWDName, created.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestCityCode, created.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must propagate to Karmada object") + assert.Equal(t, wbTestWorkloadName, created.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestPlacement, created.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must propagate to Karmada object") +} + +// TestWriteBackToUpstream_FourNewLabels_UpdatePath verifies that all four new +// self-describing labels are written on the update path and existing Karmada- +// managed labels on the downstream object are preserved. +func TestWriteBackToUpstream_FourNewLabels_UpdatePath(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). 
+ WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, wbTestWDName, updated.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be set on update path") + assert.Equal(t, wbTestCityCode, updated.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must be set on update path") + assert.Equal(t, wbTestWorkloadName, updated.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be set on update path") + assert.Equal(t, wbTestPlacement, updated.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must be set on update path") + + // Karmada-managed label must survive the merge. 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after the update merge") +} diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go new file mode 100644 index 00000000..cc3d3d9f --- /dev/null +++ b/internal/controller/testing_helpers_test.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Scheme helpers ─────────────────────────────────────────────────────────── + +// newProjectScheme builds a runtime.Scheme with the types needed by the project +// cluster (corev1 + compute). +func newProjectScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + return s +} + +// newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada +// API server (corev1 + compute + karmada policy). +func newKarmadaScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + _ = karmadapolicyv1alpha1.Install(s) + return s +} + +// newProjectFakeClient returns a fake client pre-populated with the given +// objects and the project scheme. +func newProjectFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(objs...). + WithStatusSubresource(objs...). 
+ Build() +} + +// newKarmadaFakeClient returns a fake client pre-populated with the given +// objects and the Karmada scheme. +func newKarmadaFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() +} + +// ─── Fake cluster.Cluster ───────────────────────────────────────────────────── + +// fakeCluster is a minimal cluster.Cluster implementation for tests. +// Embeds the interface so only the methods we need are implemented. +type fakeCluster struct { + cluster.Cluster // nil embed — panics if unimplemented methods are called + cl client.Client +} + +func (f *fakeCluster) GetClient() client.Client { return f.cl } +func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.cl.Scheme() } +func (f *fakeCluster) GetAPIReader() client.Reader { return f.cl } + +// newFakeCluster wraps a fake client in a fakeCluster. +func newFakeCluster(cl client.Client) *fakeCluster { + return &fakeCluster{cl: cl} +} + +// ─── Fake mcmanager.Manager ─────────────────────────────────────────────────── + +// fakeMCManager is a minimal mcmanager.Manager implementation that serves a +// fixed map of project clusters. Only GetCluster is implemented; all other +// Manager methods panic through the embedded nil interface. +type fakeMCManager struct { + mcmanager.Manager // nil embed — panics if unimplemented methods are called + clusters map[string]cluster.Cluster +} + +func (m *fakeMCManager) GetCluster(_ context.Context, name multicluster.ClusterName) (cluster.Cluster, error) { + if c, ok := m.clusters[string(name)]; ok { + return c, nil + } + return nil, fmt.Errorf("cluster %q not found in fake manager", name) +} + +// newFakeMCManager returns a fakeMCManager with a single named cluster. 
+func newFakeMCManager(clusterName string, cl cluster.Cluster) *fakeMCManager { + return &fakeMCManager{ + clusters: map[string]cluster.Cluster{clusterName: cl}, + } +} From d9cf75998f7753eabd960d3093a0b65531b0c920 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:07:51 -0500 Subject: [PATCH 05/11] feat(controller): distributed WorkloadDeployment and Workload reconciliation Rework the WorkloadDeployment and Workload controllers to run per cell, resolving networks and Locations locally and driving Instance lifecycle through the stateful instance-control logic rather than a central scheduler. Update the instance-control packages to manage Instances within a cell's control plane. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../instancecontrol/instancecontrol.go | 28 +- .../stateful/stateful_control.go | 109 ++++- .../stateful/stateful_control_test.go | 407 ++++++++++++++++++ internal/controller/workload_controller.go | 14 +- .../workloaddeployment_controller.go | 308 ++++++++----- .../workloaddeployment_location_test.go | 119 +++++ 6 files changed, 866 insertions(+), 119 deletions(-) create mode 100644 internal/controller/workloaddeployment_location_test.go diff --git a/internal/controller/instancecontrol/instancecontrol.go b/internal/controller/instancecontrol/instancecontrol.go index 6de9df99..d2c83692 100644 --- a/internal/controller/instancecontrol/instancecontrol.go +++ b/internal/controller/instancecontrol/instancecontrol.go @@ -26,10 +26,11 @@ type Strategy interface { type ActionType string const ( - ActionTypeCreate ActionType = "Create" - ActionTypeUpdate ActionType = "Update" - ActionTypeDelete ActionType = "Delete" - ActionTypeWait ActionType = "Wait" + ActionTypeCreate ActionType = "Create" + ActionTypeUpdate ActionType = "Update" + ActionTypeDelete ActionType = "Delete" + ActionTypeWait ActionType = "Wait" + ActionTypePatchLabels ActionType = "PatchLabels" ) type Action struct { @@ -104,3 +105,22 @@ func NewWaitAction(object 
client.Object) Action { fn: func(ctx context.Context, c client.Client) error { return nil }, } } + +// NewPatchLabelsAction returns an action that applies a metadata-only labels +// patch to the given object. It uses a MergeFrom patch so only the labels +// field is sent to the API server — the spec, template, and template-hash are +// never touched. This is intentionally separate from ActionTypeUpdate so that +// label backfill never participates in the ordered rolling-update flow. +func NewPatchLabelsAction(updated client.Object, base client.Object) Action { + patch := client.MergeFrom(base) + return Action{ + Object: updated, + actionType: ActionTypePatchLabels, + fn: func(ctx context.Context, c client.Client) error { + if err := c.Patch(ctx, updated, patch); err != nil { + return fmt.Errorf("failed to patch labels on %T %s: %w", updated, updated.GetName(), err) + } + return nil + }, + } +} diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go index 566a652c..2d2e3073 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control.go +++ b/internal/controller/instancecontrol/stateful/stateful_control.go @@ -15,13 +15,30 @@ import ( "go.datum.net/compute/internal/controller/instancecontrol" ) +// Options controls optional behaviours of the stateful instance control strategy. +type Options struct { + // NetworkingEnabled controls whether the Network scheduling gate is added to + // newly created Instances. Set to false when the networking integration is + // disabled so that Instances are not blocked waiting for a NetworkBinding. + // Defaults to true. + NetworkingEnabled bool +} + // Behavior inspired by https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/statefulset // Does not currently implement exact behavior. type statefulControl struct { + opts Options } +// New returns a stateful instance control strategy with networking enabled. 
func New() instancecontrol.Strategy { - return &statefulControl{} + return NewWithOptions(Options{NetworkingEnabled: true}) +} + +// NewWithOptions returns a stateful instance control strategy with the given +// options. +func NewWithOptions(opts Options) instancecontrol.Strategy { + return &statefulControl{opts: opts} } func (c *statefulControl) GetActions( @@ -68,15 +85,25 @@ func (c *statefulControl) GetActions( }, Spec: deployment.Spec.Template.Spec, } + // Set Location best-effort: when Status.Location is nil (no matching + // Location object for the city code) Instance.Spec.Location stays nil and + // instance creation proceeds normally — this must not block scheduling. desiredInstances[i].Spec.Location = deployment.Status.Location // TODO(jreese) consider adding scheduling gates via mutating webhooks - desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ - TemplateHash: instanceTemplateHash, - SchedulingGates: []v1alpha.SchedulingGate{ + gates := []v1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + } + if c.opts.NetworkingEnabled { + // Prepend the Network gate so it is cleared first; quota is + // independent and evaluated in parallel by InstanceReconciler. + gates = append([]v1alpha.SchedulingGate{ {Name: instancecontrol.NetworkSchedulingGate.String()}, - {Name: instancecontrol.QuotaSchedulingGate.String()}, - }, + }, gates...) + } + desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ + TemplateHash: instanceTemplateHash, + SchedulingGates: gates, } addInstanceControllerLabels(desiredInstances[i], getInstanceOrdinal(desiredInstances[i].Name), deployment) @@ -114,10 +141,37 @@ func (c *statefulControl) GetActions( } } + // Backfill controller-managed labels on every existing instance, regardless + // of Ready state or template hash. This ensures newly-introduced labels + // (e.g. city-code, workload-name) are applied to pre-existing instances that + // were never touched by a rolling update. 
The patch is metadata-only and is + // emitted outside the ordered rollout decision so it never gates or reorders + // instance creation/updates. + var patchLabelActions []instancecontrol.Action + for _, instance := range desiredInstances { + if instance.CreationTimestamp.IsZero() || !instance.DeletionTimestamp.IsZero() { + // Skip instances that don't exist yet or are being deleted. + continue + } + + desiredLabels := desiredControllerLabels(getInstanceOrdinal(instance.Name), deployment) + if labelsNeedBackfill(instance.Labels, desiredLabels) { + base := instance.DeepCopy() + patched := instance.DeepCopy() + for k, v := range desiredLabels { + if patched.Labels == nil { + patched.Labels = make(map[string]string) + } + patched.Labels[k] = v + } + patchLabelActions = append(patchLabelActions, instancecontrol.NewPatchLabelsAction(patched, base)) + } + } + slices.SortFunc(updateActions, descendingOrdinal) slices.SortFunc(deleteActions, descendingOrdinal) - actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)) + actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)+len(patchLabelActions)) switch deployment.Spec.ScaleSettings.InstanceManagementPolicy { case v1alpha.OrderedReadyInstanceManagementPolicyType: @@ -144,6 +198,11 @@ func (c *statefulControl) GetActions( } + // Label-backfill actions are appended after the rollout ordering/skip logic + // so they are never affected by the "skip all but first" rule and never + // participate in rollout sequencing. + actions = append(actions, patchLabelActions...) 
+ return actions, nil } @@ -152,7 +211,37 @@ func addInstanceControllerLabels(instance *v1alpha.Instance, index int, deployme instance.Labels = map[string]string{} } - instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(index) - instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) - instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + for k, v := range desiredControllerLabels(index, deployment) { + instance.Labels[k] = v + } +} + +// desiredControllerLabels returns the full set of controller-managed labels +// that every instance should carry. Used both when stamping a new/updated +// instance and when checking whether an existing instance needs a backfill +// patch. +func desiredControllerLabels(index int, deployment *v1alpha.WorkloadDeployment) map[string]string { + return map[string]string{ + v1alpha.InstanceIndexLabel: strconv.Itoa(index), + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + // Self-describing labels for routing, filtering, and observability. + // Backfilled on every reconcile so they stay accurate even for instances + // that pre-date the labels or that were not reached by a rolling update. + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } +} + +// labelsNeedBackfill reports whether any desired controller-managed label differs +// from the current labels; a missing key compares equal to an empty desired value.
+func labelsNeedBackfill(current map[string]string, desired map[string]string) bool { + for k, v := range desired { + if current[k] != v { + return true + } + } + return false } diff --git a/internal/controller/instancecontrol/stateful/stateful_control_test.go b/internal/controller/instancecontrol/stateful/stateful_control_test.go index d45b24b3..ffc04272 100644 --- a/internal/controller/instancecontrol/stateful/stateful_control_test.go +++ b/internal/controller/instancecontrol/stateful/stateful_control_test.go @@ -13,6 +13,8 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/utils/ptr" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" + "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" ) @@ -150,16 +152,407 @@ func TestScaleDownWithAllReadyInstances(t *testing.T) { assert.False(t, actions[0].IsSkipped()) } +// TestNetworkingEnabledAddsNetworkGate verifies that when networking is enabled +// (the default), newly created Instances receive both the Network and Quota +// scheduling gates so that they are held until the network is provisioned. 
+func TestNetworkingEnabledAddsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: true}) + + deployment := getWorkloadDeployment("test-deploy-net-on", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.Contains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must be present when networking is enabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must be present") +} + +// TestNetworkingDisabledOmitsNetworkGate verifies that when networking is +// disabled, newly created Instances do NOT receive the Network scheduling gate, +// so they are not blocked on network provisioning. The Quota gate is still +// added so quota enforcement remains active. 
+func TestNetworkingDisabledOmitsNetworkGate(t *testing.T) { + ctx := context.Background() + control := NewWithOptions(Options{NetworkingEnabled: false}) + + deployment := getWorkloadDeployment("test-deploy-net-off", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Controller) + + gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, g := range instance.Spec.Controller.SchedulingGates { + gateNames = append(gateNames, g.Name) + } + assert.NotContains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(), + "Network gate must NOT be present when networking is disabled") + assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(), + "Quota gate must still be present when networking is disabled") +} + // Add more test functions below for different scenarios. +// TestInstanceLabels_FourNewLabelsStamped verifies that all four new +// self-describing labels are stamped on newly created Instances, with values +// sourced from the WorkloadDeployment spec. 
+func TestInstanceLabels_FourNewLabelsStamped(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-labels-deploy", 1) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + + assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must equal deployment name") + assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel], + "CityCodeLabel must equal deployment.Spec.CityCode") + assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel], + "WorkloadNameLabel must equal deployment.Spec.WorkloadRef.Name") + assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel], + "PlacementNameLabel must equal deployment.Spec.PlacementName") +} + +// TestInstanceLabels_PropagatedOnUpdate verifies that when an existing instance +// is updated (rolling update path), the four new labels are refreshed from the +// deployment so they remain accurate after spec changes. +func TestInstanceLabels_PropagatedOnUpdate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-labels-update", 1) + + // Build a ready existing instance. + currentInstances := []v1alpha.Instance{*getInstanceForDeployment(deployment, 0)} + + // Trigger a rolling update by changing the image. 
+ deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "updated-image" + + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[0].ActionType()) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + + assert.Equal(t, deployment.GetName(), instance.Labels[v1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be refreshed on update") + assert.Equal(t, deployment.Spec.CityCode, instance.Labels[v1alpha.CityCodeLabel], + "CityCodeLabel must be refreshed on update") + assert.Equal(t, deployment.Spec.WorkloadRef.Name, instance.Labels[v1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be refreshed on update") + assert.Equal(t, deployment.Spec.PlacementName, instance.Labels[v1alpha.PlacementNameLabel], + "PlacementNameLabel must be refreshed on update") +} + +// TestInstanceLocation_SetWhenDeploymentStatusLocationPresent verifies that when +// deployment.Status.Location is set, the new Instance receives it as Spec.Location. 
+func TestInstanceLocation_SetWhenDeploymentStatusLocationPresent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-set", 1) + deployment.Status.Location = &networkingv1alpha.LocationReference{ + Name: "loc-dfw-1", + Namespace: "networking-system", + } + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + assert.Len(t, actions, 1) + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.NotNil(t, instance.Spec.Location, + "Spec.Location must be set when deployment.Status.Location is non-nil") + assert.Equal(t, "loc-dfw-1", instance.Spec.Location.Name) + assert.Equal(t, "networking-system", instance.Spec.Location.Namespace) +} + +// TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent verifies that when +// deployment.Status.Location is nil (no Location object matches the city code), +// instance creation still succeeds and Spec.Location remains nil — no regression +// on the "create instances regardless of Location" contract. 
+func TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-nil", 1) + // deployment.Status.Location is intentionally not set (nil) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err, "instance creation must succeed even when Status.Location is nil") + assert.Len(t, actions, 1, "exactly one create action must be produced") + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Nil(t, instance.Spec.Location, + "Spec.Location must remain nil when deployment.Status.Location is not set") + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType(), + "action must be a Create, proving instance creation is not gated on Location") +} + +// TestLabelBackfill_NotReadyMatchingHash verifies that a not-Ready instance +// with an unchanged template hash receives a PatchLabels action when it is +// missing controller-managed labels. The action must not be a rollout Update, +// must not alter spec/template, and must not block subsequent instances. +func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-notready", 2) + + // Instance 0: not-Ready, correct template hash, but missing city-code/workload-name labels. + instance0 := getInstanceForDeployment(deployment, 0) + apimeta.SetStatusCondition(&instance0.Status.Conditions, metav1.Condition{ + Type: v1alpha.InstanceReady, + Status: metav1.ConditionFalse, + Reason: "NotReady", + Message: "Instance is not ready", + LastTransitionTime: metav1.Now(), + }) + // Simulate pre-existing instance that only has the index label (missing the newer labels). 
+ instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + } + + // Instance 1: needs to be created (nil in desiredInstances), so we only provide instance0. + currentInstances := []v1alpha.Instance{*instance0} + + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + // Collect actions by type. + var waitActions, createActions, updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeWait: + waitActions = append(waitActions, a) + case instancecontrol.ActionTypeCreate: + createActions = append(createActions, a) + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // The not-Ready instance must still produce a Wait (rollout is gated). + assert.Len(t, waitActions, 1, "not-Ready instance must still produce a Wait action") + assert.Equal(t, "test-backfill-notready-0", waitActions[0].Object.GetName()) + + // The missing instance-1 create is skipped (ordered policy, Wait is first). + assert.Len(t, createActions, 1, "instance-1 create action must be present") + assert.True(t, createActions[0].IsSkipped(), "create for instance-1 must be skipped while instance-0 is waiting") + + // No template Update actions must be produced. + assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash instance") + + // A PatchLabels action must be produced for instance-0. + assert.Len(t, patchActions, 1, "exactly one PatchLabels action for the label-drifted instance") + assert.Equal(t, "test-backfill-notready-0", patchActions[0].Object.GetName()) + assert.False(t, patchActions[0].IsSkipped(), "PatchLabels must not be skipped by the rollout skip-loop") + + // The patched object must carry all desired labels. 
+ patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.GetName(), patched.Labels[v1alpha.WorkloadDeploymentNameLabel]) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel]) + assert.Equal(t, deployment.Spec.WorkloadRef.Name, patched.Labels[v1alpha.WorkloadNameLabel]) + assert.Equal(t, deployment.Spec.PlacementName, patched.Labels[v1alpha.PlacementNameLabel]) + + // The patched object's spec and template-hash must be unchanged. + assert.Equal(t, instancecontrol.ComputeHash(deployment.Spec.Template), patched.Spec.Controller.TemplateHash, + "template hash must be unchanged by the label backfill") + assert.Equal(t, deployment.Spec.Template.Spec.Runtime, patched.Spec.Runtime, + "spec must be unchanged by the label backfill") +} + +// TestLabelBackfill_Idempotent verifies that an instance already carrying all +// correct controller-managed labels produces no PatchLabels action. +func TestLabelBackfill_Idempotent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-idempotent", 1) + + // Instance already has all controller-managed labels set correctly. 
+ instance := getInstanceForDeployment(deployment, 0) + instance.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + for _, a := range actions { + assert.NotEqual(t, instancecontrol.ActionTypePatchLabels, a.ActionType(), + "no PatchLabels action must be produced when all labels are already correct") + } +} + +// TestLabelBackfill_ReadyInstanceCorrected verifies that a Ready instance with +// correct template hash but drifted labels receives a PatchLabels action +// without triggering a template rollout Update. +func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-ready", 1) + + // Ready instance with matching hash but missing city-code label. + instance := getInstanceForDeployment(deployment, 0) + // Remove the city-code label to simulate drift. + delete(instance.Labels, v1alpha.CityCodeLabel) + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // No template Update must be produced — template hash matches. 
+ assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash ready instance") + + // A PatchLabels action must be produced. + assert.Len(t, patchActions, 1, "PatchLabels action must be produced for the label-drifted ready instance") + patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel], + "city-code label must be corrected by the backfill") +} + +// TestLabelBackfill_DoesNotAffectRollingUpdate verifies that a genuine template +// change on a Ready instance still produces a normal ordered Update action and +// that the PatchLabels path does not interfere with or duplicate it. +func TestLabelBackfill_DoesNotAffectRollingUpdate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-rolling", 2) + + // Two ready instances with all correct labels and matching current hash. + instance0 := getInstanceForDeployment(deployment, 0) + instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + instance1 := getInstanceForDeployment(deployment, 1) + instance1.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "1", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + 
// Trigger a template change. + deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "rolling-update-image" + + currentInstances := []v1alpha.Instance{*instance0, *instance1} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // Two Update actions expected (one per instance), ordered highest-to-lowest. + assert.Len(t, updateActions, 2, "both instances must produce Update actions on template change") + assert.Equal(t, "test-backfill-rolling-1", updateActions[0].Object.GetName(), + "Update actions must be ordered highest ordinal first") + assert.Equal(t, "test-backfill-rolling-0", updateActions[1].Object.GetName()) + assert.False(t, updateActions[0].IsSkipped(), "first Update must be active") + assert.True(t, updateActions[1].IsSkipped(), "second Update must be skipped (ordered rollout)") + + // No PatchLabels — all labels are already correct. 
+ assert.Empty(t, patchActions, "no PatchLabels when all labels are already correct") +} + func getWorkloadDeployment(name string, minReplicas int32) *v1alpha.WorkloadDeployment { instance := getInstanceTemplate(name, 0) deployment := &v1alpha.WorkloadDeployment{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: "default", + UID: "test-wd-uid", }, Spec: v1alpha.WorkloadDeploymentSpec{ + WorkloadRef: v1alpha.WorkloadReference{ + Name: "test-workload", + UID: "test-workload-uid", + }, + PlacementName: "test-placement", + CityCode: "DFW", ScaleSettings: v1alpha.HorizontalScaleSettings{ MinReplicas: minReplicas, InstanceManagementPolicy: v1alpha.OrderedReadyInstanceManagementPolicyType, @@ -180,6 +573,20 @@ func getInstanceForDeployment(deployment *v1alpha.WorkloadDeployment, ordinal in TemplateHash: instancecontrol.ComputeHash(deployment.Spec.Template), } + // Stamp all controller-managed labels so that the label-backfill path is a + // no-op for instances built by this helper. Tests that specifically exercise + // label drift should manipulate the labels directly after calling this helper. 
+ if instance.Labels == nil { + instance.Labels = map[string]string{} + } + instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(ordinal) + instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) + instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + instance.Labels[v1alpha.WorkloadDeploymentNameLabel] = deployment.GetName() + instance.Labels[v1alpha.CityCodeLabel] = deployment.Spec.CityCode + instance.Labels[v1alpha.WorkloadNameLabel] = deployment.Spec.WorkloadRef.Name + instance.Labels[v1alpha.PlacementNameLabel] = deployment.Spec.PlacementName + return instance } diff --git a/internal/controller/workload_controller.go b/internal/controller/workload_controller.go index 6e907b65..6ca92e03 100644 --- a/internal/controller/workload_controller.go +++ b/internal/controller/workload_controller.go @@ -26,13 +26,17 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) -const workloadControllerFinalizer = "compute.datumapis.com/workload-controller" +const ( + workloadControllerFinalizer = "compute.datumapis.com/workload-controller" + workloadConditionTypeAvailable = "Available" +) // WorkloadReconciler reconciles a Workload object type WorkloadReconciler struct { @@ -118,7 +122,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ if len(notFoundNetworks) > 0 { missingNetworks := strings.Join(notFoundNetworks.UnsortedList(), ", ") changed := apimeta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{ - Type: "Available", + Type: workloadConditionTypeAvailable, Status: metav1.ConditionFalse, Reason: 
"NetworkNotFound", Message: fmt.Sprintf("Unable to find networks: %s", missingNetworks), @@ -383,9 +387,9 @@ func (r *WorkloadReconciler) getDeploymentsForWorkload( existingDeployments.Insert(deployment.Name) } - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := upstreamClient.List(ctx, &locations); err != nil { - return nil, nil, fmt.Errorf("failed to list locations: %w", err) + return nil, nil, fmt.Errorf("failed to list location bindings: %w", err) } if len(locations.Items) == 0 { @@ -463,7 +467,7 @@ func (r *WorkloadReconciler) SetupWithManager(mgr mcmanager.Manager) error { return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Workload{}, mcbuilder.WithEngageWithLocalCluster(false)). Owns(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Watches(&networkingv1alpha.Network{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + Watches(&networkingv1alpha.Network{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, network client.Object) []mcreconcile.Request { logger := log.FromContext(ctx) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 50e21ef0..9b17266e 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -24,6 +24,7 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" @@ -37,11 +38,28 @@ import ( type 
WorkloadDeploymentReconciler struct { mgr mcmanager.Manager finalizers finalizer.Finalizers + // KarmadaClient is an optional client pointing at the Karmada control plane. + // When non-nil, the reconciler writes the WorkloadDeployment status back to + // the Karmada namespace after each reconcile so the WorkloadDeploymentFederator + // can aggregate it into the project-namespace object. Set to nil to disable. + KarmadaClient client.Client + + // NetworkingEnabled controls whether the networking integration with + // network-services-operator is active. When false, NetworkBinding creation is + // skipped, the Network scheduling gate is never added to Instances (and is + // actively removed if present), and the networking step is treated as + // immediately ready. Defaults to true. + NetworkingEnabled bool } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=locations,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkbindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkcontexts,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnetclaims,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnets,verbs=get;list;watch func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) @@ -86,10 +104,6 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer 
logger.Info("reconcile complete") - if deployment.Status.Location == nil { - return ctrl.Result{}, nil - } - // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -100,7 +114,9 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, fmt.Errorf("failed listing instances: %w", err) } - instanceControl := instancecontrolstateful.New() + instanceControl := instancecontrolstateful.NewWithOptions(instancecontrolstateful.Options{ + NetworkingEnabled: r.NetworkingEnabled, + }) actions, err := instanceControl.GetActions(ctx, cl.GetScheme(), &deployment, instances.Items) if err != nil { @@ -122,9 +138,26 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco } } - networkReady, err := r.reconcileNetworks(ctx, cl.GetClient(), &deployment) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + // When networking is disabled, bypass the entire network provisioning path. + // The Network scheduling gate is treated as cleared and no NetworkBindings + // are created. This lets Instances reach the runtime on cells where + // network-services-operator (VPC) is not yet available. + var networkReady bool + if !r.NetworkingEnabled { + networkReady = true + } else { + var resolvedLocation *networkingv1alpha.LocationReference + networkReady, resolvedLocation, err = r.reconcileNetworks(ctx, cl.GetClient(), &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + } + // Persist the resolved Location to status so downstream components (e.g. + // the stateful instance control strategy) can propagate it to Instances. + // When no matching Location exists, resolvedLocation is nil and + // Status.Location remains nil — instance creation is not blocked. 
+ if resolvedLocation != nil { + deployment.Status.Location = resolvedLocation + } } // Networks are all ready with subnets ready to use, remove any scheduling @@ -143,59 +176,59 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, err } - patchResult, err := controllerutil.CreateOrPatch(ctx, cl.GetClient(), &deployment, func() error { - deployment.Status.Replicas = int32(replicas) - deployment.Status.CurrentReplicas = int32(currentReplicas) - deployment.Status.DesiredReplicas = desiredReplicas - deployment.Status.ReadyReplicas = int32(readyReplicas) - - if quotaBlockedReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), - }) - } else { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", - Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), - }) - } - - if readyReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - }) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: 
computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", - }) - } + deployment.Status.Replicas = int32(replicas) + deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.DesiredReplicas = desiredReplicas + deployment.Status.ReadyReplicas = int32(readyReplicas) + + if quotaBlockedReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), + }) + } else { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: "ReplicasAvailable", + Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), + }) + } - return nil - }) + if readyReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: "StableInstanceFound", + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + }) + } else if !networkReady { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningNetwork", + Message: "Network is being provisioned", + }) + } else if replicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningInstances", + Message: "Instances are being provisioned", + }) + } - if err != nil { + if err := 
cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) } - logger.Info("deployment status processed", "operation_result", patchResult) + if err := r.writeStatusToKarmada(ctx, &deployment); err != nil { + return ctrl.Result{}, err + } + + logger.Info("deployment status updated") return ctrl.Result{}, nil } @@ -240,13 +273,70 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, readyReplicas, quotaBlockedReplicas, nil } +// writeStatusToKarmada copies the WorkloadDeployment status to the matching +// object in the Karmada namespace so the WorkloadDeploymentFederator can +// sync it back to the project-namespace object on the control plane. +// It is a no-op when KarmadaClient is nil. +func (r *WorkloadDeploymentReconciler) writeStatusToKarmada(ctx context.Context, deployment *computev1alpha.WorkloadDeployment) error { + if r.KarmadaClient == nil { + return nil + } + + var kd computev1alpha.WorkloadDeployment + if err := r.KarmadaClient.Get(ctx, client.ObjectKeyFromObject(deployment), &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed getting Karmada WD for status writeback: %w", err) + } + + kd.Status = deployment.Status + // Use Update (not Patch) so all required status fields are present in the + // request body; MergeFrom omits unchanged zero-value int32 fields which + // would fail the CRD's required constraints on currentReplicas/readyReplicas. + if err := r.KarmadaClient.Status().Update(ctx, &kd); err != nil { + return fmt.Errorf("failed updating Karmada WD status: %w", err) + } + + return nil +} + +// reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all +// network interfaces on the deployment. It returns (networkReady, resolvedLocation, err). +// resolvedLocation is non-nil when a Location matching the deployment's city code +// was found; nil otherwise. 
Instance creation is never gated on resolvedLocation +// being non-nil — callers must treat a nil location as best-effort only. func (r *WorkloadDeploymentReconciler) reconcileNetworks( ctx context.Context, c client.Client, deployment *computev1alpha.WorkloadDeployment, -) (bool, error) { +) (bool, *networkingv1alpha.LocationReference, error) { logger := log.FromContext(ctx) + // Resolve the Location for this deployment's city code. With Karmada + // propagation the WorkloadDeployment lands in the cluster that serves the + // requested city, so the Location object for that city must exist locally. + var locationList networkingv1alpha.LocationList + if err := c.List(ctx, &locationList); err != nil { + return false, nil, fmt.Errorf("failed to list locations: %w", err) + } + + var locationRef *networkingv1alpha.LocationReference + for _, loc := range locationList.Items { + if cityCode, ok := loc.Spec.Topology["topology.datum.net/city-code"]; ok && cityCode == deployment.Spec.CityCode { + locationRef = &networkingv1alpha.LocationReference{ + Name: loc.Name, + Namespace: loc.Namespace, + } + break + } + } + + if locationRef == nil { + logger.Info("no location found for city code, waiting", "cityCode", deployment.Spec.CityCode) + return false, nil, nil + } + // First, ensure we have a NetworkBinding for each interface, and that the // binding is ready before we move on to create SubnetClaims. 
@@ -260,7 +350,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkBindingObjectKey, &networkBinding); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network binding: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network binding: %w", err) } if networkBinding.CreationTimestamp.IsZero() { @@ -271,16 +361,16 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( }, Spec: networkingv1alpha.NetworkBindingSpec{ Network: networkInterface.Network, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetControllerReference(deployment, &networkBinding, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on network binding: %w", err) + return false, nil, fmt.Errorf("failed to set controller on network binding: %w", err) } if err := c.Create(ctx, &networkBinding); err != nil { - return false, fmt.Errorf("failed creating network binding: %w", err) + return false, nil, fmt.Errorf("failed creating network binding: %w", err) } } @@ -293,7 +383,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( if !allNetworkBindingsReady { logger.Info("waiting for network bindings to be ready") - return false, nil + return false, locationRef, nil } // TODO(jreese): Currently this makes a SubnetClaim that will be used by @@ -312,12 +402,12 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkContextObjectKey, &networkContext); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network context: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network context: %w", err) } if !apimeta.IsStatusConditionTrue(networkContext.Status.Conditions, networkingv1alpha.NetworkContextReady) { logger.Info("waiting for network context to be ready", "network_context", networkContext.Name) - return false, nil + return 
false, locationRef, nil } var subnetClaims networkingv1alpha.SubnetClaimList @@ -326,7 +416,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.List(ctx, &subnetClaims, listOpts...); err != nil { - return false, fmt.Errorf("failed listing subnet claims: %w", err) + return false, nil, fmt.Errorf("failed listing subnet claims: %w", err) } var subnetClaim networkingv1alpha.SubnetClaim @@ -347,8 +437,8 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } // If it's not the same location, don't consider the subnet claim. - if claim.Spec.Location.Namespace != deployment.Status.Location.Namespace || - claim.Spec.Location.Name != deployment.Status.Location.Name { + if claim.Spec.Location.Namespace != locationRef.Namespace || + claim.Spec.Location.Name != locationRef.Name { continue } @@ -371,28 +461,28 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( NetworkContext: networkingv1alpha.LocalNetworkContextRef{ Name: networkContext.Name, }, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetOwnerReference(&networkContext, &subnetClaim, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on subnet claim: %w", err) + return false, nil, fmt.Errorf("failed to set controller on subnet claim: %w", err) } if err := c.Create(ctx, &subnetClaim); err != nil { - return false, fmt.Errorf("failed creating subnet claim: %w", err) + return false, nil, fmt.Errorf("failed creating subnet claim: %w", err) } logger.Info("created subnet claim", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } logger.Info("found subnet claim", "subnetClaim", subnetClaim.Name) if !apimeta.IsStatusConditionTrue(subnetClaim.Status.Conditions, "Ready") { logger.Info("waiting for subnet claim to be ready", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } var subnet networkingv1alpha.Subnet @@ -401,19 +491,19 @@ func (r 
*WorkloadDeploymentReconciler) reconcileNetworks( Name: subnetClaim.Status.SubnetRef.Name, } if err := c.Get(ctx, subnetObjectKey, &subnet); err != nil { - return false, fmt.Errorf("failed fetching subnet: %w", err) + return false, nil, fmt.Errorf("failed fetching subnet: %w", err) } if !apimeta.IsStatusConditionTrue(subnet.Status.Conditions, "Ready") { logger.Info("waiting for subnet to be ready", "subnet", subnet.Name) - return false, nil + return false, locationRef, nil } logger.Info("subnet is ready", "subnet", subnet.Name) } - return true, nil + return true, locationRef, nil } var errDeploymentHasInstances = errors.New("deployment has instances") @@ -468,47 +558,65 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) e if err := r.finalizers.Register(workloadControllerFinalizer, r); err != nil { return fmt.Errorf("failed to register finalizer: %w", err) } - return mcbuilder.ControllerManagedBy(mgr). + + b := mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Owns(&computev1alpha.Instance{}). - Owns(&networkingv1alpha.NetworkBinding{}). - Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnetClaim := o.(*networkingv1alpha.SubnetClaim) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + Owns(&computev1alpha.Instance{}) + + // Only watch networking resources when the networking integration is enabled. + // On cells without network-services-operator these watches would log spurious + // errors for missing CRDs. + if r.NetworkingEnabled { + b = b. + Owns(&networkingv1alpha.NetworkBinding{}). 
+ Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnetClaim := o.(*networkingv1alpha.SubnetClaim) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + }) + }). + Watches(&networkingv1alpha.Subnet{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnet := o.(*networkingv1alpha.Subnet) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) + }) }) - }). - Watches(&networkingv1alpha.Subnet{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnet := o.(*networkingv1alpha.Subnet) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) - }) - }). 
- Complete(r) + } + + return b.Complete(r) } -func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName string, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { +func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName multicluster.ClusterName, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { logger := log.FromContext(ctx) - cluster, err := mgr.GetCluster(ctx, clusterName) + cl, err := mgr.GetCluster(ctx, clusterName) if err != nil { logger.Error(err, "failed to get cluster") return nil } - clusterClient := cluster.GetClient() + clusterClient := cl.GetClient() - locationName := (types.NamespacedName{ + // Resolve the Location to find its city code, then look up WorkloadDeployments + // that target the same city via the deploymentCityCodeIndex. + var location networkingv1alpha.Location + if err := clusterClient.Get(ctx, types.NamespacedName{ Namespace: locationRef.Namespace, Name: locationRef.Name, - }).String() - listOpts := client.MatchingFields{ - deploymentLocationIndex: locationName, + }, &location); err != nil { + logger.Error(err, "failed to get location for enqueue", "location", locationRef) + return nil } - var workloadDeployments computev1alpha.WorkloadDeploymentList + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if !ok { + return nil + } - if err := clusterClient.List(ctx, &workloadDeployments, listOpts); err != nil { - logger.Error(err, "failed to list workloads") + var workloadDeployments computev1alpha.WorkloadDeploymentList + if err := clusterClient.List(ctx, &workloadDeployments, client.MatchingFields{ + deploymentCityCodeIndex: cityCode, + }); err != nil { + logger.Error(err, "failed to list workload deployments") return nil } diff --git a/internal/controller/workloaddeployment_location_test.go b/internal/controller/workloaddeployment_location_test.go new file mode 100644 index 00000000..ff996e73 
--- /dev/null +++ b/internal/controller/workloaddeployment_location_test.go @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" +) + +// newNetworkingScheme returns a scheme with compute + networkingv1alpha types. +func newNetworkingScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = computev1alpha.AddToScheme(s) + _ = networkingv1alpha.AddToScheme(s) + return s +} + +// TestReconcileNetworks_PersistsLocation_WhenLocationFound verifies that when a +// Location object matching the deployment's city code exists in the cluster, the +// resolved LocationReference is returned by reconcileNetworks and can be persisted +// to deployment.Status.Location. Instance creation must not be blocked — the +// function returns networkReady=false only because no NetworkInterfaces exist on +// the deployment in this scenario (short-circuit before bindings), not because +// Location was absent. 
+func TestReconcileNetworks_PersistsLocation_WhenLocationFound(t *testing.T) { + t.Parallel() + + const cityCode = "DFW" + const locationName = "loc-dfw-1" + const locationNamespace = "networking-system" + + location := &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{ + Name: locationName, + Namespace: locationNamespace, + }, + Spec: networkingv1alpha.LocationSpec{ + Topology: map[string]string{ + "topology.datum.net/city-code": cityCode, + }, + }, + } + + s := newNetworkingScheme() + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(location).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: "default"}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: cityCode, + // No NetworkInterfaces — only location resolution is under test here, so + // the networkReady result is discarded below rather than asserted. + }, + } + + r := &WorkloadDeploymentReconciler{} + _, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err) + require.NotNil(t, resolvedLocation, + "resolved location must be non-nil when a matching Location object exists") + assert.Equal(t, locationName, resolvedLocation.Name) + assert.Equal(t, locationNamespace, resolvedLocation.Namespace) + + // Simulate what the Reconcile loop does: persist resolvedLocation to Status. + deployment.Status.Location = resolvedLocation + assert.Equal(t, locationName, deployment.Status.Location.Name, + "Status.Location.Name must match the resolved Location object name") +} + +// TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound verifies that +// when no Location object in the cluster matches the deployment's city code, +// reconcileNetworks returns (false, nil, nil) — no error and no resolved +// location. The caller must treat nil location as best-effort and must NOT block +// instance creation.
+func TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound(t *testing.T) { + t.Parallel() + + s := newNetworkingScheme() + // Cluster has a Location for a DIFFERENT city code. + otherLocation := &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{Name: "loc-ord-1", Namespace: "networking-system"}, + Spec: networkingv1alpha.LocationSpec{ + Topology: map[string]string{ + "topology.datum.net/city-code": "ORD", + }, + }, + } + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(otherLocation).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: "default"}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "DFW", // no matching Location + }, + } + + r := &WorkloadDeploymentReconciler{} + networkReady, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err, "missing location must not cause an error") + assert.False(t, networkReady, "network is not ready when no location is found") + assert.Nil(t, resolvedLocation, + "resolved location must be nil when no matching Location object exists") + + // Status.Location remains nil — callers must not update it in this case. + // Confirm the deployment's Status.Location is unaffected (nil → nil). + assert.Nil(t, deployment.Status.Location, + "Status.Location must remain nil when no Location matches the city code") +} From 400e5778b2625aeb0602eaf2b2dffe1539e135cb Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:08:04 -0500 Subject: [PATCH 06/11] feat(controller): instance controller for federated scheduling Update the Instance controller to compute the Ready condition and apply the per-project quota gate within a single reconcile pass, surfacing blocking reasons when quota is unavailable so federated placement reflects real allocatable capacity. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/instance_controller.go | 811 +++++++++++-- .../controller/instance_controller_test.go | 1040 +++++++++++++++-- 2 files changed, 1620 insertions(+), 231 deletions(-) diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index e5bc3564..f11520a7 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -5,52 +5,154 @@ package controller import ( "context" "fmt" + "maps" "strings" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - ctrlsource "sigs.k8s.io/controller-runtime/pkg/source" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" "go.datum.net/compute/internal/controller/instancecontrol" + quotametrics 
"go.datum.net/compute/internal/quota" ) -const instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" +const ( + // instanceQuotaFinalizer ensures the quota ResourceClaim is deleted when + // an Instance is removed. + instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" + + // instanceControllerFinalizer is registered with the finalizer framework and + // triggers downstream write-back cleanup on deletion. + instanceControllerFinalizer = "compute.datumapis.com/instance-controller" + + // instanceQuotaClaimSourceLabel is stamped on ResourceClaim objects with the + // name of the edge cluster that created them. The claim watch predicate uses + // this label to filter out claims written by other edge controllers targeting + // the same project control planes. + instanceQuotaClaimSourceLabel = "compute.datumapis.com/source-cluster" + + // quotaResourceTypeInstances is the quota resource type for Instance count. + quotaResourceTypeInstances = "compute.datumapis.com/instances" + + // miloProjectAPIGroup is the API group for Milo resource-manager resources. + miloProjectAPIGroup = "resourcemanager.miloapis.com" + + // miloProjectKind is the Kind used for Milo Project resources. + miloProjectKind = "Project" + + // msgNotProgrammed is the human-readable message for the not-programmed state. + msgNotProgrammed = "Instance has not been programmed" + + // msgInstanceReady is the human-readable message for the ready state. + msgInstanceReady = "Instance is ready" + + // msgInstanceProgrammed is the human-readable message for the programmed state. + msgInstanceProgrammed = "Instance has been programmed" + + // msgInstanceRunning is the human-readable message for the running state. + msgInstanceRunning = "Instance is running" + + // reasonNetworkFailedToCreate is the reason code for network creation failure. + reasonNetworkFailedToCreate = "NetworkFailedToCreate" +) // clusterGetter is the subset of mcmanager.Manager used by InstanceReconciler. 
// Keeping it narrow allows unit tests to substitute a minimal fake. type clusterGetter interface { - GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) + GetCluster(ctx context.Context, clusterName multicluster.ClusterName) (cluster.Cluster, error) } +// InstanceProjectIDFunc derives the Milo project ID for a given Instance. +// In Milo mode the project ID equals the multicluster ClusterName. In +// single-cell mode it is decoded from the upstream-cluster-name namespace label. +// Returns ("", nil) when the instance has no project affiliation (skip quota). +// Returns ("", err) for transient failures that should trigger a requeue. +type InstanceProjectIDFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + +// InstanceProjectNamespaceFunc derives the in-project namespace where +// ResourceClaims for a given Instance should be created. In Milo mode this +// equals instance.Namespace. In single-cell mode it comes from the +// upstream-namespace namespace label. +// Returns ("", nil) when the instance has no project affiliation (skip quota). +// Returns ("", err) for transient failures that should trigger a requeue. +type InstanceProjectNamespaceFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + // InstanceReconciler reconciles an Instance object type InstanceReconciler struct { - mgr clusterGetter - managementCluster cluster.Cluster + mgr clusterGetter + scheme *runtime.Scheme + quotaClientManager *quotametrics.ProjectQuotaClientManager + edgeClusterName string + // recorder emits Kubernetes events on the Instance object for quota failure + // modes so operators can diagnose issues via `kubectl describe`. + recorder record.EventRecorder + // projectIDForInstance derives the Milo project ID used for quota + // ResourceClaim management. 
In Milo mode it returns string(clusterName); in + // single-cell mode it reads the upstream-cluster-name label from the edge + // namespace and decodes "cluster-{projectID}" → "{projectID}". + projectIDForInstance InstanceProjectIDFunc + // projectNamespaceForInstance derives the in-project namespace where + // ResourceClaims must be created. In Milo mode the ResourceClaim lives in + // instance.Namespace (the project-level namespace); in single-cell mode the + // edge namespace is ns-{uid} which does not exist in the project control + // plane — the real namespace is the upstream-namespace label value (e.g. + // "default"). When nil, falls back to instance.Namespace. + projectNamespaceForInstance InstanceProjectNamespaceFunc + // clusterNameForProject maps a Milo project ID back to the multicluster + // ClusterName that owns that project's workloads. In Milo mode the + // ClusterName equals the project ID. In single-cell mode the only registered + // cluster is "single" regardless of project ID. When nil, falls back to + // multicluster.ClusterName(projectID), which is correct for Milo mode. + clusterNameForProject func(projectID string) multicluster.ClusterName + // FederationClient is an optional client pointing at the upstream + // Karmada/federation control plane (configured via --federation-kubeconfig). + // When non-nil, the reconciler writes a copy of each Instance back to the + // federation control plane so that the InstanceProjector (running in the + // management cluster) can aggregate status across all POP cells. Set to nil to + // disable federation write-back (e.g. in non-federation deployments).
+ FederationClient client.Client + finalizers finalizer.Finalizers } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/finalizers,verbs=update // +kubebuilder:rbac:groups=quota.miloapis.com,resources=resourceclaims,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (_ ctrl.Result, err error) { logger := log.FromContext(ctx) @@ -69,29 +171,24 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } + // Run the finalizer framework first. This handles downstream write-back cleanup + // via the Finalize method registered below. + finalizationResult, err := r.finalizers.Finalize(ctx, &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &instance); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + logger.Info("reconciling instance") defer logger.Info("reconcile complete") if !instance.DeletionTimestamp.IsZero() { - if controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) - var claim quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed getting resource claim for deletion: %w", err) - } - } else { - if err := r.managementCluster.GetClient().Delete(ctx, &claim); 
client.IgnoreNotFound(err) != nil { - return ctrl.Result{}, fmt.Errorf("failed deleting resource claim: %w", err) - } - } - - controllerutil.RemoveFinalizer(&instance, instanceQuotaFinalizer) - if err := cl.GetClient().Update(ctx, &instance); err != nil { - return ctrl.Result{}, fmt.Errorf("failed removing quota finalizer: %w", err) - } - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.reconcileDeletion(ctx, cl.GetClient(), req.ClusterName, &instance) } if !controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { @@ -102,94 +199,439 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, nil } - grantedCondition, err := r.reconcileQuotaClaim(ctx, req.ClusterName, &instance) + statusChanged, quotaErr := r.reconcileQuotaCondition(ctx, req.ClusterName, &instance) + + // Even when reconcileQuotaCondition returns a transient error, persist any + // condition change first so the failure reason is visible on the Instance. + // We return the error afterwards so controller-runtime requeues with backoff. + readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling quota claim: %w", err) + return ctrl.Result{}, err } - statusChanged := false + if statusChanged || readyChanged { + if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { + return ctrl.Result{}, err + } + // Return with the quota error (nil or transient) so controller-runtime + // requeues with backoff on failures. On the success path (quotaErr==nil) + // we fall through to removeQuotaSchedulingGate below instead of returning + // early, so the gate is cleared in the same reconcile pass rather than + // waiting for a requeue that may never come (ResourceClaim is immutable + // and local Instances are not watched). 
+ if quotaErr != nil { + if err := r.writeBackToUpstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, quotaErr + } + } else if quotaErr != nil { + // No status change but quota evaluation failed — return error to requeue. + return ctrl.Result{}, quotaErr + } + + if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + return ctrl.Result{}, err + } + + if err := r.writeBackToUpstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil +} + +// reconcileDeletion handles quota-claim cleanup when an Instance is being +// deleted. It removes the quota finalizer once the ResourceClaim is gone. +func (r *InstanceReconciler) reconcileDeletion(ctx context.Context, cl client.Client, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if !controllerutil.ContainsFinalizer(instance, instanceQuotaFinalizer) { + return nil + } + + if r.quotaClientManager != nil { + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project ID during deletion: %w", err) + } + if projectID == "" { + // Cannot locate the claim without a project ID. Log at ERROR and emit an + // event so the operator is aware of the orphaned claim. Fall through to + // finalizer removal so the Instance is not permanently stuck in Terminating. + // The orphaned claim will count against project budget until Milo's TTL/GC + // removes it. 
+ log.FromContext(ctx).Error(nil, "project ID unresolvable during deletion; ResourceClaim may be orphaned — budget leak possible", + "instance", instance.Name, "namespace", instance.Namespace) + r.recorder.Event(instance, corev1.EventTypeWarning, + "QuotaClaimOrphaned", + "Skipping ResourceClaim cleanup: project ID could not be resolved; claim may be orphaned in Milo project control plane") + quotametrics.ClaimOrphanedTotal.Inc() + } else { + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + return fmt.Errorf("failed getting quota client for deletion: %w", err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project namespace during deletion: %w", err) + } + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) + var claim quotav1alpha1.ResourceClaim + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: claimNamespace, Name: claimName}, &claim); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed getting resource claim for deletion: %w", err) + } + } else { + if err := projectClient.Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed deleting resource claim: %w", err) + } + } + } + } + + controllerutil.RemoveFinalizer(instance, instanceQuotaFinalizer) + if err := cl.Update(ctx, instance); err != nil { + return fmt.Errorf("failed removing quota finalizer: %w", err) + } + return nil +} + +// reconcileQuotaCondition reconciles the ResourceClaim and updates the +// InstanceQuotaGranted status condition. It returns (changed, err) where +// changed=true means a status update is required, and err non-nil means the +// reconciler should requeue (with backoff) in addition to writing the condition. 
+func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (bool, error) { + grantedCondition, claimErr := r.reconcileQuotaClaim(ctx, clusterName, instance) + + // reconcileQuotaClaim returns (condition, err). A non-nil error signals a + // transient infrastructure failure; a non-nil condition carries the reason to + // write. Both can be non-nil: write the condition AND requeue with backoff. switch { - case grantedCondition == nil || (grantedCondition.Status == metav1.ConditionFalse && grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason): - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition == nil && claimErr == nil: + // No claim yet and no error: labels not yet propagated. Stay PendingEvaluation. + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionUnknown, Reason: computev1alpha.InstanceQuotaGrantedReasonPendingEvaluation, Message: "Waiting for quota evaluation", ObservedGeneration: instance.Generation, + }), nil + + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionFalse && + grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason: + // Claim exists but pending — no AllowanceBucket. Distinct from "evaluating". 
+ changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceQuotaGrantedReasonNoBudget, + Message: "ResourceClaim is pending: no AllowanceBucket configured for this project", + ObservedGeneration: instance.Generation, + }) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonNoBudget, + "ResourceClaim pending: no AllowanceBucket configured for this project") + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonNoBudget).Inc() + return changed, claimErr + + case grantedCondition != nil && grantedCondition.Type == computev1alpha.InstanceQuotaGranted: + // reconcileQuotaClaim populated a structured failure condition. + changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: grantedCondition.Status, + Reason: grantedCondition.Reason, + Message: grantedCondition.Message, + ObservedGeneration: instance.Generation, }) + return changed, claimErr - case grantedCondition.Status == metav1.ConditionTrue: - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionTrue: + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr - case grantedCondition.Status == metav1.ConditionFalse: + case grantedCondition != nil: // False with a non-pending reason from ResourceClaim; NOTE(review): an Unknown status also matches this case and is reported as False — confirm intended reason := computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded if grantedCondition.Reason == quotav1alpha1.ResourceClaimValidationFailedReason { reason =
computev1alpha.InstanceQuotaGrantedReasonValidationFailed } - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionFalse, Reason: reason, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr + + default: // grantedCondition == nil && claimErr != nil — should not reach here + return false, claimErr } +} - readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) +// removeQuotaSchedulingGate removes the quota scheduling gate from the +// Instance spec once QuotaGranted=True has been persisted to status. +// It guards on ObservedGeneration to prevent a stale True condition from +// generation N unblocking a generation N+1 instance before quota for the +// new spec has been evaluated. +func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { + return nil + } + // Stale condition guard: only remove the gate if the condition reflects the + // current spec generation. A condition from an older generation means quota + // has not yet been evaluated for the current spec. 
+ if quotaGrantedCond.ObservedGeneration != instance.Generation { + return nil + } + if instance.Spec.Controller == nil { + return nil + } + + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gateRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + if gate.Name == instancecontrol.QuotaSchedulingGate.String() { + gateRemoved = true + continue + } + newGates = append(newGates, gate) + } + if !gateRemoved { + return nil + } + + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching quota scheduling gate: %w", err) + } + return nil +} + +// Finalize removes the downstream write-back Instance when the local Instance is +// deleted. It is a no-op when downstream federation is disabled. +func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + instance := obj.(*computev1alpha.Instance) + + downstreamInstance := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(instance), downstreamInstance) + if apierrors.IsNotFound(err) { + // Already gone — nothing to do. 
+ return finalizer.Result{}, nil + } if err != nil { - return ctrl.Result{}, err + return finalizer.Result{}, fmt.Errorf("failed getting downstream instance for deletion: %w", err) } - if statusChanged || readyChanged { - if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { - return ctrl.Result{}, err + if err := r.FederationClient.Delete(ctx, downstreamInstance); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed deleting downstream write-back instance: %w", err) + } + + return finalizer.Result{}, nil +} + +// writeBackToUpstream copies the Instance spec and status to the upstream +// Karmada/federation control plane so that the InstanceProjector can aggregate +// state from all POP cells. It is a no-op when FederationClient is nil (federation disabled). +func (r *InstanceReconciler) writeBackToUpstream(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if r.FederationClient == nil { + return nil + } + + // Encode the POP-cell cluster name using the same convention as NSO's + // MappedNamespaceResourceStrategy: "cluster-" with "/" → "_". + // This is the fallback; the namespace label takes precedence when present. + encodedClusterName := "cluster-" + strings.ReplaceAll(string(clusterName), "/", "_") + + // Read the upstream project namespace name and cluster name from the namespace + // labels stamped by NSO's MappedNamespaceResourceStrategy. These carry the true + // project cluster name (e.g. "cluster-datum-cloud") and upstream namespace (e.g. + // "default"), which the InstanceProjector needs to find the right project cluster. 
+ upstreamNamespace := instance.Namespace // fallback: cell namespace (ns-) + var downstreamNS corev1.Namespace + if err := r.FederationClient.Get(ctx, client.ObjectKey{Name: instance.Namespace}, &downstreamNS); err == nil { + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]; v != "" { + upstreamNamespace = v + } + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]; v != "" { + encodedClusterName = v } - // Return after the status update so that the next reconcile sees the - // updated QuotaGranted condition before attempting spec changes. - return ctrl.Result{}, nil } - // Remove the quota scheduling gate once QuotaGranted=True is persisted. - quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue { - if instance.Spec.Controller != nil { - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if gateRemoved { - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.GetClient().Patch(ctx, &instance, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - } + logger := log.FromContext(ctx) + missingLabels := []string{} + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + } { + if instance.Labels[key] == "" { + missingLabels = append(missingLabels, key) } } + if len(missingLabels) > 0 { + logger.Info("instance is missing linking labels for write-back; projection owner-ref will not be set", + 
"instance", instance.Name, "namespace", instance.Namespace, + "missingLabels", missingLabels) + } - return ctrl.Result{}, nil + writeBack := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name, + Namespace: instance.Namespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedClusterName, + downstreamclient.UpstreamOwnerNamespaceLabel: upstreamNamespace, + computev1alpha.WorkloadUIDLabel: instance.Labels[computev1alpha.WorkloadUIDLabel], + computev1alpha.WorkloadDeploymentUIDLabel: instance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + computev1alpha.InstanceIndexLabel: instance.Labels[computev1alpha.InstanceIndexLabel], + computev1alpha.WorkloadDeploymentNameLabel: instance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + computev1alpha.CityCodeLabel: instance.Labels[computev1alpha.CityCodeLabel], + computev1alpha.WorkloadNameLabel: instance.Labels[computev1alpha.WorkloadNameLabel], + computev1alpha.PlacementNameLabel: instance.Labels[computev1alpha.PlacementNameLabel], + }, + }, + Spec: instance.Spec, + } + + existing := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(writeBack), existing) + if apierrors.IsNotFound(err) { + // Ensure the namespace exists in the downstream control plane before creating the Instance. 
+ ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: instance.Namespace}} + if err := r.FederationClient.Create(ctx, ns); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed ensuring downstream namespace: %w", err) + } + if err := r.FederationClient.Create(ctx, writeBack); err != nil { + return fmt.Errorf("failed creating downstream write-back instance: %w", err) + } + writeBack.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, writeBack); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status after create: %w", err) + } + return nil + } + if err != nil { + return fmt.Errorf("failed getting downstream instance: %w", err) + } + + // Build a comparable map containing only the keys this function owns so that + // Karmada-managed labels on the existing object do not cause spurious updates. + ownedLabels := make(map[string]string, len(writeBack.Labels)) + for k := range writeBack.Labels { + ownedLabels[k] = existing.Labels[k] + } + + // Update spec + labels only if owned keys differ. + if !apiequality.Semantic.DeepEqual(existing.Spec, instance.Spec) || + !apiequality.Semantic.DeepEqual(ownedLabels, writeBack.Labels) { + existing.Spec = instance.Spec + // Merge writeBack.Labels into existing.Labels. Only keys owned by + // writeBackToUpstream are written; any labels Karmada or other actors + // have placed on the downstream object are preserved. + if existing.Labels == nil { + existing.Labels = make(map[string]string) + } + maps.Copy(existing.Labels, writeBack.Labels) + if err := r.FederationClient.Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance: %w", err) + } + } + + // Update status only if it differs. 
+ if !apiequality.Semantic.DeepEqual(existing.Status, instance.Status) { + existing.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status: %w", err) + } + } + + return nil } -func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (*metav1.Condition, error) { +// reconcileQuotaClaim attempts to create or observe a ResourceClaim for the +// given instance. It returns: +// - (nil, nil) — labels not yet propagated; caller sets PendingEvaluation +// - (condition, nil) — terminal condition (True/False/Unknown from claim or failure) +// - (condition, err) — condition to write + transient error to requeue with backoff +// +// The condition's Type field is always InstanceQuotaGranted when set by this function +// to distinguish it from ResourceClaim conditions returned directly. +func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (*metav1.Condition, error) { + if r.quotaClientManager == nil { + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, + Message: "Quota enforcement disabled: no credential configured", + }, nil + } + logger := log.FromContext(ctx) + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + // Transient: namespace API unreachable. Return structured condition + error. 
+ msg := fmt.Sprintf("Could not resolve project ID: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, fmt.Errorf("resolving project ID for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + if projectID == "" { + // Labels not yet propagated — bootstrap transient, not an error. + return nil, nil + } + + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + msg := fmt.Sprintf("Failed to build quota client for project %q: %v", projectID, err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting quota client for project %q: %w", projectID, err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + msg := fmt.Sprintf("Could not resolve project namespace: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, 
fmt.Errorf("resolving project namespace for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + if claimNamespace == "" { + return nil, nil + } + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) requests := []quotav1alpha1.ResourceRequest{ { - ResourceType: "compute.datumapis.com/instances", + ResourceType: quotaResourceTypeInstances, Amount: 1, }, } @@ -213,39 +655,99 @@ func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterNam desired := "av1alpha1.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{ Name: claimName, - Namespace: instance.Namespace, + Namespace: claimNamespace, + Labels: map[string]string{ + instanceQuotaClaimSourceLabel: r.edgeClusterName, + }, }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", - Name: clusterName, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instance.Name, - Namespace: instance.Namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, Requests: requests, }, } var existing quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { - if !apierrors.IsNotFound(err) { - return nil, fmt.Errorf("failed getting resource claim: %w", err) - } - if err := r.managementCluster.GetClient().Create(ctx, desired); err != nil { - return nil, fmt.Errorf("failed creating resource claim: %w", err) + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { + if apierrors.IsNotFound(err) { + // Claim doesn't exist yet — attempt to create it. 
+ createErr := projectClient.Create(ctx, desired) + if createErr == nil { + return nil, nil + } + return r.classifyCreateError(instance, projectID, claimNamespace, createErr) } - return nil, nil + // GET itself failed — treat as backend unavailable. + msg := fmt.Sprintf("Quota backend unreachable getting ResourceClaim: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting resource claim: %w", err) } grantedCondition := apimeta.FindStatusCondition(existing.Status.Conditions, quotav1alpha1.ResourceClaimGranted) return grantedCondition, nil } +// classifyCreateError maps a ResourceClaim creation error to a structured +// QuotaGranted condition with a specific reason, emits a Kubernetes event, and +// increments the appropriate metric counter. +func (r *InstanceReconciler) classifyCreateError( + instance *computev1alpha.Instance, + projectID, claimNamespace string, + err error, +) (*metav1.Condition, error) { + var reason, metricLabel, msg string + + switch { + case apierrors.IsNotFound(err): + // 404 on Create: either the project control plane path doesn't exist + // (project deleted) or the namespace doesn't exist yet. + if claimNamespace != "" { + // Namespace-level 404. 
+ reason = computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound + metricLabel = quotametrics.ReasonNamespaceNotFound + msg = fmt.Sprintf("Quota claim namespace %q not found on project %q control plane", claimNamespace, projectID) + } else { + reason = computev1alpha.InstanceQuotaGrantedReasonProjectNotFound + metricLabel = quotametrics.ReasonProjectNotFound + msg = fmt.Sprintf("Milo project %q not found", projectID) + } + case apierrors.IsForbidden(err) || apierrors.IsInvalid(err): + // 403/422: quota admission plugin rejected the claim. + reason = computev1alpha.InstanceQuotaGrantedReasonMisconfigured + metricLabel = quotametrics.ReasonMisconfigured + msg = fmt.Sprintf("Quota admission rejected ResourceClaim for project %q: %v", projectID, err) + default: + // Connectivity or server error — treat as backend unavailable. + reason = computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable + metricLabel = quotametrics.ReasonBackendUnavailable + msg = fmt.Sprintf("Quota backend unreachable creating ResourceClaim: %v", err) + } + + r.recorder.Event(instance, corev1.EventTypeWarning, reason, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(metricLabel).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: reason, + Message: msg, + }, fmt.Errorf("failed creating resource claim: %w", err) +} + func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores int64, memMiB int64, resolved bool) { rt := instance.Spec.Runtime if rt.Sandbox != nil { @@ -327,7 +829,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, ObservedGeneration: instance.Generation, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, } } else { readyCondition = readyCondition.DeepCopy() @@ -344,8 +846,9 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( return 
false, fmt.Errorf("failed checking for network creation failure: %w", err) } + readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { - readyCondition.Reason = "NetworkFailedToCreate" + readyCondition.Reason = reasonNetworkFailedToCreate readyCondition.Message = networkCreationFailureMessage } else { readyCondition.Reason = computev1alpha.InstanceReadyReasonSchedulingGatesPresent @@ -360,12 +863,13 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not programmed", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming if programmedCondition != nil && programmedCondition.Reason != pendingReason { readyCondition.Reason = programmedCondition.Reason } - readyCondition.Message = "Instance has not been programmed" + readyCondition.Message = msgNotProgrammed if programmedCondition != nil && programmedCondition.Status != metav1.ConditionUnknown { readyCondition.Message = programmedCondition.Message } @@ -379,6 +883,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if runningCondition == nil || runningCondition.Status != metav1.ConditionTrue { logger.Info("instance is not running", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = pendingReason if runningCondition != nil && runningCondition.Reason != pendingReason { readyCondition.Reason = runningCondition.Reason @@ -394,7 +899,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( readyCondition.Status = metav1.ConditionTrue readyCondition.Reason = computev1alpha.InstanceReadyReasonRunning - readyCondition.Message = "Instance is ready" + readyCondition.Message = msgInstanceReady return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } @@ -436,38 +941,118 @@ func (r 
*InstanceReconciler) checkForNetworkCreationFailure(ctx context.Context, return false, "", nil } +// resolveProjectID returns the Milo project ID to use for quota calls. +// When projectIDForInstance is set it delegates to that function; otherwise it +// falls back to string(clusterName), which is correct for Milo-mode deployments +// where the cluster name IS the project name. +// Returns ("", nil) to signal "no project, skip quota". Returns ("", err) for +// transient failures that should cause a reconcile requeue. +func (r *InstanceReconciler) resolveProjectID(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectIDForInstance != nil { + return r.projectIDForInstance(ctx, clusterName, instance) + } + return string(clusterName), nil +} + +// resolveProjectNamespace returns the namespace within the Milo project control +// plane where ResourceClaims for this instance should be created. +// When projectNamespaceForInstance is set it delegates to that function; +// otherwise it falls back to instance.Namespace, which is correct for +// Milo-mode deployments where the project-side namespace already matches the +// instance namespace. +// Returns ("", nil) to signal "no project, skip quota". Returns ("", err) for +// transient failures that should cause a reconcile requeue. +func (r *InstanceReconciler) resolveProjectNamespace(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectNamespaceForInstance != nil { + return r.projectNamespaceForInstance(ctx, clusterName, instance) + } + return instance.Namespace, nil +} + +// resolveClusterNameForProject returns the multicluster ClusterName for the +// given project ID. 
When clusterNameForProject is set it delegates to that +// function; otherwise it falls back to multicluster.ClusterName(projectID), +// which is correct for Milo-mode deployments where the cluster name IS the +// project name. +func (r *InstanceReconciler) resolveClusterNameForProject(projectID string) multicluster.ClusterName { + if r.clusterNameForProject != nil { + return r.clusterNameForProject(projectID) + } + return multicluster.ClusterName(projectID) +} + // SetupWithManager sets up the controller with the Manager. -func (r *InstanceReconciler) SetupWithManager(mgr mcmanager.Manager, managementCluster cluster.Cluster) error { +// +// quotaRestConfig is the REST config used to reach Milo project control planes +// for ResourceClaim management. Pass nil to disable quota accounting. +// +// projectIDForInstance derives the Milo project ID for each reconcile request. +// In Milo mode pass nil (falls back to using ClusterName). In single-cell mode +// pass a function that returns instance.Namespace. +// +// clusterNameForProject maps a project ID back to the multicluster ClusterName. +// In Milo mode pass nil (falls back to ClusterName(projectID)). In single-cell +// mode pass a function that always returns "single". +func (r *InstanceReconciler) SetupWithManager( + mgr mcmanager.Manager, + quotaRestConfig *rest.Config, + projectIDForInstance InstanceProjectIDFunc, + projectNamespaceForInstance InstanceProjectNamespaceFunc, + edgeClusterName string, + clusterNameForProject func(projectID string) multicluster.ClusterName, +) error { r.mgr = mgr - r.managementCluster = managementCluster - - // Watch ResourceClaim objects on the management cluster directly, bypassing - // the multicluster clusterInjectingQueue which would overwrite ClusterName. - // Using ctrlsource.TypedKind lets the handler produce mcreconcile.Request - // values with the correct ClusterName taken from claim.Spec.ConsumerRef.Name. 
- claimSource := ctrlsource.TypedKind( - managementCluster.GetCache(), - "av1alpha1.ResourceClaim{}, - handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, claim *quotav1alpha1.ResourceClaim) []mcreconcile.Request { - if claim.Spec.ResourceRef.Kind != "Instance" || claim.Spec.ResourceRef.APIGroup != "compute.datumapis.com" { - return nil - } - return []mcreconcile.Request{ - { - Request: reconcile.Request{ - NamespacedName: types.NamespacedName{ - Name: claim.Spec.ResourceRef.Name, - Namespace: claim.Spec.ResourceRef.Namespace, - }, - }, - ClusterName: claim.Spec.ConsumerRef.Name, - }, - } - }), - ) + r.scheme = mgr.GetLocalManager().GetScheme() + //nolint:staticcheck // GetEventRecorder (new events API) has an incompatible Eventf + // signature (requires related object + action args) that would require migrating + // all emit sites. GetEventRecorderFor remains correct; migration is deferred. + r.recorder = mgr.GetLocalManager().GetEventRecorderFor("instance-controller") + r.edgeClusterName = edgeClusterName + r.projectIDForInstance = projectIDForInstance + r.projectNamespaceForInstance = projectNamespaceForInstance + r.clusterNameForProject = clusterNameForProject + if quotaRestConfig != nil { + if edgeClusterName == "" { + return fmt.Errorf("edgeClusterName must be set when quota enforcement is enabled; set discovery.clusterName in the server config") + } + r.quotaClientManager = quotametrics.New(quotaRestConfig) + } + + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(instanceControllerFinalizer, r); err != nil { + return fmt.Errorf("failed to register finalizer: %w", err) + } + + edgeClusterNameVal := r.edgeClusterName return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Instance{}, mcbuilder.WithEngageWithLocalCluster(false)). - WatchesRawSource(claimSource). 
+ Watches( + "av1alpha1.ResourceClaim{}, + func(_ multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc( + func(ctx context.Context, obj client.Object) []mcreconcile.Request { + claim := obj.(*quotav1alpha1.ResourceClaim) + if claim.Spec.ResourceRef.Name == "" { + return nil + } + return []mcreconcile.Request{ + { + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: claim.Spec.ResourceRef.Namespace, + Name: claim.Spec.ResourceRef.Name, + }, + }, + ClusterName: r.resolveClusterNameForProject(claim.Spec.ConsumerRef.Name), + }, + } + }, + ) + }, + mcbuilder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { + return obj.GetLabels()[instanceQuotaClaimSourceLabel] == edgeClusterNameVal + })), + ). Complete(r) } diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 1a15090b..31636c3f 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -3,7 +3,6 @@ package controller import ( "context" "fmt" - "net/http" "testing" "github.com/stretchr/testify/assert" @@ -12,50 +11,39 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha 
"go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" + "go.datum.net/compute/internal/quota" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" ) -// fakeCluster implements cluster.Cluster for testing using a fake client. -type fakeCluster struct { - client client.Client - scheme *runtime.Scheme -} - -func (f *fakeCluster) GetHTTPClient() *http.Client { return nil } -func (f *fakeCluster) GetConfig() *rest.Config { return nil } -func (f *fakeCluster) GetCache() cache.Cache { return nil } -func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.scheme } -func (f *fakeCluster) GetClient() client.Client { return f.client } -func (f *fakeCluster) GetFieldIndexer() client.FieldIndexer { return nil } -func (f *fakeCluster) GetEventRecorderFor(string) record.EventRecorder { return nil } -func (f *fakeCluster) GetRESTMapper() apimeta.RESTMapper { return nil } -func (f *fakeCluster) GetAPIReader() client.Reader { return f.client } -func (f *fakeCluster) Start(context.Context) error { return nil } - -// fakeMCManager is a minimal multicluster manager that returns a single cluster. -type fakeMCManager struct { - clusters map[string]cluster.Cluster -} - -func (m *fakeMCManager) GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) { - cl, ok := m.clusters[clusterName] - if !ok { - return nil, fmt.Errorf("cluster %q not found", clusterName) - } - return cl, nil -} +// Test constants for repeated string literals across controller package tests. 
+const ( + testInstanceName = "test-instance" + testReasonString = "TestReason" + testMessageString = "Test message" + testUIDString = "test-uid" + testInstanceType = "d1-standard-2" + testDefaultPlacement = "default" + testDefaultNamespace = "default" + testEdgeClusterName = "test-edge" + testComputeAPIVersion = "compute.datumapis.com/v1alpha" + testQuotaAPIGroup = "quota.miloapis.com" + testQuotaResource = "resourceclaims" + kindWorkloadDeploymentTest = "WorkloadDeployment" // mirrors kindWorkloadDeployment +) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. func newTestScheme(t *testing.T) *runtime.Scheme { @@ -79,8 +67,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance without ready condition should create default", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, }, @@ -89,7 +77,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, ObservedGeneration: 1, }, }, @@ -97,8 +85,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates should set scheduling gates present", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -114,7 +102,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, 
ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -134,8 +122,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates and network failure should set network failed", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -153,7 +141,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "NetworkFailedToCreate", + Reason: reasonNetworkFailedToCreate, Message: "Network creation failed: timeout", ObservedGeneration: 1, }, @@ -162,8 +150,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance not programmed should set pending programming", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -171,8 +159,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -181,8 +169,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -190,8 +178,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance programmed but not running should wait for running", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: 
"default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -200,13 +188,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -215,8 +203,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -224,8 +212,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance fully ready should set ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -234,13 +222,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, }, }, }, @@ -250,7 +238,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is 
ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -258,8 +246,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "no change when condition already matches", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -268,7 +256,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -276,13 +264,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, }, }, }, @@ -292,7 +280,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -343,8 +331,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota denied blocks ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -360,14 +348,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: 
computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, LastTransitionTime: metav1.Now(), }, }, @@ -385,8 +373,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota available does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -402,14 +390,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, LastTransitionTime: metav1.Now(), }, }, @@ -420,15 +408,15 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Message: msgInstanceReady, }, }, { name: "quota pending unknown does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -448,7 +436,7 @@ func 
TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, }, }, } @@ -501,25 +489,28 @@ func TestReconcileQuota(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: deploymentName, Namespace: namespace, - UID: "test-uid", + UID: testUIDString, }, } } // makeInstance creates a test Instance with an owner reference to the // deployment so that checkForNetworkCreationFailure can look it up. + // Both finalizers are pre-populated so that the finalizer framework does + // not need to add instanceControllerFinalizer on the first reconcile, + // which would cause an early return before quota logic runs. makeInstance := func(_ *runtime.Scheme, gates ...computev1alpha.SchedulingGate) *computev1alpha.Instance { return &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ Name: instanceName, - Namespace: namespace, - Finalizers: []string{instanceQuotaFinalizer}, + Namespace: testDefaultNamespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, OwnerReferences: []metav1.OwnerReference{ { - APIVersion: "compute.datumapis.com/v1alpha", - Kind: "WorkloadDeployment", + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, Name: deploymentName, - UID: "test-uid", + UID: testUIDString, Controller: func() *bool { b := true; return &b }(), }, }, @@ -529,7 +520,7 @@ func TestReconcileQuota(t *testing.T) { SchedulingGates: gates, }, Runtime: computev1alpha.InstanceRuntimeSpec{ - Resources: computev1alpha.InstanceRuntimeResources{InstanceType: "d1-standard-2"}, + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, }, NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, }, @@ -544,18 +535,21 @@ func TestReconcileQuota(t *testing.T) { }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: 
quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, Name: clusterName, }, + // ResourceRef points at the Project resource (cluster-scoped), not the + // Instance. The quota admission plugin validates against the + // ResourceRegistration's claimingResources, which only allows + // resourcemanager.miloapis.com/Project. ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instanceName, - Namespace: namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: clusterName, }, Requests: []quotav1alpha1.ResourceRequest{ - {ResourceType: "compute.datumapis.com/instances", Amount: 1}, + {ResourceType: quotaResourceTypeInstances, Amount: 1}, }, }, Status: quotav1alpha1.ResourceClaimStatus{ @@ -572,7 +566,7 @@ func TestReconcileQuota(t *testing.T) { } } - newReconciler := func(t *testing.T, projectObjs []client.Object, mgmtObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { + newReconciler := func(t *testing.T, projectObjs []client.Object, quotaObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { t.Helper() s := newTestScheme(t) @@ -582,26 +576,44 @@ func TestReconcileQuota(t *testing.T) { WithStatusSubresource(&computev1alpha.Instance{}). Build() - mgmtClient := fake.NewClientBuilder(). + quotaClient := fake.NewClientBuilder(). WithScheme(s). - WithObjects(mgmtObjs...). + WithObjects(quotaObjs...). WithStatusSubresource(&quotav1alpha1.ResourceClaim{}).
Build() mgr := &fakeMCManager{ clusters: map[string]cluster.Cluster{ - clusterName: &fakeCluster{client: projectClient, scheme: s}, + clusterName: newFakeCluster(projectClient), }, } + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + r := &InstanceReconciler{ - mgr: mgr, - managementCluster: &fakeCluster{client: mgmtClient, scheme: s}, + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + // Milo mode: project ID == ClusterName; claim namespace == instance.Namespace. + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + // nil → falls back to instance.Namespace, which is correct for Milo mode. + projectNamespaceForInstance: nil, } - return r, projectClient, mgmtClient + + // Initialize the finalizer registry so that r.finalizers.Finalize is not + // a nil-pointer dereference. SetupWithManager does this in production; in + // tests we replicate the same steps manually. + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + return r, projectClient, quotaClient } - t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True", func(t *testing.T) { + t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True in single reconcile", func(t *testing.T) { s := newTestScheme(t) instance := makeInstance(s, computev1alpha.SchedulingGate{Name: instancecontrol.NetworkSchedulingGate.String()}, @@ -611,7 +623,10 @@ func TestReconcileQuota(t *testing.T) { r, projectClient, _ := newReconciler(t, []client.Object{instance, makeDeployment()}, []client.Object{claim}) - // First reconcile: sets QuotaGranted=True in status, returns early. + // Single reconcile: sets QuotaGranted=True in status AND removes the + // Quota scheduling gate in the same pass. 
The early-return-before-gate- + // removal bug required a second reconcile that never arrived because + // ResourceClaims are immutable and local Instances are not watched. _, err := r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -623,19 +638,13 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) - // Second reconcile: status is already set, so removes the scheduling gate. - _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) - hasQuotaGate := false for _, g := range updated.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate must be removed in the same reconcile pass as the status update") }) t.Run("quota exceeded flow: conditions cascade to block Programmed/Running/Ready", func(t *testing.T) { @@ -709,7 +718,9 @@ func TestReconcileQuota(t *testing.T) { } require.NoError(t, mgmtClient.Status().Update(context.Background(), &existingClaim)) - // Second reconcile should see granted claim and update status. + // Second reconcile should see the granted claim, update status to + // QuotaGranted=True, AND remove the gate in the same pass (no third + // reconcile required). 
_, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -719,28 +730,41 @@ func TestReconcileQuota(t *testing.T) { require.NotNil(t, quotaCond) assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) - // Third reconcile removes the gate (status is already true, no more status write needed). - _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &recovered)) hasQuotaGate := false for _, g := range recovered.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed after quota granted") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate should be removed in the same reconcile pass that sets QuotaGranted=True") }) t.Run("deleted before grant: finalizer deletes claim and is removed", func(t *testing.T) { s := newTestScheme(t) now := metav1.Now() - instance := makeInstance(s, - computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, - ) - instance.DeletionTimestamp = &now + // Build the instance directly without instanceControllerFinalizer to + // represent the state after the Karmada finalizer has already been + // cleaned up; only the quota finalizer remains to be processed. 
+ instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } claim := makeClaim(s, metav1.ConditionFalse, quotav1alpha1.ResourceClaimPendingReason) @@ -766,3 +790,783 @@ func TestReconcileQuota(t *testing.T) { } }) } + +// TestQuotaGateRemovedInSingleReconcile is a regression test for the bug where +// the Quota scheduling gate was never removed from an Instance after quota was +// granted. The root cause was an early return in the Reconcile function: when +// reconcileQuotaCondition set QuotaGranted=True (statusChanged=true), the code +// wrote the status update and returned before reaching removeQuotaSchedulingGate. +// Because ResourceClaims are immutable (no further transitions) and local +// Instances are not watched (WithEngageWithLocalCluster(false)), no requeue ever +// arrived — leaving the Quota gate stranded in spec.controller.schedulingGates +// and the projected Instance stuck "Pending (SchedulingGatesPresent)". +// +// The fix: on the success path (quotaErr==nil), fall through to +// removeQuotaSchedulingGate after persisting the status update, so gate removal +// happens in the same reconcile pass as the QuotaGranted=True status write. 
+func TestQuotaGateRemovedInSingleReconcile(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "my-instance" + deploymentName = "my-deployment" + ) + + claimName := namespace + "--" + instanceName + + tests := []struct { + name string + initialGates []computev1alpha.SchedulingGate + expectGateGone bool + }{ + { + name: "Quota gate only: removed in single reconcile when claim is granted", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "Quota gate plus Network gate: Quota removed, Network preserved", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "No gates: no-op, reconcile completes cleanly", + initialGates: []computev1alpha.SchedulingGate{}, + expectGateGone: false, // no gate to begin with + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + Generation: 1, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: tt.initialGates, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: 
deploymentName, Namespace: namespace, UID: testUIDString}, + } + + // ResourceClaim already in QuotaAvailable state — simulates the state + // that triggered the bug: claim already granted but gate still present. + claim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota available", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}).
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Exactly one reconcile — must be sufficient to both set QuotaGranted=True + // and remove the Quota gate. No second reconcile should be required. + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + }) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + + // QuotaGranted condition must be set to True. + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be present") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Quota gate must be gone after the single reconcile. 
+ hasQuotaGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasQuotaGate = true + } + } + if tt.expectGateGone { + assert.False(t, hasQuotaGate, + "Quota gate must be removed in the same reconcile pass as the QuotaGranted=True status write; "+ + "a stranded gate leaves the projected Instance stuck Pending (SchedulingGatesPresent)") + } + + // Network gate (if present) must be preserved — only the Quota gate is + // cleared by InstanceReconciler; NetworkSchedulingGate is owned by + // WorkloadDeploymentReconciler. + for _, g := range updated.Spec.Controller.SchedulingGates { + assert.NotEqual(t, instancecontrol.QuotaSchedulingGate.String(), g.Name, + "Quota gate must not remain after granted claim") + } + }) + } +} + +// TestReconcileQuotaSingleMode verifies that in single-cell mode: +// - the project ID is decoded from the upstream-cluster-name label on the edge +// namespace (not taken from the always-"single" ClusterName) +// - the ResourceClaim is created in the in-project namespace (upstream-namespace +// label, e.g. "default"), not in the edge namespace (ns-abc123) +// - the ResourceRef points at resourcemanager.miloapis.com/Project, not Instance +func TestReconcileQuotaSingleMode(t *testing.T) { + const ( + instanceName = "my-instance" + edgeNS = "ns-abc123" // edge namespace (ns-{uid}) — does NOT exist in project CP + projectID = "datum-cloud" // decoded from "cluster-datum-cloud" + projectNS = "default" // upstream-namespace label value — where claims live + deploymentName = "my-deployment" + ) + + // Claim name uses the edge namespace prefix (stable identifier for the claim) + // but the claim object itself lives in projectNS. 
+ claimName := edgeNS + "--" + instanceName + + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: edgeNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: deploymentName, Namespace: edgeNS, UID: "test-uid"}, + } + + // ResourceClaim lives in projectNS ("default"), not edgeNS ("ns-abc123"). + // ResourceRef points at the Project resource, matching the ResourceRegistration's + // claimingResources (resourcemanager.miloapis.com/Project only). 
+ claim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: projectNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + // The quota client is keyed by projectID ("datum-cloud"), matching what + // projectIDForInstance returns after decoding "cluster-datum-cloud". + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}). + Build() + + qm := quota.New(nil) + qm.StoreClient(projectID, quotaClient) + + const singleCluster = "single" + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + singleCluster: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: singleCluster, + // Single-cell mode: project ID decoded from upstream-cluster-name label. + // Simulates what cmd/main.go does for "cluster-datum-cloud" → "datum-cloud". + projectIDForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectID, nil + }, + // Single-cell mode: claim namespace comes from upstream-namespace label.
+ // Simulates what cmd/main.go does by reading the edge namespace labels. + projectNamespaceForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectNS, nil + }, + // Single-cell mode: watch map func must always return "single". + clusterNameForProject: func(_ string) multicluster.ClusterName { + return singleCluster + }, + } + + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + req := mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: edgeNS, Name: instanceName}}, + ClusterName: singleCluster, + } + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: edgeNS, Name: instanceName}, &updated)) + + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be set") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status, "quota should be granted in single mode") + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Verify clusterNameForProject always returns "single" so the watch map func + // never enqueues an unknown cluster name. + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject(projectID)) + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject("any-other-project")) + + // Verify resolveProjectNamespace returns the in-project namespace, not the edge namespace. 
+ resolvedNS, resolveErr := r.resolveProjectNamespace(context.Background(), singleCluster, instance) + require.NoError(t, resolveErr) + assert.Equal(t, projectNS, resolvedNS, "claim namespace must be the in-project namespace, not the edge namespace") +} + +// TestReconcileQuotaFailureModes verifies that infrastructure failures in the +// quota path set specific QuotaGranted=False conditions (fail-closed) rather +// than silently allowing workloads to schedule. +func TestReconcileQuotaFailureModes(t *testing.T) { + const ( + testProject = "test-project" + testNS = "default" + testInstance = "my-instance" + testDeployment = "my-deployment" + ) + + makeInstance := func() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstance, + Namespace: testNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: testDeployment, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + } + + makeDeployment := func() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: testDeployment, Namespace: testNS, UID: testUIDString}, + } + } + + newReconcilerWithInterceptor := func( + t *testing.T, + funcs interceptor.Funcs, + fakeRecorder *record.FakeRecorder, + ) (*InstanceReconciler, client.Client) { + t.Helper() + s := newTestScheme(t) + + projectClient := 
fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithInterceptorFuncs(funcs). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient + } + + reconcileReq := func() mcreconcile.Request { + return mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: testNS, Name: testInstance}}, + ClusterName: testProject, + } + } + + t.Run("FM-2: backend unreachable sets QuotaBackendUnavailable", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return fmt.Errorf("connection refused") + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + // Reconcile returns error for transient failures. 
+ require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, cond.Reason) + + // Event should have been emitted. + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable) + default: + t.Error("expected a Warning event for backend unavailable, got none") + } + }) + + // FM-4/FM-5: 404 on Create maps to NamespaceNotFound when the claim namespace + // is known (the more common case for project-exists-but-namespace-absent), and + // to ProjectNotFound when the namespace itself is empty (project CP path missing). + t.Run("FM-5: 404 on Create with known namespace sets QuotaNamespaceNotFound", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + notFoundErr := apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return notFoundErr + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return notFoundErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, 
computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + // claimNamespace == testNS (non-empty) → NamespaceNotFound, not ProjectNotFound. + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound, cond.Reason, + "404 on Create with known namespace should map to NamespaceNotFound") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound) + default: + t.Error("expected a Warning event for namespace not found, got none") + } + }) + + t.Run("FM-6: 403 on Create sets QuotaMisconfigured", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + forbiddenErr := apierrors.NewForbidden( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim", + fmt.Errorf("ResourceRegistration not found")) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return forbiddenErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonMisconfigured, cond.Reason, + "403 on Create should map to Misconfigured") + + select { + case event := <-fakeRecorder.Events: + 
assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonMisconfigured) + default: + t.Error("expected a Warning event for misconfigured quota, got none") + } + }) + + t.Run("FM-7: claim pending with no budget sets QuotaNoBudget", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + claimName := testNS + "--" + testInstance + pendingClaim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionFalse, + Reason: quotav1alpha1.ResourceClaimPendingReason, + Message: "No AllowanceBucket configured", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(pendingClaim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}).
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err, "pending-no-budget is not a transient error — no requeue needed") + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionUnknown, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNoBudget, cond.Reason, + "pending claim with no budget should use NoBudget reason, not PendingEvaluation") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNoBudget) + default: + t.Error("expected a Warning event for no budget, got none") + } + }) + + t.Run("quota disabled: quotaClientManager nil sets QuotaDisabled (not QuotaAvailable)", func(t *testing.T) { + s := newTestScheme(t) + instance := makeInstance() + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: nil, // explicitly disabled + edgeClusterName: testEdgeClusterName, + recorder: record.NewFakeRecorder(10), + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, cond.Reason, + "intentionally disabled quota should use QuotaDisabled reason") + }) + + t.Run("observedGeneration guard: stale True condition does not remove gate for new generation", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + // Instance at generation 2 with a stale QuotaGranted=True from generation 1. + instance := makeInstance() + instance.Generation = 2 + instance.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, + Message: "quota granted (generation 1)", + ObservedGeneration: 1, // stale — does not match instance.Generation=2 + LastTransitionTime: metav1.Now(), + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). 
+ WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + claimName := testNS + "--" + testInstance + grantedClaim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + ResourceRef: quotav1alpha1.UnversionedObjectReference{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + Requests: []quotav1alpha1.ResourceRequest{{ResourceType: quotaResourceTypeInstances, Amount: 1}}, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(grantedClaim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Single reconcile: reconcileQuotaCondition writes QuotaGranted=True with + // ObservedGeneration=2 into the in-memory instance, status is persisted, + // then removeQuotaSchedulingGate reads the in-memory condition (gen=2 == + // instance.Generation=2) and removes the gate — all in one pass.
+ _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + hasGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasGate = true + } + } + assert.False(t, hasGate, "gate should be removed in the same reconcile that refreshes the condition to current generation") + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, int64(2), cond.ObservedGeneration, "condition must reflect current generation") + }) +} From a8c2bd550bd19e4e783e830149c7a0a00a78b9ce Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:08:16 -0500 Subject: [PATCH 07/11] feat(cmd): cell and management-plane wiring with feature gates Wire the manager to run in either cell or management-plane mode, gating the federator, projector, and per-cell controllers behind feature flags. Add the feature-gate registry and extend configuration to carry the downstream kubeconfig and discovery settings each mode needs. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/main.go | 347 ++++++++++++++++++++++++----- internal/config/config.go | 47 ++++ internal/config/config_test.go | 67 ++++++ internal/features/features.go | 59 +++++ internal/features/features_test.go | 43 ++++ 5 files changed, 502 insertions(+), 61 deletions(-) create mode 100644 internal/features/features.go create mode 100644 internal/features/features_test.go diff --git a/cmd/main.go b/cmd/main.go index 3bb44bc9..01d3eddd 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -8,6 +8,8 @@ import ( "flag" "fmt" "os" + "strings" + "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 
// to ensure that exec-entrypoint and run can make use of them. @@ -18,29 +20,42 @@ import ( "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcsingle "sigs.k8s.io/multicluster-runtime/providers/single" + karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" + "go.datum.net/compute/internal/features" + quotametrics "go.datum.net/compute/internal/quota" computewebhook "go.datum.net/compute/internal/webhook" computev1alphawebhooks "go.datum.net/compute/internal/webhook/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" multiclusterproviders "go.miloapis.com/milo/pkg/multicluster-runtime" milomulticluster "go.miloapis.com/milo/pkg/multicluster-runtime/milo" + corev1 "k8s.io/api/core/v1" // +kubebuilder:scaffold:imports ) +// singleClusterName is the fixed cluster name that mcsingle.New registers. +// All single-mode wiring that references this cluster must use this constant. 
+const singleClusterName = "single" + var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") @@ -51,6 +66,11 @@ var ( gitCommit = "unknown" gitTreeState = "unknown" buildDate = "unknown" + + // federationRestConfig holds the REST config for the Karmada federation control + // plane. It is populated from --federation-kubeconfig when set, and is nil + // when the flag is omitted. + federationRestConfig *rest.Config ) func init() { @@ -61,22 +81,45 @@ func init() { utilruntime.Must(computev1alpha.AddToScheme(scheme)) utilruntime.Must(networkingv1alpha.AddToScheme(scheme)) utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) + utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) + utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) // +kubebuilder:scaffold:scheme } +//nolint:gocyclo // main wires all controller paths; complexity is inherent to startup sequencing func main() { var enableLeaderElection bool var leaderElectionNamespace string var probeAddr string var serverConfigFile string + var federationKubeconfig string + var federationContext string + var enableManagementControllers bool + var enableCellControllers bool flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.") + flag.StringVar(&federationKubeconfig, "federation-kubeconfig", "", + "Path to the kubeconfig file for the Karmada federation control plane. "+ + "Required when --enable-management-controllers is set. "+ + "When omitted, federation features are disabled.") + flag.StringVar(&federationContext, "federation-context", "", + "Context to use from the federation kubeconfig. 
When omitted, the current context is used.") + flag.BoolVar(&enableManagementControllers, "enable-management-controllers", false, + "Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector).") + flag.BoolVar(&enableCellControllers, "enable-cell-controllers", false, + "Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler).") + + var featureGatesFlag string + flag.StringVar(&featureGatesFlag, "feature-gates", "", + "A set of key=value pairs that describe feature gates for the compute operator. "+ + "Example: --feature-gates=NetworkingIntegration=false. "+ + "Available features: NetworkingIntegration (default=true).") opts := zap.Options{ Development: true, @@ -87,8 +130,47 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + if featureGatesFlag != "" { + if err := features.MutableFeatureGate.Set(featureGatesFlag); err != nil { + setupLog.Error(err, "unable to parse feature gates", "feature-gates", featureGatesFlag) + os.Exit(1) + } + } + setupLog.Info("feature gates", "NetworkingIntegration", features.FeatureGate.Enabled(features.NetworkingIntegration)) + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + // Load the federation (Karmada) control plane REST config when + // --federation-kubeconfig is provided. When the flag is omitted, + // federationRestConfig remains nil; management controllers will refuse to + // start if --enable-management-controllers is also set. 
+ if federationKubeconfig != "" { + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: federationKubeconfig}, + &clientcmd.ConfigOverrides{CurrentContext: federationContext}, + ) + var err error + federationRestConfig, err = loader.ClientConfig() + if err != nil { + setupLog.Error(err, "unable to load federation kubeconfig", "path", federationKubeconfig) + os.Exit(1) + } + setupLog.Info("federation kubeconfig loaded", "path", federationKubeconfig) + } + + // Fail loud: management controllers require a federation kubeconfig. Silently + // skipping them when --enable-management-controllers is set would leave + // federation and instance projection broken with no visible signal — the same + // class of failure as the quota P1 issue. An operator who explicitly enables + // management controllers but omits --federation-kubeconfig has a misconfiguration + // that must surface immediately rather than at runtime. + if enableManagementControllers && federationRestConfig == nil { + setupLog.Error(nil, + "management controllers enabled but no federation kubeconfig configured", + "hint", "set --federation-kubeconfig") + os.Exit(1) + } + setupLog.Info("starting compute", "version", version, "gitCommit", gitCommit, @@ -96,24 +178,28 @@ func main() { "buildDate", buildDate, ) - var serverConfig config.WorkloadOperator - var configData []byte - if len(serverConfigFile) > 0 { - var err error - configData, err = os.ReadFile(serverConfigFile) - if err != nil { - setupLog.Error(fmt.Errorf("unable to read server config from %q", serverConfigFile), "") - os.Exit(1) - } - } - - if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { - setupLog.Error(err, "unable to decode server config") + serverConfig, err := loadServerConfig(serverConfigFile) + if err != nil { + setupLog.Error(err, "unable to load server config") os.Exit(1) } setupLog.Info("server config", "config", serverConfig) 
+ quotaRestConfig, err := serverConfig.Discovery.QuotaRestConfig() + if err != nil { + setupLog.Error(err, "unable to load quota REST config") + os.Exit(1) + } + if quotaRestConfig != nil { + setupLog.Info("quota REST config loaded", "path", serverConfig.Discovery.QuotaKubeconfigPath) + quotametrics.EnforcementEnabled.Set(1) + } else { + setupLog.Error(nil, "quota enforcement is DISABLED — workloads will schedule without quota accounting; "+ + "set quotaKubeconfigPath in server config to enable enforcement") + quotametrics.EnforcementEnabled.Set(0) + } + cfg := ctrl.GetConfigOrDie() deploymentCluster, err := cluster.New(cfg, func(o *cluster.Options) { @@ -124,7 +210,9 @@ func main() { os.Exit(1) } - runnables, provider, err := initializeClusterDiscovery(serverConfig, deploymentCluster, scheme) + runnables, provider, edgeClusterName, err := initializeClusterDiscovery( + serverConfig, deploymentCluster, scheme, + ) if err != nil { setupLog.Error(err, "unable to initialize cluster discovery") os.Exit(1) @@ -176,21 +264,65 @@ func main() { os.Exit(1) } - if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Workload") - os.Exit(1) + if enableManagementControllers { + if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Workload") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") - os.Exit(1) + + // Build a single federation client shared across all controllers that need to + // read or write to the Karmada federation control plane. This is the hub that + // the management controllers federate through and that edge cells write back to. + // Nil when --federation-kubeconfig is not set (i.e. federation is disabled). 
+ var federationClient client.Client + if federationRestConfig != nil { + federationClient, err = client.New(federationRestConfig, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create federation client") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler") - os.Exit(1) + + if enableCellControllers { + if err = (&controller.WorkloadDeploymentReconciler{ + NetworkingEnabled: features.FeatureGate.Enabled(features.NetworkingIntegration), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") + os.Exit(1) + } } - if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Instance") - os.Exit(1) + + if enableCellControllers { + clusterNameForProject := func(_ string) multicluster.ClusterName { + return multicluster.ClusterName(singleClusterName) + } + instanceReconciler := &controller.InstanceReconciler{FederationClient: federationClient} + err = instanceReconciler.SetupWithManager( + mgr, + quotaRestConfig, + singleModeProjectID(mgr), + singleModeProjectNamespace(mgr), + edgeClusterName, + clusterNameForProject, + ) + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Instance") + os.Exit(1) + } + } + + // WorkloadDeploymentFederator and InstanceProjector are management-plane + // controllers that run on the control-plane cluster. The fail-loud guard above + // ensures federationRestConfig is non-nil when enableManagementControllers is + // true; the nil check here is a defensive belt-and-suspenders guard. 
+ if enableManagementControllers && federationRestConfig != nil { + extra, err := setupManagementControllers(mgr, federationClient) + if err != nil { + setupLog.Error(err, "unable to set up management controllers") + os.Exit(1) + } + runnables = append(runnables, extra...) } if serverConfig.WebhookServer != nil { @@ -223,11 +355,6 @@ func main() { }) } - setupLog.Info("starting cluster discovery provider") - g.Go(func() error { - return ignoreCanceled(provider.Run(ctx, mgr)) - }) - setupLog.Info("starting multicluster manager") g.Go(func() error { return ignoreCanceled(mgr.Start(ctx)) @@ -239,51 +366,33 @@ func main() { } } -type runnableProvider interface { - multicluster.Provider - Run(context.Context, mcmanager.Manager) error -} - -// Needed until we contribute the patch in the following PR again (need to sign CLA): -// -// See: https://github.com/kubernetes-sigs/multicluster-runtime/pull/18 -type wrappedSingleClusterProvider struct { - multicluster.Provider - cluster cluster.Cluster -} - -func (p *wrappedSingleClusterProvider) Run(ctx context.Context, mgr mcmanager.Manager) error { - if err := mgr.Engage(ctx, "single", p.cluster); err != nil { - return err - } - return p.Provider.(runnableProvider).Run(ctx, mgr) -} - func initializeClusterDiscovery( serverConfig config.WorkloadOperator, deploymentCluster cluster.Cluster, scheme *runtime.Scheme, -) (runnables []manager.Runnable, provider runnableProvider, err error) { +) (runnables []manager.Runnable, provider multicluster.Provider, edgeClusterName string, err error) { runnables = append(runnables, deploymentCluster) switch serverConfig.Discovery.Mode { case multiclusterproviders.ProviderSingle: - provider = &wrappedSingleClusterProvider{ - Provider: mcsingle.New("single", deploymentCluster), - cluster: deploymentCluster, + provider = mcsingle.New(multicluster.ClusterName(singleClusterName), deploymentCluster) + edgeClusterName = serverConfig.Discovery.ClusterName + if edgeClusterName == "" { + edgeClusterName = 
singleClusterName } case multiclusterproviders.ProviderMilo: discoveryRestConfig, err := serverConfig.Discovery.DiscoveryRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get discovery rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get discovery rest config: %w", err) } projectRestConfig, err := serverConfig.Discovery.ProjectRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get project rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get project rest config: %w", err) } discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{ + Metrics: metricsserver.Options{BindAddress: "0"}, Client: client.Options{ Cache: &client.CacheOptions{ Unstructured: true, @@ -291,7 +400,7 @@ func initializeClusterDiscovery( }, }) if err != nil { - return nil, nil, fmt.Errorf("unable to set up overall controller manager: %w", err) + return nil, nil, "", fmt.Errorf("unable to set up overall controller manager: %w", err) } provider, err = milomulticluster.New(discoveryManager, milomulticluster.Options{ @@ -304,10 +413,11 @@ func initializeClusterDiscovery( ProjectRestConfig: projectRestConfig, }) if err != nil { - return nil, nil, fmt.Errorf("unable to create datum project provider: %w", err) + return nil, nil, "", fmt.Errorf("unable to create datum project provider: %w", err) } runnables = append(runnables, discoveryManager) + edgeClusterName = serverConfig.Discovery.ClusterName // case providers.ProviderKind: // provider = mckind.New(mckind.Options{ @@ -319,13 +429,29 @@ func initializeClusterDiscovery( // }) default: - return nil, nil, fmt.Errorf( + return nil, nil, "", fmt.Errorf( "unsupported cluster discovery mode %s", serverConfig.Discovery.Mode, ) } - return runnables, provider, nil + return runnables, provider, edgeClusterName, nil +} + +func loadServerConfig(path string) (config.WorkloadOperator, error) { + var serverConfig config.WorkloadOperator + var configData []byte + if len(path) 
> 0 { + var err error + configData, err = os.ReadFile(path) + if err != nil { + return serverConfig, fmt.Errorf("unable to read server config from %q: %w", path, err) + } + } + if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { + return serverConfig, fmt.Errorf("unable to decode server config: %w", err) + } + return serverConfig, nil } func ignoreCanceled(err error) error { @@ -334,3 +460,102 @@ func ignoreCanceled(err error) error { } return err } + +// setupManagementControllers wires the WorkloadDeploymentFederator and +// InstanceProjector onto mgr. It returns any additional Runnable objects that +// must be started alongside the main manager (the federation manager used by +// InstanceProjector). Called only when management controllers are enabled and +// a federation REST config is available. +func setupManagementControllers(mgr mcmanager.Manager, federationClient client.Client) ([]manager.Runnable, error) { + federator := &controller.WorkloadDeploymentFederator{FederationClient: federationClient} + if err := federator.SetupWithManager(mgr); err != nil { + return nil, fmt.Errorf("WorkloadDeploymentFederator: %w", err) + } + + // InstanceProjector runs in the management plane, watches Instances written + // back by POP-cell operators to the Karmada federation control plane, and + // projects them into the corresponding project namespaces via the multicluster manager. 
+ federationMgr, err := manager.New(federationRestConfig, manager.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + if err != nil { + return nil, fmt.Errorf("federation manager for InstanceProjector: %w", err) + } + if err = (&controller.InstanceProjector{ + FederationClient: federationClient, + MCManager: mgr, + }).SetupWithManager(federationMgr); err != nil { + return nil, fmt.Errorf("InstanceProjector: %w", err) + } + + return []manager.Runnable{federationMgr}, nil +} + +// singleModeProjectID returns an InstanceProjectIDFunc for single-cell mode. +// It reads the upstream-cluster-name label on the edge namespace (e.g. +// "cluster-datum-cloud") and decodes it to the project ID ("datum-cloud"). +// This is the inverse of the "cluster-" encoding used by NSO's +// MappedNamespaceResourceStrategy when stamping cluster-scoped namespace labels. +// Returns ("", err) on transient API failures (triggers requeue with backoff). +// Returns ("", nil) when the label is absent (not yet propagated; quota skipped). +func singleModeProjectID(mgr mcmanager.Manager) controller.InstanceProjectIDFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + encoded := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encoded == "" { + setupLog.Info("singleModeProjectID: upstream-cluster-name label missing", + "namespace", inst.Namespace) + return "", nil + } + projectID := strings.TrimPrefix(encoded, "cluster-") + return strings.ReplaceAll(projectID, "_", "/"), nil + } +} + +// singleModeProjectNamespace returns an InstanceProjectNamespaceFunc for +// single-cell mode. It reads the upstream-namespace label on the edge namespace +// (e.g. "ns-efdf8ca1-...") to find the in-project namespace ("default") where +// ResourceClaims must be created in the project control plane. 
+// Returns ("", err) on transient API failures (triggers requeue with backoff). +// Returns ("", nil) when the label is absent (not yet propagated; quota skipped). +func singleModeProjectNamespace(mgr mcmanager.Manager) controller.InstanceProjectNamespaceFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + projectNS := ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if projectNS == "" { + setupLog.Info("singleModeProjectNamespace: upstream-namespace label missing", + "namespace", inst.Namespace) + return "", nil + } + return projectNS, nil + } +} + +// readEdgeNamespace reads the edge namespace object via the uncached APIReader +// (no informer started, no cache sync required) with a short deadline. +// Returns a transient error on API failures so callers can requeue with backoff. +func readEdgeNamespace( + ctx context.Context, + mgr mcmanager.Manager, + clusterName multicluster.ClusterName, + namespace string, +) (corev1.Namespace, error) { + cl, err := mgr.GetCluster(ctx, clusterName) + if err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: getting cluster %q: %w", clusterName, err) + } + var ns corev1.Namespace + getCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + if err := cl.GetAPIReader().Get(getCtx, client.ObjectKey{Name: namespace}, &ns); err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: reading namespace %q: %w", namespace, err) + } + return ns, nil +} diff --git a/internal/config/config.go b/internal/config/config.go index dddb7926..4a6e8e76 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -229,6 +229,23 @@ type DiscoveryConfig struct { // template when connecting to project control planes. When not provided, // the operator will use the in-cluster config. 
ProjectKubeconfigPath string `json:"projectKubeconfigPath"` + + // ClusterName is the stable, unique name for this edge cluster. It is + // stamped onto ResourceClaim objects so that each edge controller can + // distinguish its own claims from those created by other edge controllers + // in the same project control planes. + // + // Required when Mode is "milo". Optional in single mode; defaults to "single". + ClusterName string `json:"clusterName"` + + // QuotaKubeconfigPath is the path to the kubeconfig file used when creating + // ResourceClaim objects against Milo project control planes. When set it + // takes precedence over ProjectKubeconfigPath for quota calls. When both are + // unset, quota accounting is disabled. + // + // Use this field in deployments (mode: single or mode: milo) that need to + // talk to api.datum.net for quota enforcement. + QuotaKubeconfigPath string `json:"quotaKubeconfigPath"` } func SetDefaults_DiscoveryConfig(obj *DiscoveryConfig) { @@ -253,6 +270,36 @@ func (c *DiscoveryConfig) ProjectRestConfig() (*rest.Config, error) { return clientcmd.BuildConfigFromFlags("", c.ProjectKubeconfigPath) } +// QuotaRestConfig returns the REST config for quota ResourceClaim management +// against Milo project control planes. QuotaKubeconfigPath is preferred; if +// unset, ProjectKubeconfigPath is used as a fallback. +// +// Returns (nil, nil) when no credential path is configured at all — this is +// the intentional opt-out case and the caller should disable quota enforcement. +// +// Returns (nil, error) when a credential path IS configured but the file does +// not exist on disk. This is a misconfiguration (Secret not mounted, wrong +// path) that must not silently disable enforcement; callers should treat this +// as a fatal startup error. 
+func (c *DiscoveryConfig) QuotaRestConfig() (*rest.Config, error) { + path := c.QuotaKubeconfigPath + if path == "" { + path = c.ProjectKubeconfigPath + } + if path == "" { + // No credential path configured: intentional opt-out. Caller logs and + // disables enforcement. + return nil, nil + } + if _, err := os.Stat(path); os.IsNotExist(err) { + // Path explicitly configured but file absent: operator intended enforcement + // but the credential is missing (unmounted Secret, wrong path). Fail loud. + return nil, fmt.Errorf("quota kubeconfig path %q is configured but file does not exist: "+ + "ensure the quota credential Secret is mounted correctly", path) + } + return clientcmd.BuildConfigFromFlags("", path) +} + func init() { SchemeBuilder.Register(&WorkloadOperator{}) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 5f586932..bff584a6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -1,6 +1,8 @@ package config import ( + "os" + "path/filepath" "testing" "k8s.io/apimachinery/pkg/runtime" @@ -56,3 +58,68 @@ webhookServer: t.Error("TLS.CertDir was not defaulted") } } + +// TestQuotaRestConfig_NilWhenNoPath verifies that omitting quotaKubeconfigPath +// returns (nil, nil) — the intentional opt-out / enforcement-disabled case. +func TestQuotaRestConfig_NilWhenNoPath(t *testing.T) { + cfg := &DiscoveryConfig{} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() = non-nil, want nil (no path configured)") + } +} + +// TestQuotaRestConfig_ErrorWhenPathMissing verifies that explicitly setting a +// kubeconfig path that does not exist on disk returns a non-nil error (fail-loud). +// This reverses the old da63916 behavior of silently returning (nil, nil). 
+func TestQuotaRestConfig_ErrorWhenPathMissing(t *testing.T) { + cfg := &DiscoveryConfig{ + QuotaKubeconfigPath: "/nonexistent/path/quota.kubeconfig", + } + restCfg, err := cfg.QuotaRestConfig() + if err == nil { + t.Fatal("QuotaRestConfig() error = nil, want non-nil error when path is configured but file absent") + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() returned non-nil config alongside error") + } +} + +// TestQuotaRestConfig_SuccessWhenFileExists verifies that a configured path +// pointing to an existing (though minimal) kubeconfig file succeeds. +func TestQuotaRestConfig_SuccessWhenFileExists(t *testing.T) { + // Write a minimal kubeconfig that clientcmd can parse. + dir := t.TempDir() + kubeconfigPath := filepath.Join(dir, "quota.kubeconfig") + minimalKubeconfig := []byte(`apiVersion: v1 +kind: Config +clusters: +- cluster: + server: https://localhost:1234 + name: test +contexts: +- context: + cluster: test + user: test + name: test +current-context: test +users: +- name: test + user: {} +`) + if err := os.WriteFile(kubeconfigPath, minimalKubeconfig, 0600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + cfg := &DiscoveryConfig{QuotaKubeconfigPath: kubeconfigPath} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg == nil { + t.Error("QuotaRestConfig() = nil, want non-nil when file exists") + } +} diff --git a/internal/features/features.go b/internal/features/features.go new file mode 100644 index 00000000..8db20f09 --- /dev/null +++ b/internal/features/features.go @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +// Package features defines the feature gates for the compute operator. 
Feature +// gates follow the Kubernetes component-base convention: each feature is +// declared as a Feature constant, registered with a FeatureSpec that includes +// its default enablement state, and toggled at runtime via the --feature-gates +// flag exposed by the binary. +// +// Usage in cmd/main.go: +// +// features.MutableFeatureGate.AddFlag(flag.CommandLine) +// +// Usage in controllers: +// +// if features.FeatureGate.Enabled(features.NetworkingIntegration) { ... } +package features + +import ( + "k8s.io/component-base/featuregate" +) + +const ( + // NetworkingIntegration controls whether the compute operator integrates with + // the network-services-operator (VPC) for NetworkBinding provisioning and the + // Network scheduling gate on Instances. + // + // When disabled: + // - No NetworkBinding objects are created. + // - The Network scheduling gate is not added to newly created Instances. + // - Any existing Network scheduling gate is actively removed. + // - The networking step is treated as immediately ready so Instances + // proceed to the runtime without a NetworkBinding. + // + // This flag exists so operators can run compute on edge/lab cells where + // VPC/NSO is not yet functional. The default is true (enabled) so that + // existing production deployments are unaffected. + // + // alpha: v0.1 + NetworkingIntegration featuregate.Feature = "NetworkingIntegration" +) + +// MutableFeatureGate is the mutable feature gate for the compute operator. +// Call MutableFeatureGate.AddFlag to register the --feature-gates flag before +// flag.Parse(). Controllers should read from FeatureGate (the read-only view) +// after startup. +var MutableFeatureGate featuregate.MutableFeatureGate = featuregate.NewFeatureGate() + +// FeatureGate is the read-only view of the compute operator feature gate. +// Use this in controllers and reconcilers rather than MutableFeatureGate to +// avoid accidental mutations after startup. 
+var FeatureGate featuregate.FeatureGate = MutableFeatureGate + +func init() { + if err := MutableFeatureGate.Add(map[featuregate.Feature]featuregate.FeatureSpec{ + NetworkingIntegration: {Default: true, PreRelease: featuregate.Alpha}, + }); err != nil { + panic(err) + } +} diff --git a/internal/features/features_test.go b/internal/features/features_test.go new file mode 100644 index 00000000..61687064 --- /dev/null +++ b/internal/features/features_test.go @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package features + +import ( + "testing" +) + +// TestNetworkingIntegration_DefaultEnabled verifies that the NetworkingIntegration +// feature gate defaults to enabled so that existing production deployments are +// unaffected when the flag is not set. +func TestNetworkingIntegration_DefaultEnabled(t *testing.T) { + // Use a fresh gate so this test is independent of any global state mutations. + gate := MutableFeatureGate.DeepCopy() + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration default = false, want true") + } +} + +// TestNetworkingIntegration_CanBeDisabled verifies that setting +// NetworkingIntegration=false via the feature gate string disables the +// integration, allowing operators to run compute without VPC/NSO. +func TestNetworkingIntegration_CanBeDisabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=false"); err != nil { + t.Fatalf("Set(NetworkingIntegration=false): %v", err) + } + if gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = true after Set=false, want false") + } +} + +// TestNetworkingIntegration_ExplicitlyEnabled verifies that the gate can be +// explicitly set to true (round-trip). 
+func TestNetworkingIntegration_ExplicitlyEnabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=true"); err != nil { + t.Fatalf("Set(NetworkingIntegration=true): %v", err) + } + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = false after Set=true, want true") + } +} From f97930f20d96b0bbd2757b03e0eb1738c00d8589 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:08:21 -0500 Subject: [PATCH 08/11] feat(webhook): validation updates for federation Update Workload webhook and Instance validation so the API accepts the fields federated scheduling adds and continues to reject invalid placement and runtime specs. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/validation/instance_validation.go | 31 +++++++-- .../validation/workload_validation_test.go | 65 +++++++++++-------- internal/webhook/v1alpha/workload_webhook.go | 58 ++++------------- 3 files changed, 75 insertions(+), 79 deletions(-) diff --git a/internal/validation/instance_validation.go b/internal/validation/instance_validation.go index 7f112822..faa5ba0b 100644 --- a/internal/validation/instance_validation.go +++ b/internal/validation/instance_validation.go @@ -17,6 +17,19 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +// Validation constants for well-known string literals used across multiple +// validation functions. +const ( + // diskTypePDStandard is the only currently supported disk type. + diskTypePDStandard = "pd-standard" + + // defaultImageName is the only currently supported disk populator image. + defaultImageName = "datumcloud/ubuntu-2204-lts" + + // defaultInstanceType is the only currently supported instance type. 
+ defaultInstanceType = "datumcloud/d1-standard-2" +) + func validateInstanceTemplate( template computev1alpha.InstanceTemplateSpec, fieldPath *field.Path, @@ -97,6 +110,11 @@ func validateInstanceNetworkInterfaces( allErrs = append(allErrs, field.Invalid(networkNameField, networkInterface.Network, msg)) } + extra := make(map[string]authorizationv1.ExtraValue, len(opts.AdmissionRequest.UserInfo.Extra)) + for k, v := range opts.AdmissionRequest.UserInfo.Extra { + extra[k] = authorizationv1.ExtraValue(v) + } + review := authorizationv1.SubjectAccessReview{ Spec: authorizationv1.SubjectAccessReviewSpec{ ResourceAttributes: &authorizationv1.ResourceAttributes{ @@ -110,6 +128,7 @@ func validateInstanceNetworkInterfaces( User: opts.AdmissionRequest.UserInfo.Username, Groups: opts.AdmissionRequest.UserInfo.Groups, UID: opts.AdmissionRequest.UserInfo.UID, + Extra: extra, }, } @@ -258,8 +277,8 @@ func validateDiskVolumeSource(diskSource *computev1alpha.DiskTemplateVolumeSourc diskTemplateSpecField := diskTemplateField.Child("spec") // TODO(jrese) look up valid disk types - if diskTemplate.Spec.Type != "pd-standard" { - allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{"pd-standard"})) + if diskTemplate.Spec.Type != diskTypePDStandard { + allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{diskTypePDStandard})) } populatorResourceRequests, errs := validateDiskPopulator(diskTemplate.Spec.Populator, diskTemplateField.Child("populator")) @@ -400,8 +419,8 @@ func validateDiskPopulator(populator *computev1alpha.DiskPopulator, fieldPath *f // TODO(jreese) look up image imagePopulator := populator.Image - if imagePopulator.Name != "datumcloud/ubuntu-2204-lts" { - allErrs = append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{"datumcloud/ubuntu-2204-lts"})) + if imagePopulator.Name != defaultImageName { + allErrs = 
append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{defaultImageName})) } } } @@ -657,8 +676,8 @@ func validateInstanceRuntimeResources(resources computev1alpha.InstanceRuntimeRe allErrs := field.ErrorList{} // TODO(jreese) look up available instance types - if resources.InstanceType != "datumcloud/d1-standard-2" { - allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{"datumcloud/d1-standard-2"})) + if resources.InstanceType != defaultInstanceType { + allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{defaultInstanceType})) } if resources.Requests != nil { diff --git a/internal/validation/workload_validation_test.go b/internal/validation/workload_validation_test.go index f73e4c9f..b4e70df7 100644 --- a/internal/validation/workload_validation_test.go +++ b/internal/validation/workload_validation_test.go @@ -23,6 +23,15 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +// Test constants for repeated string literals. 
+const ( + testCPUResource = "cpu" + testVolName = "vol" + testDuplicateMountPath = "duplicate-mount-path" + testDefaultNamespace = "default" + testCityCodeDFW = "DFW" +) + func TestValidateWorkloads(t *testing.T) { scenarios := map[string]struct { workload *computev1alpha.Workload @@ -157,7 +166,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(50, resource.DecimalSI), AverageValue: resource.NewQuantity(50, resource.DecimalSI), @@ -181,7 +190,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -202,7 +211,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageValue: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -223,7 +232,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageUtilization: proto.Int32(0), }, @@ -336,16 +345,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + 
Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Gi"), @@ -369,16 +378,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Pi"), @@ -402,16 +411,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10.5Gi"), @@ -436,7 +445,7 @@ func 
TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -473,7 +482,7 @@ func TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -490,11 +499,11 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].VolumeAttachments = []computev1alpha.VolumeAttachment{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { @@ -503,7 +512,7 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, VolumeSource: volumeSource, }, } @@ -540,7 +549,7 @@ func TestValidateWorkloads(t *testing.T) { interceptorFuncs: &interceptor.Funcs{ Create: func(ctx context.Context, client client.WithWatch, obj client.Object, opts ...client.CreateOption) error { if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok { - if sar.Spec.ResourceAttributes.Name == "default" && + if sar.Spec.ResourceAttributes.Name == testDefaultNamespace && sar.Spec.ResourceAttributes.Group == networkingv1alpha.GroupVersion.Group && sar.Spec.ResourceAttributes.Version == networkingv1alpha.GroupVersion.Version && sar.Spec.ResourceAttributes.Resource == "networks" { @@ -559,8 
+568,8 @@ func TestValidateWorkloads(t *testing.T) { initObjs := []client.Object{ &networkingv1alpha.Network{ ObjectMeta: metav1.ObjectMeta{ - Namespace: "default", - Name: "default", + Namespace: testDefaultNamespace, + Name: testDefaultNamespace, }, }, } @@ -606,7 +615,7 @@ func TestValidateWorkloads(t *testing.T) { ) if len(scenario.opts.ValidCityCodes) == 0 { - scenario.opts.ValidCityCodes = []string{"DFW"} + scenario.opts.ValidCityCodes = []string{testCityCodeDFW} } t.Run(name, func(t *testing.T) { @@ -645,7 +654,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, Sandbox: &computev1alpha.SandboxRuntime{ Containers: []computev1alpha.SandboxContainer{ @@ -661,7 +670,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, @@ -702,7 +711,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, VirtualMachine: &computev1alpha.VirtualMachineRuntime{ VolumeAttachments: []computev1alpha.VolumeAttachment{ @@ -719,10 +728,10 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Populator: &computev1alpha.DiskPopulator{ Image: &computev1alpha.ImageDiskPopulator{ - Name: "datumcloud/ubuntu-2204-lts", + 
Name: defaultImageName, }, }, }, @@ -736,7 +745,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, diff --git a/internal/webhook/v1alpha/workload_webhook.go b/internal/webhook/v1alpha/workload_webhook.go index e3f3735c..a8b94b38 100644 --- a/internal/webhook/v1alpha/workload_webhook.go +++ b/internal/webhook/v1alpha/workload_webhook.go @@ -6,12 +6,12 @@ import ( "fmt" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/validation" @@ -27,8 +27,7 @@ func SetupWorkloadWebhookWithManager(mgr mcmanager.Manager) error { mgr: mgr, } - return ctrl.NewWebhookManagedBy(mgr.GetLocalManager()). - For(&computev1alpha.Workload{}). + return ctrl.NewWebhookManagedBy(mgr.GetLocalManager(), &computev1alpha.Workload{}). WithDefaulter(webhook). WithValidator(webhook). 
Complete() @@ -40,17 +39,11 @@ type workloadWebhook struct { mgr mcmanager.Manager } -var _ admission.CustomDefaulter = &workloadWebhook{} -var _ admission.CustomValidator = &workloadWebhook{} - -// Default implements webhook.Defaulter so a webhook will be registered for the type -func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return fmt.Errorf("unexpected type %T", obj) - } - _ = workload +var _ admission.Defaulter[*computev1alpha.Workload] = &workloadWebhook{} +var _ admission.Validator[*computev1alpha.Workload] = &workloadWebhook{} +// Default implements admission.Defaulter so a mutating webhook will be registered for the type. +func (r *workloadWebhook) Default(_ context.Context, _ *computev1alpha.Workload) error { // // TODO(jreese) review and test gateway defaulting / logic // if gw := workload.Spec.Gateway; gw != nil { // for i, tcpRoute := range gw.TCPRoutes { @@ -75,15 +68,10 @@ func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error // +kubebuilder:webhook:path=/validate-compute-datumapis-com-v1alpha-workload,mutating=false,failurePolicy=fail,sideEffects=None,groups=compute.datumapis.com,resources=workloads,verbs=create;update,versions=v1alpha,name=vworkload.kb.io,admissionReviewVersions=v1 -func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - +func (r *workloadWebhook) ValidateCreate(ctx context.Context, workload *computev1alpha.Workload) (admission.Warnings, error) { clusterName := computewebhook.ClusterNameFromContext(ctx) - cluster, err := r.mgr.GetCluster(ctx, clusterName) + cluster, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)) if err != nil { return nil, err } @@ -101,9 +89,9 @@ func (r *workloadWebhook) ValidateCreate(ctx 
context.Context, obj runtime.Object // that means for the scheduling phase, since there would not currently be // sufficient context to know who created the workload and what locations // are valid candidates based on that. Maybe an annotation, or spec field? - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := clusterClient.List(ctx, &locations); err != nil { - return nil, fmt.Errorf("failed to list locations: %w", err) + return nil, fmt.Errorf("failed to list location bindings: %w", err) } validCityCodes := sets.Set[string]{} @@ -123,38 +111,18 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object } if errs := validation.ValidateWorkloadCreate(workload, opts); len(errs) > 0 { - return nil, errors.NewInvalid(obj.GetObjectKind().GroupVersionKind().GroupKind(), workload.Name, errs) + return nil, errors.NewInvalid(workload.GroupVersionKind().GroupKind(), workload.Name, errs) } return nil, nil } -func (r *workloadWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - oldworkload, ok := oldObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", oldObj) - } - - _ = oldworkload - - newworkload, ok := newObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", newObj) - } - - _ = newworkload - +func (r *workloadWebhook) ValidateUpdate(_ context.Context, _, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object update. 
return nil, nil } -func (r *workloadWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - _ = workload - +func (r *workloadWebhook) ValidateDelete(_ context.Context, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object deletion. return nil, nil } From daa41149394ef042e5334da2ea7c0e6860d5551d Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:08:39 -0500 Subject: [PATCH 09/11] feat(config): CRDs, RBAC, and kustomize overlays for federation Regenerate the Instance, Workload, and WorkloadDeployment CRDs for the new API fields and add the kustomize structure that deploys the manager in cell or management-plane mode: federation and downstream RBAC bases, cell/management/quota-credentials components, the WorkloadDeployment status interpreter, and the matching overlays. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../compute.datumapis.com_instances.yaml | 73 ++++++++++++++++++- ...ute.datumapis.com_workloaddeployments.yaml | 64 +++++++++++++++- .../compute.datumapis.com_workloads.yaml | 64 +++++++++++++++- .../base/downstream-rbac/kustomization.yaml | 5 ++ config/base/downstream-rbac/rbac.yaml | 35 +++++++++ config/base/federation/kustomization.yaml | 10 +++ config/base/manager/manager.yaml | 31 ++++++-- config/base/manager/service_account.yaml | 2 +- .../cell-controllers/kustomization.yaml | 20 +++++ .../metrics_auth_role_binding.yaml | 2 +- config/components/controller_rbac/role.yaml | 40 ++++++++++ .../controller_rbac/role_binding.yaml | 2 +- .../components/federation/kustomization.yaml | 5 ++ .../workloaddeployment-interpreter.yaml | 28 +++++++ .../leader_election_role_binding.yaml | 2 +- .../management-controllers/kustomization.yaml | 20 +++++ .../quota-credentials/kustomization.yaml | 26 +++++++ .../service-configuration.yaml | 25 ++++++- .../components/service-catalog/service.yaml | 2 - .../overlays/cell/disable_webhook_patch.yaml | 12 +++ config/overlays/cell/kustomization.yaml | 17 +++++ .../discovery_mode_patch.yaml | 13 ++++ .../downstream_kubeconfig_patch.yaml | 29 ++++++++ .../management-plane/kustomization.yaml | 21 ++++++ .../single-cluster/kustomization.yaml | 2 + 25 files changed, 531 insertions(+), 19 deletions(-) create mode 100644 config/base/downstream-rbac/kustomization.yaml create mode 100644 config/base/downstream-rbac/rbac.yaml create mode 100644 config/base/federation/kustomization.yaml create mode 100644 config/components/cell-controllers/kustomization.yaml create mode 100644 config/components/federation/kustomization.yaml create mode 100644 config/components/federation/workloaddeployment-interpreter.yaml create mode 100644 config/components/management-controllers/kustomization.yaml create mode 100644 config/components/quota-credentials/kustomization.yaml create mode 100644 
config/overlays/cell/disable_webhook_patch.yaml create mode 100644 config/overlays/cell/kustomization.yaml create mode 100644 config/overlays/management-plane/discovery_mode_patch.yaml create mode 100644 config/overlays/management-plane/downstream_kubeconfig_patch.yaml create mode 100644 config/overlays/management-plane/kustomization.yaml diff --git a/config/base/crd/bases/compute.datumapis.com_instances.yaml b/config/base/crd/bases/compute.datumapis.com_instances.yaml index 8c86fb90..c9301561 100644 --- a/config/base/crd/bases/compute.datumapis.com_instances.yaml +++ b/config/base/crd/bases/compute.datumapis.com_instances.yaml @@ -35,6 +35,10 @@ spec: name: Message priority: 1 type: string + - jsonPath: .status.conditions[?(@.type=="QuotaGranted")].reason + name: Quota + priority: 1 + type: string name: v1alpha schema: openAPIV3Schema: @@ -262,6 +266,28 @@ spec: description: A list of containers to run within the sandbox. items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -272,8 +298,9 @@ spec: present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. 
+ May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -332,6 +359,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. + + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume mount + containing the env file. + type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests @@ -829,6 +893,11 @@ spec: reason: Pending status: Unknown type: Ready + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for quota evaluation + reason: PendingEvaluation + status: Unknown + type: QuotaGranted description: Status defines the current state of an Instance. 
properties: conditions: diff --git a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml index 50c9458b..48a2501d 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml @@ -375,6 +375,28 @@ spec: sandbox. items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -385,8 +407,9 @@ spec: variable present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -448,6 +471,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. 
+ During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. + + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume + mount containing the env file. + type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests diff --git a/config/base/crd/bases/compute.datumapis.com_workloads.yaml b/config/base/crd/bases/compute.datumapis.com_workloads.yaml index edae1e1c..c452910f 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloads.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloads.yaml @@ -385,6 +385,28 @@ spec: sandbox. items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. 
Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -395,8 +417,9 @@ spec: variable present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -458,6 +481,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. + + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume + mount containing the env file. 
+ type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests diff --git a/config/base/downstream-rbac/kustomization.yaml b/config/base/downstream-rbac/kustomization.yaml new file mode 100644 index 00000000..4c4dbe44 --- /dev/null +++ b/config/base/downstream-rbac/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - rbac.yaml diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml new file mode 100644 index 00000000..1937ef02 --- /dev/null +++ b/config/base/downstream-rbac/rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-manager +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "watch", "create", "update", "patch"] + - apiGroups: ["compute.datumapis.com"] + resources: ["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["policy.karmada.io"] + resources: ["propagationpolicies", "clusterpropagationpolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.karmada.io"] + resources: ["clusters"] + verbs: ["get", "list", "watch"] + - apiGroups: ["work.karmada.io"] + resources: ["resourcebindings", "clusterresourcebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["config.karmada.io"] + resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: compute-manager +subjects: + - kind: User + name: 
system:serviceaccount:compute-system:compute-manager diff --git a/config/base/federation/kustomization.yaml b/config/base/federation/kustomization.yaml new file mode 100644 index 00000000..1261dac6 --- /dev/null +++ b/config/base/federation/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../crd/bases/compute.datumapis.com_instances.yaml + - ../crd/bases/compute.datumapis.com_workloaddeployments.yaml + - ../crd/bases/compute.datumapis.com_workloads.yaml + +components: + - ../../components/federation diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index e2c06e97..8ef18135 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -26,14 +26,33 @@ spec: seccompProfile: type: RuntimeDefault containers: - - command: + - name: manager + command: - /manager args: - - --leader-elect - - --health-probe-bind-address=:8081 - - --server-config=/config/config.yaml + - --leader-elect=$(LEADER_ELECT) + - --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS) + - --server-config=$(SERVER_CONFIG) + - --federation-kubeconfig=$(FEDERATION_KUBECONFIG) + - --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS) + - --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS) + - --feature-gates=$(FEATURE_GATES) + env: + - name: LEADER_ELECT + value: "true" + - name: HEALTH_PROBE_BIND_ADDRESS + value: ":8081" + - name: SERVER_CONFIG + value: /config/config.yaml + - name: FEDERATION_KUBECONFIG + value: "" + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "false" + - name: ENABLE_CELL_CONTROLLERS + value: "false" + - name: FEATURE_GATES + value: "" image: ghcr.io/datum-cloud/compute:latest - name: manager ports: - containerPort: 9443 name: webhook-server @@ -66,7 +85,7 @@ spec: volumeMounts: - name: config mountPath: /config - serviceAccountName: compute + serviceAccountName: compute-manager terminationGracePeriodSeconds: 10 volumes: - name: config diff 
--git a/config/base/manager/service_account.yaml b/config/base/manager/service_account.yaml index f8711deb..cc6bd6cc 100644 --- a/config/base/manager/service_account.yaml +++ b/config/base/manager/service_account.yaml @@ -4,4 +4,4 @@ metadata: labels: app.kubernetes.io/name: compute app.kubernetes.io/managed-by: kustomize - name: compute + name: compute-manager diff --git a/config/components/cell-controllers/kustomization.yaml b/config/components/cell-controllers/kustomization.yaml new file mode 100644 index 00000000..3f32da3b --- /dev/null +++ b/config/components/cell-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_CELL_CONTROLLERS + value: "true" diff --git a/config/components/controller_rbac/metrics_auth_role_binding.yaml b/config/components/controller_rbac/metrics_auth_role_binding.yaml index 1ea3d974..ada1a1de 100644 --- a/config/components/controller_rbac/metrics_auth_role_binding.yaml +++ b/config/components/controller_rbac/metrics_auth_role_binding.yaml @@ -8,4 +8,4 @@ roleRef: name: compute-metrics-auth-role subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/controller_rbac/role.yaml b/config/components/controller_rbac/role.yaml index 5d803d2c..e8721899 100644 --- a/config/components/controller_rbac/role.yaml +++ b/config/components/controller_rbac/role.yaml @@ -4,6 +4,13 @@ kind: ClusterRole metadata: name: compute rules: +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list - apiGroups: - compute.datumapis.com resources: @@ -36,3 +43,36 @@ rules: - get - patch - update +- apiGroups: + - networking.datumapis.com + resources: + - locations + - networkcontexts + - subnets + verbs: + - 
get + - list + - watch +- apiGroups: + - networking.datumapis.com + resources: + - networkbindings + - subnetclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - quota.miloapis.com + resources: + - resourceclaims + verbs: + - create + - delete + - get + - list + - watch diff --git a/config/components/controller_rbac/role_binding.yaml b/config/components/controller_rbac/role_binding.yaml index 6256bf3f..2f3e2676 100644 --- a/config/components/controller_rbac/role_binding.yaml +++ b/config/components/controller_rbac/role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/federation/kustomization.yaml b/config/components/federation/kustomization.yaml new file mode 100644 index 00000000..3ba207ff --- /dev/null +++ b/config/components/federation/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - workloaddeployment-interpreter.yaml diff --git a/config/components/federation/workloaddeployment-interpreter.yaml b/config/components/federation/workloaddeployment-interpreter.yaml new file mode 100644 index 00000000..2743a63b --- /dev/null +++ b/config/components/federation/workloaddeployment-interpreter.yaml @@ -0,0 +1,28 @@ +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: workloaddeployment +spec: + target: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + customizations: + statusReflection: + luaScript: | + function ReflectStatus(observedObj) + if observedObj.status == nil then + return nil + end + return observedObj.status + end + statusAggregation: + luaScript: | + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if statusItems[1].status ~= nil then + desiredObj.status = statusItems[1].status + end + 
return desiredObj + end diff --git a/config/components/leader_election/leader_election_role_binding.yaml b/config/components/leader_election/leader_election_role_binding.yaml index a5fe9996..d6783c07 100644 --- a/config/components/leader_election/leader_election_role_binding.yaml +++ b/config/components/leader_election/leader_election_role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute-leader-election subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/management-controllers/kustomization.yaml b/config/components/management-controllers/kustomization.yaml new file mode 100644 index 00000000..d1e29e7f --- /dev/null +++ b/config/components/management-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "true" diff --git a/config/components/quota-credentials/kustomization.yaml b/config/components/quota-credentials/kustomization.yaml new file mode 100644 index 00000000..ffc9a6d8 --- /dev/null +++ b/config/components/quota-credentials/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + volumeMounts: + - name: quota-credentials + mountPath: /etc/quota-credentials + readOnly: true + volumes: + - name: quota-credentials + secret: + secretName: compute-quota-credentials + optional: true diff --git a/config/components/service-catalog/service-configuration.yaml b/config/components/service-catalog/service-configuration.yaml index 
202ac8af..8c29a50e 100644 --- a/config/components/service-catalog/service-configuration.yaml +++ b/config/components/service-catalog/service-configuration.yaml @@ -6,6 +6,9 @@ spec: serviceRef: name: compute phase: Published + locations: + supportedClasses: + - datum-managed monitoredResourceTypes: - type: compute.datumapis.com/Instance displayName: Compute Instance @@ -44,6 +47,26 @@ spec: description: Seconds the instance has been in a running state. kind: Cumulative unit: s + - name: compute.datumapis.com/workloads + displayName: Compute Workloads + description: Number of compute workloads. + kind: Gauge + unit: '{workload}' + - name: compute.datumapis.com/instances + displayName: Compute Instances + description: Number of compute instances. + kind: Gauge + unit: '{instance}' + - name: compute.datumapis.com/vcpus + displayName: Compute vCPUs + description: Number of vCPUs allocated across all instances. + kind: Gauge + unit: '{millicore}' + - name: compute.datumapis.com/memory + displayName: Compute Memory + description: Memory allocated across all instances. 
+ kind: Gauge + unit: MiB billing: consumerDestinations: - monitoredResourceType: compute.datumapis.com/Instance @@ -53,13 +76,13 @@ spec: - compute.datumapis.com/instance/cpu-allocated - compute.datumapis.com/instance/memory-allocated - compute.datumapis.com/instance/uptime-seconds + quota: metricRules: - selector: apiGroup: compute.datumapis.com kind: Workload metricCosts: compute.datumapis.com/workloads: 1 - quota: limits: - name: compute-workloads metric: compute.datumapis.com/workloads diff --git a/config/components/service-catalog/service.yaml b/config/components/service-catalog/service.yaml index 90a6e812..d32fd925 100644 --- a/config/components/service-catalog/service.yaml +++ b/config/components/service-catalog/service.yaml @@ -9,7 +9,5 @@ spec: owner: producerProjectRef: name: datum-cloud - enablementPolicy: - mode: GatedByProvider phase: Published serviceName: compute.datumapis.com diff --git a/config/overlays/cell/disable_webhook_patch.yaml b/config/overlays/cell/disable_webhook_patch.yaml new file mode 100644 index 00000000..85b57f09 --- /dev/null +++ b/config/overlays/cell/disable_webhook_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + discovery: + quotaKubeconfigPath: /etc/quota-credentials/kubeconfig diff --git a/config/overlays/cell/kustomization.yaml b/config/overlays/cell/kustomization.yaml new file mode 100644 index 00000000..80925ee2 --- /dev/null +++ b/config/overlays/cell/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. 
+namespace: compute-system + +resources: + - ../../base/manager +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/cell-controllers + - ../../components/quota-credentials + +patches: +- path: disable_webhook_patch.yaml diff --git a/config/overlays/management-plane/discovery_mode_patch.yaml b/config/overlays/management-plane/discovery_mode_patch.yaml new file mode 100644 index 00000000..97bf762c --- /dev/null +++ b/config/overlays/management-plane/discovery_mode_patch.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + webhookServer: {} + discovery: + mode: milo diff --git a/config/overlays/management-plane/downstream_kubeconfig_patch.yaml b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml new file mode 100644 index 00000000..7b3b764b --- /dev/null +++ b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: compute-manager +spec: + template: + spec: + containers: + - name: manager + env: + - name: FEDERATION_KUBECONFIG + value: /etc/kubernetes/downstream/auth/downstream-kubeconfig.yaml + volumeMounts: + - name: downstream-kubeconfig + mountPath: /etc/kubernetes/downstream/auth + readOnly: true + - name: karmada-token + mountPath: /etc/kubernetes/karmada-token + readOnly: true + volumes: + - name: downstream-kubeconfig + configMap: + name: compute-downstream-kubeconfig + - name: karmada-token + projected: + sources: + - serviceAccountToken: + audience: https://karmada-apiserver.karmada-system.svc.cluster.local:5443 + path: token diff --git a/config/overlays/management-plane/kustomization.yaml b/config/overlays/management-plane/kustomization.yaml new file mode 100644 index 00000000..dae13c58 --- /dev/null +++ 
b/config/overlays/management-plane/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. +namespace: compute-system + +resources: + - ../../base/manager + - ../../base/webhook +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/resource-metrics + - ../../components/high-availability + - ../../components/management-controllers + - ../../components/csi-webhook-cert + +patches: +- path: downstream_kubeconfig_patch.yaml +- path: discovery_mode_patch.yaml diff --git a/config/overlays/single-cluster/kustomization.yaml b/config/overlays/single-cluster/kustomization.yaml index 7a2d0320..4d72934e 100644 --- a/config/overlays/single-cluster/kustomization.yaml +++ b/config/overlays/single-cluster/kustomization.yaml @@ -15,3 +15,5 @@ components: - ../../components/resource-metrics - ../../components/high-availability - ../../components/csi-webhook-cert + - ../../components/management-controllers + - ../../components/cell-controllers From a0636692ed829659ef66e8c7b30c4fd30f47775d Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 5 Jun 2026 10:08:46 -0500 Subject: [PATCH 10/11] test(e2e): shared multi-cluster e2e harness Add the shared e2e environment helper plus the kind and Chainsaw configuration and kubeconfig/cluster-secret scripts that stand up a multi-cluster control plane, so federated scheduling can be exercised end to end. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- hack/e2e/kind-control-plane.yaml | 17 ++ hack/e2e/make-internal-kubeconfig.sh | 60 +++++++ hack/e2e/patch-cluster-secret.sh | 90 ++++++++++ test/e2e/chainsaw-config.yaml | 47 +++++ test/e2e/env/README.md | 251 +++++++++++++++++++++++++++ test/e2e/env/env.go | 233 +++++++++++++++++++++++++ 6 files changed, 698 insertions(+) create mode 100644 hack/e2e/kind-control-plane.yaml create mode 100755 hack/e2e/make-internal-kubeconfig.sh create mode 100755 hack/e2e/patch-cluster-secret.sh create mode 100644 test/e2e/chainsaw-config.yaml create mode 100644 test/e2e/env/README.md create mode 100644 test/e2e/env/env.go diff --git a/hack/e2e/kind-control-plane.yaml b/hack/e2e/kind-control-plane.yaml new file mode 100644 index 00000000..47f3c63b --- /dev/null +++ b/hack/e2e/kind-control-plane.yaml @@ -0,0 +1,17 @@ +# Kind cluster configuration for the compute-control-plane management cluster. +# +# extraPortMappings exposes port 32443 on the macOS host so that the Karmada +# API server NodePort service (nodePort: 32443) is accessible at +# https://localhost:32443 without any additional port-forwarding. +# +# This matches KARMADA_API_NODEPORT in Taskfile.yaml. + +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + extraPortMappings: + - containerPort: 32443 # Karmada API server NodePort + hostPort: 32443 + protocol: TCP + listenAddress: "127.0.0.1" diff --git a/hack/e2e/make-internal-kubeconfig.sh b/hack/e2e/make-internal-kubeconfig.sh new file mode 100755 index 00000000..3303a5bd --- /dev/null +++ b/hack/e2e/make-internal-kubeconfig.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# make-internal-kubeconfig.sh +# +# Produces a kubeconfig variant that uses the Kind node's Docker container IP +# instead of localhost. This variant is stored in Karmada so the controller +# manager (running inside Docker) can reach member cluster API servers across +# the kind bridge network. 
+# +# Background: Kind maps each cluster's API server to a random localhost port +# on the developer machine. Inside Docker containers, "localhost" refers to the +# container's own loopback — not the host. We therefore swap the server address +# to the Kind control-plane container's Docker bridge IP (e.g. 172.18.0.x) and +# set insecure-skip-tls-verify because the node certificate does not include +# the Docker bridge IP in its SANs. +# +# Usage: +# hack/e2e/make-internal-kubeconfig.sh \ +# tmp/e2e/kubeconfigs/pop-dfw.yaml \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml \ +# compute-pop-dfw + +set -euo pipefail + +INPUT="${1:?usage: $0 <input-kubeconfig> <output-kubeconfig> <cluster-name>}" +OUTPUT="${2:?usage: $0 <input-kubeconfig> <output-kubeconfig> <cluster-name>}" +CLUSTER_NAME="${3:?usage: $0 <input-kubeconfig> <output-kubeconfig> <cluster-name>}" + +CONTAINER_NAME="${CLUSTER_NAME}-control-plane" + +# Resolve the container's Docker bridge IP. +DOCKER_IP=$(docker inspect \ + -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' \ + "${CONTAINER_NAME}" 2>/dev/null || true) + +if [ -z "${DOCKER_IP}" ]; then + echo "ERROR: Could not resolve Docker IP for container '${CONTAINER_NAME}'." >&2 + echo " Is the Kind cluster '${CLUSTER_NAME}' running?" >&2 + exit 1 +fi + +echo " ${CLUSTER_NAME}: Docker IP ${DOCKER_IP} → ${OUTPUT}" + +python3 - "${INPUT}" "${OUTPUT}" "${DOCKER_IP}" <<'PYEOF' +import sys, yaml + +src, dst, docker_ip = sys.argv[1], sys.argv[2], sys.argv[3] + +with open(src) as f: + cfg = yaml.safe_load(f) + +for cluster in cfg.get('clusters', []): + # Kind API server always listens on port 6443 inside the container. + cluster['cluster']['server'] = f'https://{docker_ip}:6443' + # The node cert only covers localhost / 127.0.0.1, not the bridge IP. 
+ cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + +with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) +PYEOF diff --git a/hack/e2e/patch-cluster-secret.sh b/hack/e2e/patch-cluster-secret.sh new file mode 100755 index 00000000..e29ed383 --- /dev/null +++ b/hack/e2e/patch-cluster-secret.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# patch-cluster-secret.sh +# +# After "karmadactl join", Karmada stores the member cluster's kubeconfig in a +# Secret referenced by the Cluster object's spec.secretRef, and sets +# spec.apiEndpoint to the localhost address it resolved from the external +# kubeconfig. The Karmada controller manager runs inside Docker and cannot use +# localhost to reach POP cell API servers. +# +# This script: +# 1. Replaces the kubeconfig in the Secret with the Docker-IP variant so that +# the Karmada controller can make API calls to the member cluster. +# 2. Patches spec.apiEndpoint on the Cluster object so that health checks also +# use the Docker bridge IP instead of localhost. +# +# Usage: +# hack/e2e/patch-cluster-secret.sh \ +# tmp/e2e/kubeconfigs/karmada.yaml \ +# compute-pop-dfw \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml + +set -euo pipefail + +KARMADA_KUBECONFIG="${1:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}" +CLUSTER_NAME="${2:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}" +INTERNAL_KUBECONFIG="${3:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}" + +# ------------------------------------------------------------------ +# Read the Cluster object's secretRef (name + namespace) +# ------------------------------------------------------------------ +SECRET_NAME=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.name}' 2>/dev/null || true) + +if [ -z "${SECRET_NAME}" ]; then + echo "ERROR: Could not find spec.secretRef.name on cluster '${CLUSTER_NAME}'." >&2 + echo " Has karmadactl join completed successfully?" 
>&2 + exit 1 +fi + +SECRET_NAMESPACE=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.namespace}' 2>/dev/null || true) + +SECRET_NAMESPACE="${SECRET_NAMESPACE:-karmada-system}" + +echo " Patching secret ${SECRET_NAMESPACE}/${SECRET_NAME} with Docker-IP kubeconfig..." + +# ------------------------------------------------------------------ +# Replace the kubeconfig data in the secret +# ------------------------------------------------------------------ +kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + create secret generic "${SECRET_NAME}" \ + --namespace="${SECRET_NAMESPACE}" \ + --from-file=kubeconfig="${INTERNAL_KUBECONFIG}" \ + --dry-run=client -o yaml \ + | kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + apply -f - + +echo " Secret ${SECRET_NAMESPACE}/${SECRET_NAME} updated — Karmada controller will use Docker bridge IP" + +# ------------------------------------------------------------------ +# Extract the Docker-IP server URL from the internal kubeconfig and +# patch spec.apiEndpoint on the Cluster object so that Karmada's +# cluster-status controller uses the same reachable address for health +# checks. Without this patch the controller continues to probe the +# localhost address stored by karmadactl join and the cluster never +# transitions to Ready. +# ------------------------------------------------------------------ +DOCKER_SERVER=$(kubectl \ + --kubeconfig="${INTERNAL_KUBECONFIG}" \ + config view --minify -o jsonpath='{.clusters[0].cluster.server}') + +if [ -z "${DOCKER_SERVER}" ]; then + echo "ERROR: Could not read server URL from ${INTERNAL_KUBECONFIG}" >&2 + exit 1 +fi + +echo " Patching spec.apiEndpoint on cluster '${CLUSTER_NAME}' → ${DOCKER_SERVER}..." 
+kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + patch cluster "${CLUSTER_NAME}" \ + --type=merge \ + -p "{\"spec\":{\"apiEndpoint\":\"${DOCKER_SERVER}\"}}" + +echo " Cluster '${CLUSTER_NAME}' patched — health checks will now use Docker bridge IP" diff --git a/test/e2e/chainsaw-config.yaml b/test/e2e/chainsaw-config.yaml new file mode 100644 index 00000000..cd3a9950 --- /dev/null +++ b/test/e2e/chainsaw-config.yaml @@ -0,0 +1,47 @@ +# Chainsaw global configuration for the compute federation e2e test suite. +# +# Prerequisites +# ───────────── +# Run `task e2e:up` to create the Kind clusters and populate kubeconfigs under +# tmp/e2e/kubeconfigs/ before running Chainsaw. +# +# Running +# ─────── +# From the repository root via Taskfile (recommended): +# +# task e2e:test +# +# Or directly: +# +# KUBECONFIG=tmp/e2e/kubeconfigs/control-plane.yaml \ +# chainsaw test --config test/e2e/chainsaw-config.yaml test/e2e/ +# +# The KUBECONFIG env var sets the "default" cluster (control-plane cell). +# Additional clusters (downstream, pop-dfw, pop-ord) are declared below and +# referenced by name in individual test steps via `cluster: downstream` etc. +# +# Kubeconfig paths below are relative to the working directory where Chainsaw is +# invoked (the project root), NOT relative to this config file's location. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Configuration +metadata: + name: chainsaw +spec: + timeouts: + apply: 30s + assert: 60s + cleanup: 60s + delete: 30s + error: 30s + exec: 30s + clusters: + # Downstream control plane. WorkloadDeployments, PropagationPolicies, + # and Instance write-backs live here. + downstream: + kubeconfig: tmp/e2e/kubeconfigs/downstream.yaml + # POP DFW cell — downstream member cluster labelled topology.datum.net/city-code=dfw. + pop-dfw: + kubeconfig: tmp/e2e/kubeconfigs/pop-dfw.yaml + # POP ORD cell — downstream member cluster labelled topology.datum.net/city-code=ord. 
+ pop-ord: + kubeconfig: tmp/e2e/kubeconfigs/pop-ord.yaml diff --git a/test/e2e/env/README.md b/test/e2e/env/README.md new file mode 100644 index 00000000..671e705d --- /dev/null +++ b/test/e2e/env/README.md @@ -0,0 +1,251 @@ +# Local Kind + Karmada e2e Environment + +This document describes the local multi-cluster environment used for end-to-end +testing of the compute federation layer. + +--- + +## Prerequisites + +| Tool | Minimum version | Install | +|------|----------------|---------| +| [Docker Desktop](https://www.docker.com/products/docker-desktop/) | 4.x | required for Kind | +| [kind](https://kind.sigs.k8s.io/) | v0.23+ | `brew install kind` | +| [kubectl](https://kubernetes.io/docs/tasks/tools/) | v1.28+ | `brew install kubernetes-cli` | +| [helm](https://helm.sh/) | v3.14+ | `brew install helm` | +| [task](https://taskfile.dev/) | v3 | `brew install go-task` | +| Python 3 with PyYAML | 3.9+ | pre-installed on macOS; `pip3 install pyyaml` | +| go | 1.25+ | `brew install go` | + +`karmadactl` is downloaded automatically by `task e2e:up` into `./bin/`. 
+ +--- + +## Cluster Topology + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ compute-control-plane (Kind cluster) │ +│ │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ karmada-system namespace │ │ +│ │ Karmada API Server ←── https://localhost:32443 │ │ +│ │ Karmada Controller Manager │ │ +│ │ Karmada Scheduler │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +│ │ +│ compute operator (WorkloadReconciler, Federator, InstanceProjector)│ +└──────────────────────────┬──────────────────────────────────────────┘ + │ Karmada propagates WorkloadDeployments + ┌────────────────┴─────────────────┐ + │ │ +┌─────────▼──────────┐ ┌──────────▼─────────┐ +│ compute-pop-dfw │ │ compute-pop-ord │ +│ (Kind cluster) │ │ (Kind cluster) │ +│ │ │ │ +│ city-code=dfw │ │ city-code=ord │ +│ Compute CRDs │ │ Compute CRDs │ +│ NSO CRDs │ │ NSO CRDs │ +└────────────────────┘ └────────────────────┘ +``` + +### What lives where + +| Resource | Cluster | +|----------|---------| +| `Workload`, `WorkloadDeployment` (consumer-facing) | Control Plane Cell | +| `WorkloadDeployment` (federation intent), `PropagationPolicy` | Karmada API Server | +| `WorkloadDeployment` (propagated), `Instance`, `NetworkBinding`, `SubnetClaim` | POP cells | +| `Instance` (write-back for visibility) | Karmada API Server | + +--- + +## Running the environment + +### Start + +```bash +task e2e:up +``` + +This is fully idempotent — running it twice will not fail. + +What it does, in order: + +1. Downloads `karmadactl v1.16.0` into `./bin/` (once). +2. Adds the `karmada-charts` Helm repository. +3. Creates Kind clusters `compute-control-plane`, `compute-pop-dfw`, + `compute-pop-ord` (skips any that already exist). +4. Exports kubeconfigs to `./tmp/e2e/kubeconfigs/`. +5. Installs Karmada v1.16.0 via the `karmada-charts/karmada` Helm chart into + `compute-control-plane`, with the API server exposed on NodePort 32443. +6. 
Registers `compute-pop-dfw` and `compute-pop-ord` as member clusters and + labels each with `topology.datum.net/city-code`. +7. Installs compute CRDs to all clusters and the Karmada API server. +8. Installs NSO CRDs to the POP cell clusters. + +### Stop + +```bash +task e2e:down +``` + +Deletes all three Kind clusters and removes `./tmp/e2e/`. + +--- + +## Kubeconfigs + +After `task e2e:up`: + +| File | Cluster | Use for | +|------|---------|---------| +| `tmp/e2e/kubeconfigs/control-plane.yaml` | `compute-control-plane` | kubectl, deploying the compute operator | +| `tmp/e2e/kubeconfigs/karmada.yaml` | Karmada API server | kubectl, karmadactl | +| `tmp/e2e/kubeconfigs/pop-dfw.yaml` | `compute-pop-dfw` | kubectl, inspecting POP cell state | +| `tmp/e2e/kubeconfigs/pop-ord.yaml` | `compute-pop-ord` | kubectl, inspecting POP cell state | + +The `-internal.yaml` variants use the Kind container's Docker bridge IP and are +intended for the Karmada controller running inside Docker — not for direct +developer use. + +### Quick check + +```bash +# Verify cluster list in Karmada +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get clusters + +# Expected output: +# NAME READY AGE +# compute-pop-dfw True ... +# compute-pop-ord True ... + +# Verify city-code labels +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get clusters -L topology.datum.net/city-code +``` + +--- + +## Using the environment from e2e tests + +Import `go.datum.net/compute/test/e2e/env` in your test suite: + +```go +package myfeature_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + computev1alpha1 "go.datum.net/compute/api/v1alpha1" + + "go.datum.net/compute/test/e2e/env" +) + +var testEnv *env.Environment + +func TestMyFeature(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "MyFeature Suite") +} + +var _ = BeforeSuite(func() { + scheme := runtime.NewScheme() + Expect(corev1.AddToScheme(scheme)).To(Succeed()) + Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) + + var err error + testEnv, err = env.New(scheme) + Expect(err).NotTo(HaveOccurred()) +}) + +var _ = It("creates a workload and propagates it", func() { + // Control plane cluster client + cpClient := testEnv.ControlPlane.Client + + // Karmada API server client + karmadaClient := testEnv.Downstream.Client + + // POP DFW cluster client + dfwCell, err := testEnv.POPCell(env.CityCodeDFW) + Expect(err).NotTo(HaveOccurred()) + dfwClient := dfwCell.Client + + _ = cpClient + _ = karmadaClient + _ = dfwClient +}) +``` + +### Environment variable override + +Set `E2E_KUBECONFIG_DIR` to an absolute path to load kubeconfigs from a +different directory (useful in CI): + +```bash +E2E_KUBECONFIG_DIR=/path/to/kubeconfigs go test ./test/e2e/... +``` + +--- + +## Networking notes (macOS) + +On macOS with Docker Desktop, Kind clusters run as Docker containers. The +container-to-container networking works as follows: + +| From | To | Address used | +|------|----|--------------| +| macOS host | Any Kind cluster API server | `localhost:PORT` (port from the exported kubeconfig) | +| macOS host | Karmada API server | `https://localhost:32443` (NodePort) | +| Karmada controller (in Docker) | POP cell API servers | Docker bridge IP (`172.18.x.x:6443`) | + +The `-internal.yaml` kubeconfig variants use Docker bridge IPs with +`insecure-skip-tls-verify: true` because the node certificates do not include +bridge IPs in their SANs. This is acceptable for a local dev environment. 
+ +--- + +## Troubleshooting + +### Karmada API server not reachable + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get ns +``` + +If this times out, check: +1. The Kind cluster is running: `kind get clusters` +2. Port 32443 is mapped: `docker port compute-control-plane-control-plane` +3. The karmada-apiserver pod is running: + ```bash + kubectl --kubeconfig tmp/e2e/kubeconfigs/control-plane.yaml \ + get pods -n karmada-system + ``` + +### POP cluster shows NotReady in Karmada + +The Karmada controller manager uses the Docker bridge IP kubeconfig to reach +POP cells. Check: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + describe cluster compute-pop-dfw +``` + +Then verify the cluster secret contains the expected Docker IP: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get secret -n karmada-system | grep pop-dfw +``` + +### Start fresh + +```bash +task e2e:down && task e2e:up +``` diff --git a/test/e2e/env/env.go b/test/e2e/env/env.go new file mode 100644 index 00000000..7d2c59c6 --- /dev/null +++ b/test/e2e/env/env.go @@ -0,0 +1,233 @@ +// Package env provides helpers for connecting to the local Kind e2e environment +// created by "task e2e:up". +// +// # Environment layout +// +// The environment consists of three Kind clusters and one downstream API server: +// +// - Control plane cell — hosts the compute operator (WorkloadReconciler, +// WorkloadDeploymentFederator, InstanceProjector). +// - Downstream control plane — the federation API server; WorkloadDeployments +// are written here so they can be propagated to POP cells. +// - POP DFW (compute-pop-dfw) — member cluster labelled city-code=dfw. +// - POP ORD (compute-pop-ord) — member cluster labelled city-code=ord. +// +// # Kubeconfig resolution +// +// Kubeconfigs are read from the directory at [DefaultKubeconfigDir] (relative +// to the repository root), unless overridden via the [EnvKubeconfigDir] +// environment variable. 
+// +// Expected files inside that directory: +// +// control-plane.yaml — management / control-plane cell +// downstream.yaml — downstream federation API server (https://localhost:32443) +// pop-dfw.yaml — POP DFW cell (standard Kind localhost-based kubeconfig) +// pop-ord.yaml — POP ORD cell (standard Kind localhost-based kubeconfig) +// +// # Typical usage in a Ginkgo suite +// +// var ( +// testEnv *env.Environment +// ) +// +// var _ = BeforeSuite(func() { +// scheme := runtime.NewScheme() +// Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) +// Expect(corev1.AddToScheme(scheme)).To(Succeed()) +// +// var err error +// testEnv, err = env.New(scheme) +// Expect(err).NotTo(HaveOccurred()) +// }) +package env + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + + k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Environment variable name that overrides the kubeconfig directory. +const EnvKubeconfigDir = "E2E_KUBECONFIG_DIR" + +// DefaultKubeconfigDir is the kubeconfig directory used when [EnvKubeconfigDir] +// is not set. It is resolved relative to the repository root (three directories +// above this source file). +const DefaultKubeconfigDir = "tmp/e2e/kubeconfigs" + +// City codes for the two POP cells created by "task e2e:up". +const ( + CityCodeDFW = "dfw" + CityCodeORD = "ord" +) + +// Environment holds a [ClusterAccess] for each cluster in the local e2e +// environment. All fields are populated by [New]; none are nil on success. +type Environment struct { + // ControlPlane is the management / control-plane cell cluster. + // The compute operator runs here (WorkloadReconciler, + // WorkloadDeploymentFederator, InstanceProjector). + ControlPlane *ClusterAccess + + // Downstream is the downstream control plane. + // WorkloadDeployments and PropagationPolicies live here. 
+ Downstream *ClusterAccess + + // POPCells maps city-code strings (e.g. "dfw", "ord") to the + // corresponding POP cell cluster. Use [Environment.POPCell] for + // safe, error-returning access. + POPCells map[string]*ClusterAccess +} + +// ClusterAccess bundles a REST config and a controller-runtime Client for a +// single cluster. +type ClusterAccess struct { + // Config is the REST config used to build the client. + Config *rest.Config + + // Client is a controller-runtime client scoped to this cluster. + // The client is built with the scheme supplied to [New]. + Client ctrlclient.Client +} + +// New creates an [Environment] by loading kubeconfigs from the configured +// directory and building a controller-runtime client for each cluster using +// the provided scheme. +// +// The scheme should have all relevant types registered before calling New; +// for example compute types, networking types, and core Kubernetes types. +func New(scheme *k8sruntime.Scheme) (*Environment, error) { + dir := kubeconfigDir() + + controlPlane, err := loadCluster(filepath.Join(dir, "control-plane.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("control-plane cluster: %w", err) + } + + downstream, err := loadCluster(filepath.Join(dir, "downstream.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("downstream control plane: %w", err) + } + + popDFW, err := loadCluster(filepath.Join(dir, "pop-dfw.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP DFW cluster: %w", err) + } + + popORD, err := loadCluster(filepath.Join(dir, "pop-ord.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP ORD cluster: %w", err) + } + + return &Environment{ + ControlPlane: controlPlane, + Downstream: downstream, + POPCells: map[string]*ClusterAccess{ + CityCodeDFW: popDFW, + CityCodeORD: popORD, + }, + }, nil +} + +// POPCell returns the [ClusterAccess] for the POP cell with the given city +// code. It returns an error if no POP cell is registered for that code. 
+func (e *Environment) POPCell(cityCode string) (*ClusterAccess, error) { + ca, ok := e.POPCells[cityCode] + if !ok { + known := make([]string, 0, len(e.POPCells)) + for k := range e.POPCells { + known = append(known, k) + } + return nil, fmt.Errorf("no POP cell registered for city code %q (known: %v)", cityCode, known) + } + return ca, nil +} + +// MustPOPCell is like [Environment.POPCell] but panics on error. +// Useful in test setup where a missing POP cell is always a fatal misconfiguration. +func (e *Environment) MustPOPCell(cityCode string) *ClusterAccess { + ca, err := e.POPCell(cityCode) + if err != nil { + panic(err) + } + return ca +} + +// RESTConfigFor is a convenience function that returns a [rest.Config] for the +// named cluster without constructing a client. Useful when the caller needs to +// build a typed clientset directly. +func RESTConfigFor(kubeconfigPath string) (*rest.Config, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + return cfg, nil +} + +// KubeconfigPath returns the absolute path to the kubeconfig file for the +// named cluster. name must be one of "control-plane", "downstream", "pop-dfw", +// or "pop-ord". 
+func KubeconfigPath(name string) string { + return filepath.Join(kubeconfigDir(), name+".yaml") +} + +// ─── internal helpers ──────────────────────────────────────────────────────── + +func loadCluster(kubeconfigPath string, scheme *k8sruntime.Scheme) (*ClusterAccess, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + + c, err := ctrlclient.New(cfg, ctrlclient.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("building client from %s: %w", kubeconfigPath, err) + } + + return &ClusterAccess{ + Config: cfg, + Client: c, + }, nil +} + +// kubeconfigDir returns the directory containing e2e kubeconfigs. +// It honours the E2E_KUBECONFIG_DIR environment variable, otherwise falls +// back to tmp/e2e/kubeconfigs under the repository root. +func kubeconfigDir() string { + if dir := os.Getenv(EnvKubeconfigDir); dir != "" { + return dir + } + return filepath.Join(repoRoot(), DefaultKubeconfigDir) +} + +// repoRoot walks up from this source file to find the repository root +// (identified by the presence of go.mod). +func repoRoot() string { + // Use the file path of this source file as a starting point so the helper + // works regardless of the caller's working directory. + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + // Fallback: assume tests are run from the repo root. + return "." + } + + dir := filepath.Dir(thisFile) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + // Reached filesystem root without finding go.mod. + return "." 
+ } + dir = parent + } +} From b6efa2f97d1225d2b24f2144d1a947622473efb8 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Thu, 4 Jun 2026 20:38:51 -0500 Subject: [PATCH 11/11] test(e2e): federation delivery suites (full-federation, projection, writeback, PP lifecycle, deletion-cascade) Carved out of the federation foundation PR so the controller change reviews without ~900 lines of chainsaw YAML inline. These suites exercise the federation behaviour end-to-end against the shared test/e2e/env harness (which stays with the foundation). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../assert-downstream-wd-exists.yaml | 7 + test/e2e/deletion-cascade/chainsaw-test.yaml | 79 +++++++++ .../deletion-cascade/workload-deployment.yaml | 21 +++ test/e2e/full-federation/chainsaw-test.yaml | 150 ++++++++++++++++++ .../full-federation/workload-deployment.yaml | 21 +++ .../assert-downstream-wd.yaml | 6 + .../assert-projected-instance.yaml | 19 +++ .../instance-projection/chainsaw-test.yaml | 123 ++++++++++++++ .../workload-deployment.yaml | 21 +++ .../assert-downstream-instance.yaml | 16 ++ .../e2e/instance-writeback/chainsaw-test.yaml | 112 +++++++++++++ .../instance-writeback/instance-pop-dfw.yaml | 15 ++ .../assert-pp-exists.yaml | 6 + .../chainsaw-test.yaml | 133 ++++++++++++++++ .../workload-deployment-alpha.yaml | 21 +++ .../workload-deployment-beta.yaml | 21 +++ .../assert-downstream-pp.yaml | 20 +++ .../assert-downstream-wd.yaml | 9 ++ .../chainsaw-test.yaml | 84 ++++++++++ .../workload-deployment.yaml | 22 +++ 20 files changed, 906 insertions(+) create mode 100644 test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml create mode 100644 test/e2e/deletion-cascade/chainsaw-test.yaml create mode 100644 test/e2e/deletion-cascade/workload-deployment.yaml create mode 100644 test/e2e/full-federation/chainsaw-test.yaml create mode 100644 test/e2e/full-federation/workload-deployment.yaml create mode 100644 test/e2e/instance-projection/assert-downstream-wd.yaml create mode 
100644 test/e2e/instance-projection/assert-projected-instance.yaml create mode 100644 test/e2e/instance-projection/chainsaw-test.yaml create mode 100644 test/e2e/instance-projection/workload-deployment.yaml create mode 100644 test/e2e/instance-writeback/assert-downstream-instance.yaml create mode 100644 test/e2e/instance-writeback/chainsaw-test.yaml create mode 100644 test/e2e/instance-writeback/instance-pop-dfw.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml create mode 100644 test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml create mode 100644 test/e2e/workload-deployment-federation/assert-downstream-pp.yaml create mode 100644 test/e2e/workload-deployment-federation/assert-downstream-wd.yaml create mode 100644 test/e2e/workload-deployment-federation/chainsaw-test.yaml create mode 100644 test/e2e/workload-deployment-federation/workload-deployment.yaml diff --git a/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml new file mode 100644 index 00000000..aae65da1 --- /dev/null +++ b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml @@ -0,0 +1,7 @@ +# Assert the WorkloadDeployment is present in the Karmada API server. +# Used both to confirm federation succeeded and as the target for the error: check. 
+apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-cascade-wd diff --git a/test/e2e/deletion-cascade/chainsaw-test.yaml b/test/e2e/deletion-cascade/chainsaw-test.yaml new file mode 100644 index 00000000..03a11ea0 --- /dev/null +++ b/test/e2e/deletion-cascade/chainsaw-test.yaml @@ -0,0 +1,79 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: deletion-cascade +spec: + description: | + Verifies that deleting a WorkloadDeployment from the project namespace causes + the federator to remove the corresponding WorkloadDeployment from Karmada. + + The WorkloadDeploymentFederator adds a finalizer + (compute.datumapis.com/federator) to every project WD it manages. When the + project WD is deleted: + 1. The finalizer's Finalize method runs (blocking deletion until complete). + 2. It deletes the Karmada-side WorkloadDeployment. + 3. It removes the PropagationPolicy if no other WDs for the city remain. + 4. It removes the finalizer, allowing the project WD to be garbage-collected. + + This test validates: project WD deletion → Karmada WD deletion. + + template: true + + steps: + - name: create-wd + description: Create a WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-federation + description: Wait for the WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-cascade-wd + + - name: delete-wd + description: Delete the WorkloadDeployment from the control-plane cluster. 
+ try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: test-cascade-wd + + - name: assert-downstream-wd-deleted + description: Confirm the Karmada copy is removed by the finalizer. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($downstreamNS) + name: test-cascade-wd + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/deletion-cascade/workload-deployment.yaml b/test/e2e/deletion-cascade/workload-deployment.yaml new file mode 100644 index 00000000..39d68a1d --- /dev/null +++ b/test/e2e/deletion-cascade/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-cascade-wd +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/full-federation/chainsaw-test.yaml b/test/e2e/full-federation/chainsaw-test.yaml new file mode 100644 index 00000000..020a2bc9 --- /dev/null +++ b/test/e2e/full-federation/chainsaw-test.yaml @@ -0,0 +1,150 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: full-federation +spec: + description: | + End-to-end federation chain test. + + Exercises the complete path from WorkloadDeployment creation through to + Instance projection on the control-plane cluster: + + 1. Create WorkloadDeployment on control-plane. + 2. WorkloadDeploymentFederator replicates it to Karmada (ns- namespace). + 3. 
Karmada PropagationPolicy routes the WD to pop-dfw. + 4. WorkloadDeploymentReconciler on pop-dfw creates Instance test-full-fed-wd-0. + 5. InstanceReconciler on pop-dfw writes Instance back to Karmada with + label meta.datumapis.com/upstream-cluster-name: cluster-single. + 6. InstanceProjector on control-plane creates a projection of the Instance + in the project namespace. + + Prerequisites: both operator instances must be running (task e2e:operator:start). + + template: true + + steps: + - name: create-workload-deployment + description: Create the WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeploymentFederator replicated the WD to Karmada and status is synced back. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + - assert: + # Wait for the cell operator to write status back to the Karmada WD. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-wd-on-pop-dfw + description: Assert Karmada propagated the WD to pop-dfw and the cell reconciler set status. + cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + # Karmada propagation can take longer than a local apply. 
+ timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-instance-on-pop-dfw + description: Assert WorkloadDeploymentReconciler created an Instance on pop-dfw with a Ready condition. + cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" + + - name: assert-instance-writeback-in-downstream + description: Assert InstanceReconciler wrote the Instance back to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + + - name: assert-instance-projected-to-control-plane + description: Assert InstanceProjector created a projection with status on the control-plane. 
+ try: + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($namespace) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" diff --git a/test/e2e/full-federation/workload-deployment.yaml b/test/e2e/full-federation/workload-deployment.yaml new file mode 100644 index 00000000..70b4cb94 --- /dev/null +++ b/test/e2e/full-federation/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-full-fed-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/instance-projection/assert-downstream-wd.yaml b/test/e2e/instance-projection/assert-downstream-wd.yaml new file mode 100644 index 00000000..705d0893 --- /dev/null +++ b/test/e2e/instance-projection/assert-downstream-wd.yaml @@ -0,0 +1,6 @@ +# Assert the WorkloadDeployment is federated to Karmada (and the Karmada namespace created). +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-projector-wd diff --git a/test/e2e/instance-projection/assert-projected-instance.yaml b/test/e2e/instance-projection/assert-projected-instance.yaml new file mode 100644 index 00000000..0542194d --- /dev/null +++ b/test/e2e/instance-projection/assert-projected-instance.yaml @@ -0,0 +1,19 @@ +# Assert the InstanceProjector created a projection in the project namespace. 
+# +# The InstanceProjector (internal/controller/instance_projector.go): +# - Watches Instances in Karmada that carry upstreamClusterNameLabel +# - Strips "cluster-" prefix to get the cluster name ("single" in single-provider mode) +# - Finds the project namespace by matching ns-{uid} to namespace UIDs +# - Creates/updates the Instance projection in the project namespace +# - Sets an owner reference to the WorkloadDeployment for cascading deletion +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + # namespace is the Chainsaw test namespace (the project namespace on control-plane) + name: test-projected-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + ownerReferences: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + name: test-projector-wd diff --git a/test/e2e/instance-projection/chainsaw-test.yaml b/test/e2e/instance-projection/chainsaw-test.yaml new file mode 100644 index 00000000..16fa9f96 --- /dev/null +++ b/test/e2e/instance-projection/chainsaw-test.yaml @@ -0,0 +1,123 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-projection +spec: + description: | + Verifies that the InstanceProjector watches Instances written back to the + Karmada API server and creates corresponding read-only projections in the + project namespace on the control-plane cluster. + + Flow: + 1. Create a WorkloadDeployment → triggers federator → Karmada namespace created. + 2. Write an Instance to Karmada (simulating a POP-cell InstanceReconciler write-back). + 3. InstanceProjector detects the Karmada Instance and creates a projection in the + project namespace (the Chainsaw test namespace on the control-plane cluster). + 4. Assert the projection exists with the upstream tracking label and an owner + reference to the WorkloadDeployment (for cascading deletion). 
+ + Cluster name label: "cluster-single" + The compute operator runs in single-provider mode for this e2e environment, + registering the control-plane cluster with the multicluster-runtime manager + under the name "single" (see cmd/main.go, wrappedSingleClusterProvider). + + template: true + + steps: + - name: create-wd + description: Create the WorkloadDeployment to trigger federation and namespace creation. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-downstream-namespace + description: Wait for the federated WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-projector-wd + + - name: write-instance-to-downstream + description: | + Write an Instance to Karmada simulating InstanceReconciler write-back. + Uses explicit control-plane kubeconfig to derive downstreamNS and WD UID. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get workloaddeployment test-projector-wd \ + --namespace "$NAMESPACE" \ + -o jsonpath='{.metadata.uid}' + outputs: + - name: wdUID + value: ($stdout) + - script: + env: + - name: KARMADA_NS + value: ($downstreamNS) + - name: WD_UID + value: ($wdUID) + content: | + kubectl apply -f - < is the multicluster-runtime cluster name registered by +# wrappedSingleClusterProvider (always "single" in single-cluster mode) +# - Label meta.datumapis.com/upstream-namespace = the POP-cell namespace +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/chainsaw-test.yaml b/test/e2e/instance-writeback/chainsaw-test.yaml new file mode 100644 index 00000000..32dbbc5d --- /dev/null +++ b/test/e2e/instance-writeback/chainsaw-test.yaml @@ -0,0 +1,112 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-writeback +spec: + description: | + Verifies that the InstanceReconciler running in a POP-cell cluster writes + Instance objects back to the Karmada API server after reconciling the Ready + condition for the first time. + + Write-back convention (internal/controller/instance_controller.go): + - The Instance is written to Karmada at the same namespace/name as the POP-cell Instance. + - Label meta.datumapis.com/upstream-cluster-name is set to + "cluster-" (e.g. "cluster-compute-pop-dfw"). + - Label meta.datumapis.com/upstream-namespace records the originating namespace. 
+ + Note: this test requires the compute operator (InstanceReconciler) to be running + in the DFW POP cell cluster. + + template: true + + steps: + - name: setup-namespaces + description: Create the Instance namespace in the DFW POP cell and Karmada. + try: + - script: + content: | + kubectl get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml apply -f - + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml apply -f - + cleanup: + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + + - name: create-instance-on-pop-dfw + description: Create the Instance on the DFW POP cell cluster. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - apply: + file: instance-pop-dfw.yaml + cleanup: + - script: + content: | + INSTANCE_NS=$(kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}') + kubectl delete instance test-writeback-instance \ + --namespace "$INSTANCE_NS" --ignore-not-found + + - name: assert-instance-in-downstream + description: Wait for the InstanceReconciler to write back the Instance to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/instance-pop-dfw.yaml b/test/e2e/instance-writeback/instance-pop-dfw.yaml new file mode 100644 index 00000000..250eb7d7 --- /dev/null +++ b/test/e2e/instance-writeback/instance-pop-dfw.yaml @@ -0,0 +1,15 @@ +# Instance created in the DFW POP cell. +# ($instanceNS) is the namespace derived from the Chainsaw test namespace UID, +# matching the ns-<uid> convention so the InstanceProjector can resolve it later.
+apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + name: test-writeback-instance + namespace: ($instanceNS) +spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network diff --git a/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml new file mode 100644 index 00000000..77a817a5 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml @@ -0,0 +1,6 @@ +# Asserts that the PropagationPolicy for city dfw exists in the Karmada namespace. Uses the city-dfw name that chainsaw-test.yaml in this directory asserts inline. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: city-dfw diff --git a/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml new file mode 100644 index 00000000..5678c398 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml @@ -0,0 +1,133 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: propagation-policy-lifecycle +spec: + description: | + Verifies the PropagationPolicy lifecycle managed by the WorkloadDeploymentFederator: + + - A PropagationPolicy (city-dfw) is lazily created when the first WorkloadDeployment + for city code "dfw" is federated to Karmada. + - The PropagationPolicy is RETAINED while at least one WorkloadDeployment for + that city code remains in the Karmada namespace. + - The PropagationPolicy is DELETED when the last deployment for the city is removed. + + The test creates two WDs (wd-alpha, wd-beta) both targeting cityCode=dfw, verifies + the PP appears, deletes wd-alpha and asserts the PP is still present, then deletes + wd-beta and waits for the PP to disappear. + + template: true + + steps: + - name: create-deployments + description: Create two WorkloadDeployments targeting dfw on the control-plane.
+ try: + - apply: + file: workload-deployment-alpha.yaml + - apply: + file: workload-deployment-beta.yaml + + - name: assert-policy-created + description: | + Assert both WDs are federated to Karmada and the PropagationPolicy exists. + Both WDs must be present in Karmada before proceeding to the deletion steps; + otherwise wd-alpha's finalizer could see an empty Karmada list and prematurely + delete the PP before wd-beta has been federated. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-alpha + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-beta + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-alpha + description: Delete wd-alpha; wd-beta still targets dfw so the PP must be retained. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-alpha + + - name: assert-policy-retained + description: Assert the PropagationPolicy is still present after wd-alpha is deleted. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - sleep: + duration: 8s + - assert: + timeout: 5s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-beta + description: Delete wd-beta (the last WD for city dfw). + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-beta + + - name: assert-policy-deleted + description: Wait for the PropagationPolicy to be removed once no WDs remain. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + namespace: ($downstreamNS) + name: city-dfw + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml new file mode 100644 index 00000000..f9eb27fd --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-alpha +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml 
b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml new file mode 100644 index 00000000..fd1d65c1 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-beta +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml new file mode 100644 index 00000000..98f8d0f1 --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml @@ -0,0 +1,20 @@ +# Assert the PropagationPolicy was created in the Karmada namespace. +# The name matches the city-dfw PropagationPolicy asserted inline by chainsaw-test.yaml in this directory. NOTE(review): confirm against propagationPolicyNameFor("dfw"). +# ($downstreamNS) is substituted by Chainsaw's template engine. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: city-dfw +spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml new file mode 100644 index 00000000..23c308ff --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml @@ -0,0 +1,9 @@ +# Assert the WorkloadDeployment exists in Karmada with the city-code label.
+# ($downstreamNS) is substituted by Chainsaw's template engine from the script binding. +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/chainsaw-test.yaml b/test/e2e/workload-deployment-federation/chainsaw-test.yaml new file mode 100644 index 00000000..302d89c4 --- /dev/null +++ b/test/e2e/workload-deployment-federation/chainsaw-test.yaml @@ -0,0 +1,84 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: workload-deployment-federation +spec: + description: | + Verifies that the WorkloadDeploymentFederator replicates a WorkloadDeployment + from the project namespace (control-plane cluster) to the Karmada API server + with the correct city-code label and PropagationPolicy. + + The federator follows the ns-<uid> convention for Karmada namespaces, + matching the MappedNamespaceResourceStrategy used by NSO. The test derives + the expected Karmada namespace dynamically from the Chainsaw test namespace UID. + + Verified: + - WorkloadDeployment exists in Karmada at ns-<uid> + - Karmada copy carries label topology.datum.net/city-code: dfw + - PropagationPolicy city-dfw exists in the Karmada namespace, + selecting WDs by city-code and routing them to matching POP-cell clusters. + + template: true + + steps: + - name: derive-ns-and-create-wd + description: Create the WorkloadDeployment; the Karmada namespace is derived in the assert steps. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeployment federated to Karmada with city-code label.
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw + + - name: assert-propagation-policy-in-downstream + description: Assert PropagationPolicy created for city-dfw. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/workload-deployment.yaml b/test/e2e/workload-deployment-federation/workload-deployment.yaml new file mode 100644 index 00000000..0cd2347a --- /dev/null +++ b/test/e2e/workload-deployment-federation/workload-deployment.yaml @@ -0,0 +1,22 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-federation-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + 
name: test-network + + scaleSettings: + minReplicas: 1