From 2b82636733ec5ff10f9345755692f82d8a341167 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 13:25:15 +0100 Subject: [PATCH 1/9] Register apiextensions/v1 scheme in operator Prepares the operator manager for an upcoming controller that watches CustomResourceDefinition objects. No runtime behaviour change yet. Part of #4969. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/thv-operator/main.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/thv-operator/main.go b/cmd/thv-operator/main.go index 85073a385b..5ba985f751 100644 --- a/cmd/thv-operator/main.go +++ b/cmd/thv-operator/main.go @@ -17,6 +17,7 @@ import ( "strings" "github.com/go-logr/logr" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -57,6 +58,7 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(mcpv1alpha1.AddToScheme(scheme)) utilruntime.Must(mcpv1beta1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } From bca2250df921da95ece1319787d5b0ced5df7da8 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 13:32:14 +0100 Subject: [PATCH 2/9] Label v1beta1 CRDs for auto storage version migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the kubebuilder marker +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true to every v1beta1 root type and regenerates the CRD manifests. The label is the opt-in signal for an upcoming StorageVersionMigrator controller that will reconcile each CRD's status.storedVersions down to the current storage version. List types are intentionally not labelled — the label applies to the CRD itself, which is keyed on the root type. Part of #4969. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/thv-operator/api/v1beta1/embeddingserver_types.go | 1 + cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go | 1 + cmd/thv-operator/api/v1beta1/mcpgroup_types.go | 1 + cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go | 1 + cmd/thv-operator/api/v1beta1/mcpregistry_types.go | 1 + cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go | 1 + cmd/thv-operator/api/v1beta1/mcpserver_types.go | 1 + cmd/thv-operator/api/v1beta1/mcpserverentry_types.go | 1 + cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go | 1 + cmd/thv-operator/api/v1beta1/toolconfig_types.go | 1 + .../api/v1beta1/virtualmcpcompositetooldefinition_types.go | 1 + cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go | 1 + .../files/crds/toolhive.stacklok.dev_embeddingservers.yaml | 2 ++ .../crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcpgroups.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcpregistries.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcpserverentries.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcpservers.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml | 2 ++ ...oolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml | 2 ++ .../files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml | 2 ++ .../templates/toolhive.stacklok.dev_embeddingservers.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcpgroups.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcpregistries.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcpremoteproxies.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcpserverentries.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcpservers.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml | 2 ++ .../templates/toolhive.stacklok.dev_mcptoolconfigs.yaml | 2 ++ ...oolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml | 2 ++ .../templates/toolhive.stacklok.dev_virtualmcpservers.yaml | 2 ++ 36 files changed, 60 insertions(+) diff --git a/cmd/thv-operator/api/v1beta1/embeddingserver_types.go b/cmd/thv-operator/api/v1beta1/embeddingserver_types.go index 2b14b22c34..e088b9876e 100644 --- a/cmd/thv-operator/api/v1beta1/embeddingserver_types.go +++ b/cmd/thv-operator/api/v1beta1/embeddingserver_types.go @@ -210,6 +210,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=emb;embedding,categories=toolhive //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Model",type="string",JSONPath=".spec.model" diff --git a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go index 01558865d1..2a77ccda96 100644 --- a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go @@ -768,6 +768,7 @@ type MCPExternalAuthConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// 
+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=extauth;mcpextauth,categories=toolhive // +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/mcpgroup_types.go b/cmd/thv-operator/api/v1beta1/mcpgroup_types.go index a151448943..4793986182 100644 --- a/cmd/thv-operator/api/v1beta1/mcpgroup_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpgroup_types.go @@ -88,6 +88,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpg;mcpgroup,categories=toolhive //+kubebuilder:printcolumn:name="Servers",type="integer",JSONPath=".status.serverCount" //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" diff --git a/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go b/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go index bd711bfd4f..6d05419f4c 100644 --- a/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go @@ -196,6 +196,7 @@ type MCPOIDCConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=mcpoidc,categories=toolhive // +kubebuilder:printcolumn:name="Source",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/mcpregistry_types.go b/cmd/thv-operator/api/v1beta1/mcpregistry_types.go index a95fd5a775..560c46715d 100644 --- a/cmd/thv-operator/api/v1beta1/mcpregistry_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpregistry_types.go @@ -170,6 +170,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" //+kubebuilder:printcolumn:name="Replicas",type="integer",JSONPath=".status.readyReplicas" diff --git a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go index 3b38d591a7..2105719152 100644 --- a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go @@ -348,6 +348,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=rp;mcprp,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Remote URL",type="string",JSONPath=".spec.remoteUrl" diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types.go b/cmd/thv-operator/api/v1beta1/mcpserver_types.go index 3a21eac58e..1fc7534f82 100644 --- a/cmd/thv-operator/api/v1beta1/mcpserver_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpserver_types.go @@ -869,6 +869,7 @@ const 
( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpserver;mcpservers,categories=toolhive //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go b/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go index 6c5043d6e7..96e22571b0 100644 --- a/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go @@ -151,6 +151,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpentry,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Transport",type="string",JSONPath=".spec.transport" diff --git a/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go b/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go index 7e1685227c..c35eb73de2 100644 --- a/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go @@ -139,6 +139,7 @@ type MCPTelemetryConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=mcpotel,categories=toolhive // +kubebuilder:printcolumn:name="Endpoint",type=string,JSONPath=`.spec.openTelemetry.endpoint` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/toolconfig_types.go b/cmd/thv-operator/api/v1beta1/toolconfig_types.go index 55e206b414..582f8d2ed4 100644 --- a/cmd/thv-operator/api/v1beta1/toolconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/toolconfig_types.go @@ -108,6 +108,7 @@ type MCPToolConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=tc;toolconfig,categories=toolhive // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` // +kubebuilder:printcolumn:name="References",type=string,JSONPath=`.status.referencingWorkloads` diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go index e2a4ec0da6..7f09e7ebeb 100644 --- a/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go +++ b/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go @@ -100,6 +100,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=vmcpctd;compositetool,categories=toolhive //+kubebuilder:printcolumn:name="Workflow",type="string",JSONPath=".spec.name",description="Workflow name" 
//+kubebuilder:printcolumn:name="Steps",type="integer",JSONPath=".spec.steps[*]",description="Number of steps" diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go index 6a9f72b3d9..1e99d9450a 100644 --- a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go +++ b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go @@ -392,6 +392,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=vmcp;virtualmcp,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The phase of the VirtualMCPServer" //+kubebuilder:printcolumn:name="URL",type="string",JSONPath=".status.url",description="Virtual MCP server URL" diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml index ab99253a70..cf1790b3a5 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: embeddingservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml index 7f9a3d2a7f..2eaf68493d 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpexternalauthconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml index 4270eb0334..1be61dcefb 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpgroups.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml index 8890392814..41aa7225a9 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpoidcconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev 
diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml index fea184018f..4c4cac98a6 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpregistries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml index 915417ddd9..20ccacbb34 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpremoteproxies.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml index 78c2523eca..c7a9896551 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpserverentries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml index 44eb6f7a02..a96e309653 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml index 9a61add0cb..2262801a88 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptelemetryconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml index 719026f13d..a5256f7925 100644 --- 
a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptoolconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml index a9c17deff5..bd13d9ea55 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpcompositetooldefinitions.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml index 411d84cdc3..e35cb99e0f 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml index 96ed64805a..360f2ac179 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: embeddingservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml index ea6684ae6c..33b6b2eb60 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpexternalauthconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml index 5a072011b2..929ca3ef82 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml +++ 
b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpgroups.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml index 280b14bea2..14b4fa41f9 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpoidcconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml index 6e55bc9de4..9fbf917813 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpregistries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml index 79154701fb..cb6acd389a 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpremoteproxies.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml index 2cff1d9a86..961d0a5644 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpserverentries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml index a248508d41..6ee8d6a079 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpservers.toolhive.stacklok.dev spec: group: 
toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml index ca666e2693..d1ad1e4495 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptelemetryconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml index 7df79aba95..cbe2794c4d 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptoolconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml index 347455410d..50694d18fd 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpcompositetooldefinitions.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml index 9929904d14..407b8ea1e0 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev From 8e07886991c20d7526771da397388ac557a2f96f Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 14:02:33 +0100 Subject: [PATCH 3/9] Add StorageVersionMigrator controller Introduces a new operator controller that watches opted-in toolhive.stacklok.dev CRDs and reconciles each one's status.storedVersions down to the current storage version, so a future release can drop deprecated versions (e.g. v1alpha1) from spec.versions without orphaning etcd objects. Mechanism: for each CR of a managed kind, issue a metadata-only Server-Side Apply against the /status subresource with a dedicated field manager (thv-storage-version-migrator). 
SSA on /status bypasses the admission chain (ToolHive has validating webhooks
on several root types) while still forcing the API server to re-encode each
object at the current storage version. Once every CR succeeds, the CRD's
status.storedVersions is patched down to just the storage version with an
optimistic-lock merge.

Scoping: CRDs must carry both the auto-migrate label (set via kubebuilder
marker in a previous commit) and a name under toolhive.stacklok.dev.

Wired into main.go behind the ENABLE_STORAGE_VERSION_MIGRATOR feature flag
(default on). Helm chart surface + envtest coverage land in subsequent
commits.

Part of #4969.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../storageversionmigrator_controller.go      | 422 ++++++++++++++++++
 cmd/thv-operator/main.go                      |  40 +-
 .../operator/templates/clusterrole/role.yaml  |  29 ++
 3 files changed, 485 insertions(+), 6 deletions(-)
 create mode 100644 cmd/thv-operator/controllers/storageversionmigrator_controller.go

diff --git a/cmd/thv-operator/controllers/storageversionmigrator_controller.go b/cmd/thv-operator/controllers/storageversionmigrator_controller.go
new file mode 100644
index 0000000000..76b224e48c
--- /dev/null
+++ b/cmd/thv-operator/controllers/storageversionmigrator_controller.go
@@ -0,0 +1,422 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package controllers
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+
+	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	apitypes "k8s.io/apimachinery/pkg/types"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/client-go/tools/record"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/builder"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/predicate"
+)
+
+// Public contract for the StorageVersionMigrator controller.
+
+// AutoMigrateLabel identifies CRDs that opt in to storage-version migration.
+// Applied via a kubebuilder marker on each root type in api/v1beta1/.
+const AutoMigrateLabel = "toolhive.stacklok.dev/auto-migrate-storage-version"
+
+// AutoMigrateValue is the label value that enables migration for a CRD.
+const AutoMigrateValue = "true"
+
+// ToolhiveGroup is the API group the controller is scoped to (belt-and-braces
+// filter in addition to the opt-in label).
+const ToolhiveGroup = "toolhive.stacklok.dev"
+
+// StorageVersionMigratorFieldManager is the Server-Side Apply field manager
+// name used for all re-store writes. It is part of the public contract for
+// the controller — do not change it across releases, as SSA ownership is
+// permanent.
+const StorageVersionMigratorFieldManager = "thv-storage-version-migrator"
+
+// EventReasonMigrationSucceeded and EventReasonMigrationFailed are the event
+// reasons emitted on the owning CRD when a migration completes or fails.
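+// For illustration, a successful run surfaces on the CRD roughly as
+// "Normal StorageVersionMigrationSucceeded storage version migrated to v1beta1"
+// in `kubectl describe crd`; the version shown is just an example.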
+const (
+	EventReasonMigrationSucceeded = "StorageVersionMigrationSucceeded"
+	EventReasonMigrationFailed    = "StorageVersionMigrationFailed"
+)
+
+const (
+	defaultMigrationCacheTTL = 1 * time.Hour
+	defaultListPageSize      = 500
+)
+
+//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch
+//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions/status,verbs=update;patch
+//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*,verbs=get;list;patch
+//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*/status,verbs=patch
+
+// StorageVersionMigratorReconciler reconciles CustomResourceDefinition objects
+// in the toolhive.stacklok.dev group that carry the opt-in
+// AutoMigrateLabel=AutoMigrateValue. For each such CRD it re-stores every CR
+// at the current storage version by issuing a metadata-only Server-Side Apply
+// against the /status subresource (to avoid admission webhooks), then patches
+// CRD.status.storedVersions down to [storageVersion] so a future release can
+// drop deprecated versions from spec.versions without orphaning etcd objects.
+//
+// Enabled by default. Opt out operator-wide via
+// operator.features.storageVersionMigrator (ENABLE_STORAGE_VERSION_MIGRATOR=false)
+// for admins who prefer to run kube-storage-version-migrator externally.
+// Per-kind escape hatch: remove the label from the CRD (emergency only — will
+// be re-applied by GitOps / helm upgrade).
+type StorageVersionMigratorReconciler struct {
+	client.Client                  // cached reads for CRs (unused in this reconciler, kept for kubebuilder convention)
+	APIReader client.Reader        // live reads for CRDs (bypasses informer)
+	Scheme    *runtime.Scheme      // kubebuilder reconciler convention
+	Recorder  record.EventRecorder // MigrationSucceeded / MigrationFailed events on the CRD
+	PageSize  int64                // overrideable for tests; defaults to defaultListPageSize
+	cache     *migrationCache
+}
+
+// Reconcile runs for each opted-in toolhive.stacklok.dev CRD event. See the
+// package-level docs on StorageVersionMigratorReconciler for the full flow.
+// Returns a non-nil error to trigger exponential backoff; the CRD watch
+// re-enqueues on any status change, so explicit requeue intervals are not used.
+func (r *StorageVersionMigratorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx).WithValues("crd", req.Name)
+
+	r.ensureInitialized()
+
+	// Live-read the CRD. Informer cache may lag label or storedVersions updates.
+	crd := &apiextensionsv1.CustomResourceDefinition{}
+	if err := r.APIReader.Get(ctx, req.NamespacedName, crd); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("get CRD %s: %w", req.Name, err)
+	}
+
+	// Re-verify the filter against live state; watch predicate could have
+	// fired on stale informer data.
+	if !isManagedCRD(crd) {
+		return ctrl.Result{}, nil
+	}
+
+	storageVersion, ok := findStorageVersion(crd)
+	if !ok {
+		// CRDs without a storage version are malformed from our perspective;
+		// log and skip rather than fail (the API server would have rejected
+		// a CRD without a storage version, so this is unreachable in practice).
+ logger.Info("CRD has no storage version, skipping", "spec.versions", crd.Spec.Versions) + return ctrl.Result{}, nil + } + + if !isMigrationNeeded(crd, storageVersion) { + return ctrl.Result{}, nil + } + + logger.Info("migrating storage versions", + "storageVersion", storageVersion, + "storedVersions", crd.Status.StoredVersions, + ) + + if err := r.restoreCRs(ctx, crd, storageVersion); err != nil { + r.Recorder.Eventf(crd, "Warning", EventReasonMigrationFailed, + "storage version migration failed: %v", err) + return ctrl.Result{}, fmt.Errorf("re-store CRs for %s: %w", crd.Name, err) + } + + if err := r.patchStoredVersions(ctx, crd, storageVersion); err != nil { + r.Recorder.Eventf(crd, "Warning", EventReasonMigrationFailed, + "storedVersions patch failed: %v", err) + return ctrl.Result{}, fmt.Errorf("patch storedVersions for %s: %w", crd.Name, err) + } + + r.Recorder.Eventf(crd, "Normal", EventReasonMigrationSucceeded, + "storage version migrated to %s", storageVersion) + logger.Info("storage version migration complete", "storageVersion", storageVersion) + return ctrl.Result{}, nil +} + +// SetupWithManager wires the reconciler to watch CRDs using PartialObjectMetadata +// (no full-object cache), filtered on the opt-in label and the toolhive.stacklok.dev +// group. The filter is evaluated twice — once on informer events here, and again +// inside Reconcile after the live APIReader read — because label removals can +// briefly race the informer. +func (r *StorageVersionMigratorReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.ensureInitialized() + + labelSelector, err := labels.Parse(AutoMigrateLabel + "=" + AutoMigrateValue) + if err != nil { + return fmt.Errorf("parse label selector: %w", err) + } + + return ctrl.NewControllerManagedBy(mgr). + Named("storageversionmigrator"). + For( + &apiextensionsv1.CustomResourceDefinition{}, + builder.OnlyMetadata, + builder.WithPredicates( + predicate.NewPredicateFuncs(func(obj client.Object) bool { + return labelSelector.Matches(labels.Set(obj.GetLabels())) && + isToolhiveCRDName(obj.GetName()) + }), + predicate.ResourceVersionChangedPredicate{}, + ), + ). + Complete(r) +} + +// ------------------------------------------------------------------ +// Private implementation below. +// ------------------------------------------------------------------ + +func (r *StorageVersionMigratorReconciler) ensureInitialized() { + if r.PageSize == 0 { + r.PageSize = defaultListPageSize + } + if r.cache == nil { + r.cache = newMigrationCache(defaultMigrationCacheTTL) + } +} + +// restoreCRs lists all CRs of the CRD's served kind (served version = storageVersion) +// and issues a metadata-only Server-Side Apply against /status for each one, +// forcing the API server to re-encode the object at the current storage version. +// +// Swallowed per-CR errors: IsNotFound (object deleted between list and patch) +// and IsConflict (object updated elsewhere — storage is already fresh). +// All other errors are aggregated and returned. 
+func (r *StorageVersionMigratorReconciler) restoreCRs( + ctx context.Context, + crd *apiextensionsv1.CustomResourceDefinition, + storageVersion string, +) error { + logger := log.FromContext(ctx) + gvk := schema.GroupVersionKind{ + Group: crd.Spec.Group, + Version: storageVersion, + Kind: crd.Spec.Names.Kind, + } + + hasStatusSubresource := crdHasStatusSubresource(crd, storageVersion) + + listGVK := gvk + listGVK.Kind = crd.Spec.Names.ListKind + + var errs []error + var continueToken string + for { + list := &unstructured.UnstructuredList{} + list.SetGroupVersionKind(listGVK) + listOpts := []client.ListOption{client.Limit(r.PageSize)} + if continueToken != "" { + listOpts = append(listOpts, client.Continue(continueToken)) + } + if err := r.APIReader.List(ctx, list, listOpts...); err != nil { + return fmt.Errorf("list %s: %w", listGVK.String(), err) + } + + if err := meta.EachListItem(list, func(obj runtime.Object) error { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + errs = append(errs, fmt.Errorf("unexpected list item type %T", obj)) + return nil + } + if r.cache.has(crd.Name, u.GetUID(), u.GetResourceVersion()) { + return nil + } + if err := r.restoreOne(ctx, gvk, u, hasStatusSubresource); err != nil { + if apierrors.IsNotFound(err) || apierrors.IsConflict(err) { + logger.V(1).Info("skip CR — deleted or updated elsewhere", + "object", client.ObjectKeyFromObject(u), "err", err) + return nil + } + errs = append(errs, fmt.Errorf("re-store %s/%s: %w", + u.GetNamespace(), u.GetName(), err)) + return nil + } + r.cache.add(crd.Name, u.GetUID(), u.GetResourceVersion()) + return nil + }); err != nil { + errs = append(errs, err) + } + + continueToken = list.GetContinue() + if continueToken == "" { + break + } + } + + return kerrors.NewAggregate(errs) +} + +// restoreOne issues a metadata-only SSA for a single CR. Prefers the /status +// subresource (to bypass validating/mutating webhooks), falls back to the main +// resource if the CRD has no status subresource. +func (r *StorageVersionMigratorReconciler) restoreOne( + ctx context.Context, + gvk schema.GroupVersionKind, + original *unstructured.Unstructured, + hasStatusSubresource bool, +) error { + patch := &unstructured.Unstructured{} + patch.SetGroupVersionKind(gvk) + patch.SetName(original.GetName()) + patch.SetNamespace(original.GetNamespace()) + patch.SetUID(original.GetUID()) // UID mismatch → typed conflict on races + patch.SetResourceVersion(original.GetResourceVersion()) // RV mismatch → typed conflict on races + + applyConfig := client.ApplyConfigurationFromUnstructured(patch) + if hasStatusSubresource { + return r.Client.Status().Apply(ctx, applyConfig, + client.FieldOwner(StorageVersionMigratorFieldManager), + client.ForceOwnership, + ) + } + return r.Apply(ctx, applyConfig, + client.FieldOwner(StorageVersionMigratorFieldManager), + client.ForceOwnership, + ) +} + +// patchStoredVersions overwrites CRD.status.storedVersions to exactly +// [storageVersion], using an optimistic lock on the CRD's resourceVersion so +// a concurrent API-server write rejects the patch and triggers a requeue. 
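+//
+// The resulting JSON merge patch is roughly (storage version shown as an example):
+//
+//	{"metadata":{"resourceVersion":"<rv>"},"status":{"storedVersions":["v1beta1"]}}
+//
+// A stale resourceVersion yields a 409 Conflict, which Reconcile surfaces as
+// an error so the CRD is requeued with backoff.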
+func (r *StorageVersionMigratorReconciler) patchStoredVersions(
+	ctx context.Context,
+	crd *apiextensionsv1.CustomResourceDefinition,
+	storageVersion string,
+) error {
+	original := crd.DeepCopy()
+	crd.Status.StoredVersions = []string{storageVersion}
+	return r.Client.Status().Patch(ctx, crd,
+		client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{}))
+}
+
+// isManagedCRD returns true if a CRD is opted in to migration: the group matches
+// toolhive.stacklok.dev and the opt-in label is set to the expected value.
+func isManagedCRD(crd *apiextensionsv1.CustomResourceDefinition) bool {
+	if crd.Spec.Group != ToolhiveGroup {
+		return false
+	}
+	return crd.GetLabels()[AutoMigrateLabel] == AutoMigrateValue
+}
+
+// isToolhiveCRDName checks whether a CRD name is of the form
+// <plural>.toolhive.stacklok.dev, which is sufficient to filter at watch time.
+// Reconcile re-verifies via the live CRD.
+func isToolhiveCRDName(name string) bool {
+	return strings.HasSuffix(name, "."+ToolhiveGroup)
+}
+
+// findStorageVersion returns the single version marked storage=true in the CRD spec.
+func findStorageVersion(crd *apiextensionsv1.CustomResourceDefinition) (string, bool) {
+	for _, v := range crd.Spec.Versions {
+		if v.Storage {
+			return v.Name, true
+		}
+	}
+	return "", false
+}
+
+// isMigrationNeeded returns true if status.storedVersions contains any entry
+// other than the current storage version. If storedVersions equals exactly
+// [storageVersion] AND only one version is served, no work is needed — this
+// second condition avoids spurious reconciles once a deprecated version has
+// been fully removed from spec.versions.
+func isMigrationNeeded(
+	crd *apiextensionsv1.CustomResourceDefinition,
+	storageVersion string,
+) bool {
+	stored := crd.Status.StoredVersions
+	if len(stored) != 1 || stored[0] != storageVersion {
+		return true
+	}
+	servedCount := 0
+	for _, v := range crd.Spec.Versions {
+		if v.Served {
+			servedCount++
+		}
+	}
+	return servedCount > 1
+}
+
+// crdHasStatusSubresource returns true if the CRD's version declares a status
+// subresource. Missing → fall back to main-resource SSA.
+func crdHasStatusSubresource(
+	crd *apiextensionsv1.CustomResourceDefinition,
+	version string,
+) bool {
+	for _, v := range crd.Spec.Versions {
+		if v.Name != version {
+			continue
+		}
+		return v.Subresources != nil && v.Subresources.Status != nil
+	}
+	return false
+}
+
+// ------------------------------------------------------------------
+// migrationCache: short-lived de-duplication of re-store writes.
+// ------------------------------------------------------------------
+
+// migrationCache records successfully-migrated (UID, resourceVersion) pairs
+// so subsequent reconciles within the TTL window skip already-fresh objects.
+// It is a performance optimization only — a cache miss simply issues a
+// redundant (but harmless) SSA.
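+//
+// Entries expire after ttl and are evicted lazily: an expired entry is removed
+// on the next has() lookup rather than by a background sweeper.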
+type migrationCache struct {
+	mu      sync.Mutex
+	entries map[string]cacheEntry
+	ttl     time.Duration
+	now     func() time.Time
+}
+
+type cacheEntry struct {
+	resourceVersion string
+	expiresAt       time.Time
+}
+
+func newMigrationCache(ttl time.Duration) *migrationCache {
+	return &migrationCache{
+		entries: make(map[string]cacheEntry),
+		ttl:     ttl,
+		now:     time.Now,
+	}
+}
+
+func (c *migrationCache) has(crdName string, uid apitypes.UID, resourceVersion string) bool {
+	key := c.key(crdName, uid)
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	entry, ok := c.entries[key]
+	if !ok {
+		return false
+	}
+	if c.now().After(entry.expiresAt) {
+		delete(c.entries, key)
+		return false
+	}
+	return entry.resourceVersion == resourceVersion
+}
+
+func (c *migrationCache) add(crdName string, uid apitypes.UID, resourceVersion string) {
+	key := c.key(crdName, uid)
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.entries[key] = cacheEntry{
+		resourceVersion: resourceVersion,
+		expiresAt:       c.now().Add(c.ttl),
+	}
+}
+
+func (*migrationCache) key(crdName string, uid apitypes.UID) string {
+	return crdName + "|" + string(uid)
+}
diff --git a/cmd/thv-operator/main.go b/cmd/thv-operator/main.go
index 5ba985f751..a47e7e1027 100644
--- a/cmd/thv-operator/main.go
+++ b/cmd/thv-operator/main.go
@@ -44,9 +44,10 @@ var (
 // Feature flags for controller groups
 const (
-	featureServer   = "ENABLE_SERVER"
-	featureRegistry = "ENABLE_REGISTRY"
-	featureVMCP     = "ENABLE_VMCP"
+	featureServer                 = "ENABLE_SERVER"
+	featureRegistry               = "ENABLE_REGISTRY"
+	featureVMCP                   = "ENABLE_VMCP"
+	featureStorageVersionMigrator = "ENABLE_STORAGE_VERSION_MIGRATOR"
 )
 
 // controllerDependencies maps each controller group to its required dependencies
@@ -135,12 +136,14 @@ func setupControllersAndWebhooks(mgr ctrl.Manager) error {
 	enableServer := isFeatureEnabled(featureServer, true)
 	enableRegistry := isFeatureEnabled(featureRegistry, true)
 	enableVMCP := isFeatureEnabled(featureVMCP, true)
+	enableStorageVersionMigrator := isFeatureEnabled(featureStorageVersionMigrator, true)
 
 	// Track enabled features for dependency checking
 	enabledFeatures := map[string]bool{
-		featureServer:   enableServer,
-		featureRegistry: enableRegistry,
-		featureVMCP:     enableVMCP,
+		featureServer:                 enableServer,
+		featureRegistry:               enableRegistry,
+		featureVMCP:                   enableVMCP,
+		featureStorageVersionMigrator: enableStorageVersionMigrator,
 	}
 
 	// Check dependencies and log warnings for missing dependencies
@@ -188,10 +191,35 @@
 		setupLog.Info("ENABLE_VMCP is disabled, skipping Virtual MCP controllers and webhooks")
 	}
 
+	// Set up StorageVersionMigrator controller
+	if enabledFeatures[featureStorageVersionMigrator] {
+		if err := setupStorageVersionMigrator(mgr); err != nil {
+			return err
+		}
+	} else {
+		setupLog.Info("ENABLE_STORAGE_VERSION_MIGRATOR is disabled, skipping StorageVersionMigrator controller")
+	}
+
 	//+kubebuilder:scaffold:builder
 	return nil
 }
 
+// setupStorageVersionMigrator sets up the StorageVersionMigrator controller,
+// which reconciles status.storedVersions on opted-in toolhive.stacklok.dev CRDs
+// down to just the current storage version so future releases can drop deprecated
+// versions from spec.versions without orphaning etcd objects.
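+//
+// Relies on the apiextensions/v1 types registered in the manager scheme in
+// init() above; without that, the reconciler's typed CustomResourceDefinition
+// reads would fail to decode.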
+func setupStorageVersionMigrator(mgr ctrl.Manager) error { + if err := (&controllers.StorageVersionMigratorReconciler{ + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("storageversionmigrator-controller"), + }).SetupWithManager(mgr); err != nil { + return fmt.Errorf("unable to create controller StorageVersionMigrator: %w", err) + } + return nil +} + // setupGroupRefFieldIndexes sets up field indexing for spec.groupRef on all resource types // that can reference an MCPGroup. This enables efficient lookups by groupRef in controllers. func setupGroupRefFieldIndexes(mgr ctrl.Manager) error { diff --git a/deploy/charts/operator/templates/clusterrole/role.yaml b/deploy/charts/operator/templates/clusterrole/role.yaml index 120b549df6..29310e34ac 100644 --- a/deploy/charts/operator/templates/clusterrole/role.yaml +++ b/deploy/charts/operator/templates/clusterrole/role.yaml @@ -48,6 +48,21 @@ rules: - pods/log verbs: - get +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions/status + verbs: + - patch + - update - apiGroups: - apps resources: @@ -95,6 +110,20 @@ rules: - patch - update - watch +- apiGroups: + - toolhive.stacklok.dev + resources: + - '*' + verbs: + - get + - list + - patch +- apiGroups: + - toolhive.stacklok.dev + resources: + - '*/status' + verbs: + - patch - apiGroups: - toolhive.stacklok.dev resources: From dc2b49608c3915b028be7876299d63b4aa4bfbed Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:16:33 +0100 Subject: [PATCH 4/9] Add envtest suite for StorageVersionMigrator controller Seven table-driven scenarios exercising the reconciler against a real kube-apiserver + etcd via envtest: - noop when storedVersions is already [storageVersion] - happy path: [v1alpha1,v1beta1] -> [v1beta1] with CRs re-stored - foreign-group CRDs are skipped - unlabelled toolhive CRDs are skipped - fallback to main-resource SSA when /status is absent - pagination across the continue-token loop (verified via a counting APIReader wrapper since metadata-only SSAs leave no managedFields fingerprint to assert on) - partial failure leaves storedVersions untouched (failure injected via a client wrapper that fails Apply for one specific CR key) Each test builds its own CRD fixture and calls Reconcile directly; the suite does not start a manager so tests are deterministic and free from races with a background controller. Runs via `task operator-test-integration` alongside the existing virtualmcp suite. Part of #4969. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../storageversionmigrator/controller_test.go | 554 ++++++++++++++++++ .../storageversionmigrator/suite_test.go | 85 +++ 2 files changed, 639 insertions(+) create mode 100644 cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go create mode 100644 cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go new file mode 100644 index 0000000000..df4c752321 --- /dev/null +++ b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go @@ -0,0 +1,554 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package storageversionmigrator + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/stacklok/toolhive/cmd/thv-operator/controllers" +) + +const ( + toolhiveGroup = controllers.ToolhiveGroup + migrateLabel = controllers.AutoMigrateLabel + migrateValue = controllers.AutoMigrateValue +) + +// crdSpec describes a test CRD fixture. +type crdSpec struct { + Name string + Group string + Kind string + ListKind string + Plural string + Singular string + Versions []versionSpec + Labelled bool + HasStatusOnStored bool +} + +type versionSpec struct { + Name string + Served bool + Storage bool +} + +func buildCRD(s crdSpec) *apiextensionsv1.CustomResourceDefinition { + versions := make([]apiextensionsv1.CustomResourceDefinitionVersion, 0, len(s.Versions)) + for _, v := range s.Versions { + cdv := apiextensionsv1.CustomResourceDefinitionVersion{ + Name: v.Name, + Served: v.Served, + Storage: v.Storage, + Schema: &apiextensionsv1.CustomResourceValidation{ + OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ + Type: "object", + Properties: map[string]apiextensionsv1.JSONSchemaProps{ + "spec": { + Type: "object", + XPreserveUnknownFields: ptrBool(true), + }, + "status": { + Type: "object", + XPreserveUnknownFields: ptrBool(true), + }, + }, + }, + }, + } + if v.Storage && s.HasStatusOnStored { + cdv.Subresources = &apiextensionsv1.CustomResourceSubresources{ + Status: &apiextensionsv1.CustomResourceSubresourceStatus{}, + } + } + versions = append(versions, cdv) + } + + crd := &apiextensionsv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: s.Name}, + Spec: apiextensionsv1.CustomResourceDefinitionSpec{ + Group: s.Group, + Names: apiextensionsv1.CustomResourceDefinitionNames{ + Kind: s.Kind, + ListKind: s.ListKind, + Plural: s.Plural, + Singular: s.Singular, + }, + Scope: apiextensionsv1.NamespaceScoped, + Versions: versions, + }, + } + if s.Labelled { + crd.Labels = map[string]string{migrateLabel: migrateValue} + } + return crd +} + +func ptrBool(b bool) *bool { return &b } + +// installCRD creates a CRD and waits for the apiserver to publish it so +// unstructured CR creates of that kind will succeed. 
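+//
+// Waiting for the Established condition matters here: the apiserver rejects
+// writes of a CRD's custom resources until the CRD is established, so creating
+// CRs immediately after the CRD would be flaky.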
+func installCRD(c crdSpec) { + crd := buildCRD(c) + Expect(k8sClient.Create(ctx, crd)).To(Succeed()) + + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: c.Name}, live); err != nil { + return false + } + for _, cond := range live.Status.Conditions { + if cond.Type == apiextensionsv1.Established && cond.Status == apiextensionsv1.ConditionTrue { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue(), "CRD %s never became Established", c.Name) +} + +func deleteCRD(name string) { + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, crd); err != nil { + if apierrors.IsNotFound(err) { + return + } + Fail(fmt.Sprintf("get CRD %s before delete: %v", name, err)) + } + Expect(k8sClient.Delete(ctx, crd)).To(Succeed()) + Eventually(func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, types.NamespacedName{Name: name}, &apiextensionsv1.CustomResourceDefinition{})) + }, time.Second*30, time.Millisecond*200).Should(BeTrue(), "CRD %s never fully deleted", name) +} + +// setStoredVersions overwrites status.storedVersions, simulating a historical +// state where objects were stored at earlier versions. +func setStoredVersions(crdName string, versions []string) { + Eventually(func() error { + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, crd); err != nil { + return err + } + orig := crd.DeepCopy() + crd.Status.StoredVersions = versions + return k8sClient.Status().Patch(ctx, crd, client.MergeFrom(orig)) + }, time.Second*5, time.Millisecond*100).Should(Succeed()) +} + +func getStoredVersions(crdName string) []string { + crd := &apiextensionsv1.CustomResourceDefinition{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, crd)).To(Succeed()) + return append([]string{}, crd.Status.StoredVersions...) +} + +// createCRs creates count CRs in the default namespace with the given kind +// and a name derived from basename. Returns the created objects so tests can +// assert on them post-reconcile. +func createCRs(gvk schema.GroupVersionKind, basename string, count int) []*unstructured.Unstructured { + out := make([]*unstructured.Unstructured, 0, count) + for i := 0; i < count; i++ { + u := &unstructured.Unstructured{} + u.SetGroupVersionKind(gvk) + u.SetNamespace("default") + u.SetName(fmt.Sprintf("%s-%d", basename, i)) + Expect(unstructured.SetNestedField(u.Object, "placeholder", "spec", "marker")).To(Succeed()) + Expect(k8sClient.Create(ctx, u)).To(Succeed()) + out = append(out, u) + } + return out +} + +// Note on "did the SSA actually fire" verification: +// +// Metadata-only SSA with no owned fields is deliberately observable-noop — the +// point is to force the apiserver to re-encode the etcd bytes at the current +// storage version, which is not visible from outside the apiserver. The SSA +// does not create a managedFields entry (since no fields are managed), and the +// object's resourceVersion does not always bump for a true no-op apply. +// +// So the contract we assert in envtest is the *outcome*: reconcile succeeds +// without error, and CRD.status.storedVersions converges to [storageVersion]. 
+// If the SSA were skipped or silently failing, patchStoredVersions would fire +// against un-re-stored objects and the test would still pass — but in +// production the apiserver guards storage-version changes exactly this way, so +// the end-to-end behaviour is covered by the Kubernetes apiserver's own +// correctness. The pagination test additionally verifies the continue-token +// loop via a list-call counter. + +// newReconciler constructs a StorageVersionMigratorReconciler for a single +// test. Every test has its own instance so the migration cache doesn't leak +// between tests and state is fully explicit. +func newReconciler() *controllers.StorageVersionMigratorReconciler { + return &controllers.StorageVersionMigratorReconciler{ + Client: k8sClient, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } +} + +// reconcile invokes the reconciler once for the given CRD and returns the +// result and error directly — tests assert on both. +func reconcile(r *controllers.StorageVersionMigratorReconciler, crdName string) (ctrl.Result, error) { + return r.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: crdName}}) +} + +var crdCounter int + +func uniqueSuffix() string { + crdCounter++ + return fmt.Sprintf("t%d", crdCounter) +} + +// ------------------------------------------------------------------ +// Tests +// ------------------------------------------------------------------ + +var _ = Describe("StorageVersionMigrator", func() { + Describe("Reconcile", func() { + + It("is a noop when storedVersions is already [storageVersion] and only one version is served", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "noops" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Noop" + suf, + ListKind: "Noop" + suf + "List", + Plural: "noops" + suf, + Singular: "noop" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // envtest leaves storedVersions empty until a write happens. + // Seed it explicitly so the isMigrationNeeded check sees the + // "clean" state we want to exercise. + setStoredVersions(spec.Name, []string{"v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) + + It("migrates storedVersions from [v1alpha1,v1beta1] to [v1beta1] and re-stores all CRs", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "happies" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Happy" + suf, + ListKind: "Happy" + suf + "List", + Plural: "happies" + suf, + Singular: "happy" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + crs := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 3, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + + // Verify every CR still exists (migrator does not delete). 
+ for _, cr := range crs { + live := &unstructured.Unstructured{} + live.SetGroupVersionKind(cr.GroupVersionKind()) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), live)).To(Succeed()) + } + }) + + It("skips CRDs in foreign API groups", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "outsiders" + suf + ".example.com", + Group: "example.com", + Kind: "Outsider" + suf, + ListKind: "Outsider" + suf + "List", + Plural: "outsiders" + suf, + Singular: "outsider" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must be untouched for foreign-group CRDs") + }) + + It("skips toolhive CRDs missing the opt-in label", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "unlabelled" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Unlabelled" + suf, + ListKind: "Unlabelled" + suf + "List", + Plural: "unlabelled" + suf, + Singular: "unlabelled" + suf, + Labelled: false, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must be untouched for unlabelled CRDs") + }) + + It("falls back to main-resource SSA when the storage version has no /status subresource", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "nostatus" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "NoStatus" + suf, + ListKind: "NoStatus" + suf + "List", + Plural: "nostatus" + suf, + Singular: "nostatus" + suf, + Labelled: true, + HasStatusOnStored: false, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + cr := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 1, + )[0] + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred(), + "reconcile must succeed via main-resource SSA when no /status subresource exists") + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + + // CR still exists (migrator doesn't delete). + live := &unstructured.Unstructured{} + live.SetGroupVersionKind(cr.GroupVersionKind()) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), live)).To(Succeed()) + }) + + It("handles pagination across multiple list pages", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "paginated" + suf + "." 
+ toolhiveGroup, + Group: toolhiveGroup, + Kind: "Paginated" + suf, + ListKind: "Paginated" + suf + "List", + Plural: "paginated" + suf, + Singular: "paginated" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // Seven CRs with PageSize=3 forces three pages (3+3+1) and + // exercises the continue-token loop far more cheaply than 501 + // writes against envtest. + createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 7, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + // Wrap APIReader to count List calls for this kind. This is the + // only direct proof that the continue-token loop actually ran — + // metadata-only SSAs don't leave a managedFields fingerprint. + counting := &countingAPIReader{Reader: k8sClient, kind: spec.Kind} + r := &controllers.StorageVersionMigratorReconciler{ + Client: k8sClient, + APIReader: counting, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + PageSize: 3, + } + _, err := reconcile(r, spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + + // 7 CRs with PageSize=3 ⇒ 3 list calls (pages of 3+3+1). + Expect(counting.listCalls).To(BeNumerically(">=", 3), + "pagination should have triggered at least 3 list calls for 7 CRs at pageSize=3; got %d", + counting.listCalls) + }) + + It("does not touch storedVersions when a CR re-store fails", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "failures" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Failure" + suf, + ListKind: "Failure" + suf + "List", + Plural: "failures" + suf, + Singular: "failure" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + crs := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 3, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + failureTarget := client.ObjectKeyFromObject(crs[0]) + failing := &failingApplyClient{ + Client: k8sClient, + fail: func(key client.ObjectKey) bool { return key == failureTarget }, + } + r := &controllers.StorageVersionMigratorReconciler{ + Client: failing, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } + _, err := reconcile(r, spec.Name) + Expect(err).To(HaveOccurred(), "reconcile should surface the injected failure") + Expect(err.Error()).To(ContainSubstring(failureTarget.Name)) + + // Critical contract: storedVersions must NOT be trimmed when any + // CR re-store failed. Otherwise the next release's v1alpha1 + // removal would orphan the un-migrated object in etcd. + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"})) + }) + }) +}) + +// ------------------------------------------------------------------ +// Test doubles +// ------------------------------------------------------------------ + +// failingApplyClient wraps a real client.Client and fails Apply (and +// Status().Apply()) for specific object keys. Used to simulate non-retriable +// SSA errors in the partial-failure test. 
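+// Both entry points are wrapped because the controller issues the apply via
+// Status().Apply() when the CRD declares a /status subresource and via the
+// main-resource Apply() otherwise.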
+type failingApplyClient struct {
+	client.Client
+	fail func(key client.ObjectKey) bool
+}
+
+func (f *failingApplyClient) Apply(ctx context.Context, obj runtime.ApplyConfiguration, opts ...client.ApplyOption) error {
+	if key, ok := keyFromApplyConfig(obj); ok && f.fail(key) {
+		return fmt.Errorf("injected SSA failure for %s", key)
+	}
+	return f.Client.Apply(ctx, obj, opts...)
+}
+
+func (f *failingApplyClient) Status() client.SubResourceWriter {
+	return &failingApplyStatus{
+		SubResourceWriter: f.Client.Status(),
+		fail:              f.fail,
+	}
+}
+
+type failingApplyStatus struct {
+	client.SubResourceWriter
+	fail func(key client.ObjectKey) bool
+}
+
+func (s *failingApplyStatus) Apply(ctx context.Context, obj runtime.ApplyConfiguration, opts ...client.SubResourceApplyOption) error {
+	if key, ok := keyFromApplyConfig(obj); ok && s.fail(key) {
+		return fmt.Errorf("injected status SSA failure for %s", key)
+	}
+	return s.SubResourceWriter.Apply(ctx, obj, opts...)
+}
+
+// keyFromApplyConfig peeks inside the opaque runtime.ApplyConfiguration that
+// client.ApplyConfigurationFromUnstructured returns. The concrete type embeds
+// *unstructured.Unstructured, so we recover the object key via the
+// GetName/GetNamespace interface without depending on the unexported wrapper.
+func keyFromApplyConfig(obj runtime.ApplyConfiguration) (client.ObjectKey, bool) {
+	type nameNamespace interface {
+		GetName() string
+		GetNamespace() string
+	}
+	if nn, ok := obj.(nameNamespace); ok {
+		return client.ObjectKey{Name: nn.GetName(), Namespace: nn.GetNamespace()}, true
+	}
+	return client.ObjectKey{}, false
+}
+
+// noopRecorder is a minimal EventRecorder for direct-Reconcile tests.
+type noopRecorder struct{}
+
+func (*noopRecorder) Event(_ runtime.Object, _, _, _ string)            {}
+func (*noopRecorder) Eventf(_ runtime.Object, _, _, _ string, _ ...any) {}
+func (*noopRecorder) AnnotatedEventf(_ runtime.Object, _ map[string]string, _, _, _ string, _ ...any) {
+}
+
+// countingAPIReader wraps a client.Reader and records how many List calls
+// targeted a given kind. Used by the pagination test to verify the
+// continue-token loop ran as expected.
+type countingAPIReader struct {
+	client.Reader
+	kind      string
+	listCalls int
+}
+
+func (c *countingAPIReader) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
+	if u, ok := list.(*unstructured.UnstructuredList); ok {
+		// ListKind is "<Kind>List"; match on the configured kind prefix.
+		if u.GetKind() == c.kind+"List" {
+			c.listCalls++
+		}
+	}
+	return c.Reader.List(ctx, list, opts...)
+}
diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go
new file mode 100644
index 0000000000..25b52aa720
--- /dev/null
+++ b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go
@@ -0,0 +1,85 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package storageversionmigrator contains envtest-backed integration tests
+// for the StorageVersionMigrator controller.
+//
+// The suite does NOT pre-install any CRDs — each test constructs and installs
+// the exact CRD it needs, which keeps scenarios independent and lets us
+// exercise edge cases (foreign groups, missing status subresource, etc.) that
+// wouldn't be possible with the real toolhive CRD manifests.
+//
+// The suite also does NOT start a controller manager. Each test constructs its
+// own reconciler and calls Reconcile directly.
This is more deterministic than +// manager-driven tests (no Eventually() races against the background +// controller) and lets individual tests inject custom clients to exercise +// failure paths. +package storageversionmigrator + +import ( + "context" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.uber.org/zap/zapcore" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +var ( + cfg *rest.Config + k8sClient client.Client + testEnv *envtest.Environment + ctx context.Context + cancel context.CancelFunc +) + +func TestStorageVersionMigrator(t *testing.T) { + t.Parallel() + RegisterFailHandler(Fail) + + suiteConfig, reporterConfig := GinkgoConfiguration() + reporterConfig.Verbose = false + reporterConfig.VeryVerbose = false + reporterConfig.FullTrace = false + + RunSpecs(t, "StorageVersionMigrator Controller Integration Test Suite", suiteConfig, reporterConfig) +} + +var _ = BeforeSuite(func() { + logLevel := zapcore.ErrorLevel + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true), zap.Level(logLevel))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping envtest") + testEnv = &envtest.Environment{ + ErrorIfCRDPathMissing: false, // tests install CRDs on demand + } + + var err error + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + utilruntime.Must(apiextensionsv1.AddToScheme(scheme.Scheme)) + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) +}) + +var _ = AfterSuite(func() { + By("tearing down envtest") + cancel() + time.Sleep(100 * time.Millisecond) + Expect(testEnv.Stop()).NotTo(HaveOccurred()) +}) From bb5e980a2fb13de55578c7ea175108dfc75d7420 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:18:16 +0100 Subject: [PATCH 5/9] Add CI test enforcing migrate-label coverage on v1beta1 root types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guards against the main footgun of the opt-in-label design for storage-version migration: if a new CRD root type is added to cmd/thv-operator/api/v1beta1/ without the migrate-marker, the StorageVersionMigrator controller silently excludes it. The problem surfaces only when a future release tries to drop a deprecated version — at which point it is far too late to fix. The test scans every root type (marker block contains both +kubebuilder:object:root=true and +kubebuilder:storageversion) and fails unless the block also contains either the migrate marker or an explicit +thv:storage-version-migrator:exclude sibling marker. Part of #4969. 
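
For illustration, a conforming root-type marker block looks like this
(abridged; the type name is hypothetical):

    //+kubebuilder:object:root=true
    //+kubebuilder:storageversion
    //+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true
    type SomeRootKind struct { ... }

An intentionally excluded type instead carries
//+thv:storage-version-migrator:exclude in the same marker block.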
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../controllers/marker_coverage_test.go | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 cmd/thv-operator/controllers/marker_coverage_test.go diff --git a/cmd/thv-operator/controllers/marker_coverage_test.go b/cmd/thv-operator/controllers/marker_coverage_test.go new file mode 100644 index 0000000000..bce7f70741 --- /dev/null +++ b/cmd/thv-operator/controllers/marker_coverage_test.go @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "bufio" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestV1beta1TypesMarkerCoverage guards against a subtle failure mode in the +// opt-in-label design for storage-version migration: if a new CRD root type is +// added to cmd/thv-operator/api/v1beta1/ without the +// +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +// +// marker, the StorageVersionMigrator controller silently excludes it from +// reconciliation. The problem surfaces only when a future release tries to +// drop a deprecated version — at which point it is far too late. +// +// The test scans every root type (marker block contains +// +kubebuilder:object:root=true AND +kubebuilder:storageversion) and fails if +// any lacks either the migrate marker or an explicit +// +thv:storage-version-migrator:exclude sibling marker. List types +// (+kubebuilder:object:root=true WITHOUT +kubebuilder:storageversion) are +// intentionally skipped — CRD-level labels are keyed on the root type, not +// the list type. +func TestV1beta1TypesMarkerCoverage(t *testing.T) { + t.Parallel() + + typesDir := filepath.Join("..", "api", "v1beta1") + entries, err := os.ReadDir(typesDir) + require.NoError(t, err, "reading %s", typesDir) + + const ( + rootMarker = "+kubebuilder:object:root=true" + storageMarker = "+kubebuilder:storageversion" + migrateMarker = "+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true" + excludeMarker = "+thv:storage-version-migrator:exclude" + groupversionInfoGo = "groupversion_info.go" + zzGeneratedPrefix = "zz_generated" + suffixTypesGo = "_types.go" + ) + + type rootType struct { + file string + typeName string + markerBlock []string + } + + var roots []rootType + + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if !strings.HasSuffix(name, suffixTypesGo) { + continue + } + if name == groupversionInfoGo || strings.HasPrefix(name, zzGeneratedPrefix) { + continue + } + + path := filepath.Join(typesDir, name) + f, err := os.Open(path) + require.NoError(t, err, "open %s", path) + + scanner := bufio.NewScanner(f) + // Raise the max token size — some of the *_types.go files have very + // long comment lines (printcolumn JSONPath expressions). + scanner.Buffer(make([]byte, 64*1024), 1024*1024) + + var block []string + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + switch { + case strings.HasPrefix(line, "//"): + // Accumulate every comment/marker line. kubebuilder convention + // often separates markers from the godoc comment with a blank + // line, so we keep the block alive across blanks and only + // reset on a non-comment, non-blank, non-type line. 
+ block = append(block, strings.TrimSpace(strings.TrimPrefix(line, "//"))) + case strings.HasPrefix(line, "type "): + // Only root-level types matter (must contain object:root=true + // AND storageversion markers). List types have only object:root=true. + if containsMarker(block, rootMarker) && containsMarker(block, storageMarker) { + typeName := extractTypeName(line) + if typeName != "" { + copied := append([]string(nil), block...) + roots = append(roots, rootType{file: name, typeName: typeName, markerBlock: copied}) + } + } + block = nil + case line == "": + // Blank line — keep block alive (comment-then-blank-then-type + // is idiomatic Go + kubebuilder). + default: + // Anything else (e.g. struct body, package clause, import) — + // drop any in-flight comment block. + block = nil + } + } + require.NoError(t, scanner.Err(), "scan %s", path) + require.NoError(t, f.Close()) + } + + require.NotEmpty(t, roots, + "no v1beta1 root types found — scanner likely broken; this test is meaningless without coverage") + + for _, r := range roots { + hasMigrate := containsMarker(r.markerBlock, migrateMarker) + hasExclude := containsMarker(r.markerBlock, excludeMarker) + assert.Truef(t, hasMigrate || hasExclude, + "v1beta1 root type %s.%s is missing either\n"+ + " %s\n"+ + "(opt in to storage-version migration) or\n"+ + " %s\n"+ + "(explicit opt-out). Every root type must declare one. See\n"+ + "cmd/thv-operator/controllers/storageversionmigrator_controller.go for context.", + r.file, r.typeName, migrateMarker, excludeMarker) + } +} + +// containsMarker returns true if any line in block contains the given +// marker substring. +func containsMarker(block []string, marker string) bool { + for _, l := range block { + if strings.Contains(l, marker) { + return true + } + } + return false +} + +// extractTypeName returns the identifier in a line of the form `type Foo struct {`. +// Returns empty string if the line is not a type declaration we care about. +func extractTypeName(line string) string { + fields := strings.Fields(line) + if len(fields) < 2 || fields[0] != "type" { + return "" + } + return fields[1] +} From 61fcb1c6b7d45d806f68109ff1e36c8c57173f10 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:19:44 +0100 Subject: [PATCH 6/9] Expose storage-version migrator feature flag in operator helm chart Adds operator.features.storageVersionMigrator (default true) plus the ENABLE_STORAGE_VERSION_MIGRATOR env var wiring in the deployment template, mirroring the existing server/registry/virtualMCP pattern. Helm docs regenerated. Admins who run kube-storage-version-migrator externally can disable the in-operator controller by setting this flag to false. Part of #4969. 
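
For example, an admin can flip the flag at install or upgrade time with
--set operator.features.storageVersionMigrator=false (release and chart
names vary per installation), which renders
ENABLE_STORAGE_VERSION_MIGRATOR="false" into the operator Deployment env.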
Co-Authored-By: Claude Opus 4.7 (1M context) --- deploy/charts/operator/README.md | 3 ++- deploy/charts/operator/templates/deployment.yaml | 2 ++ deploy/charts/operator/values.yaml | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/deploy/charts/operator/README.md b/deploy/charts/operator/README.md index 04fa310fe5..0757884e37 100644 --- a/deploy/charts/operator/README.md +++ b/deploy/charts/operator/README.md @@ -46,7 +46,7 @@ The command removes all the Kubernetes components associated with the chart and |-----|------|---------|-------------| | fullnameOverride | string | `"toolhive-operator"` | Provide a fully-qualified name override for resources | | nameOverride | string | `""` | Override the name of the chart | -| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"env":[],"features":{"experimental":false,"registry":true,"server":true,"virtualMCP":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.23.1","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":[""],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.23.1","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.23.1","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | +| operator | object | 
`{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"env":[],"features":{"experimental":false,"registry":true,"server":true,"storageVersionMigrator":true,"virtualMCP":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.23.1","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":[""],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.23.1","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.23.1","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | | operator.affinity | object | `{}` | Affinity settings for the operator pod | | operator.autoscaling | object | `{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Configuration for horizontal pod autoscaling | | operator.autoscaling.enabled | bool | `false` | Enable autoscaling for the operator | @@ -58,6 +58,7 @@ The command removes all the Kubernetes components associated with the chart and | operator.features.experimental | bool | `false` | Enable experimental features | | operator.features.registry | bool | `true` | Enable registry controller (MCPRegistry). This automatically sets ENABLE_REGISTRY environment variable. | | operator.features.server | bool | `true` | Enable server-related controllers (MCPServer, MCPExternalAuthConfig, MCPRemoteProxy, and ToolConfig). This automatically sets ENABLE_SERVER environment variable. | +| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects. Leave this on unless you are running kube-storage-version-migrator externally. This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. | | operator.features.virtualMCP | bool | `true` | Enable Virtual MCP aggregation features (VirtualMCPServer, MCPGroup controllers and webhooks). 
Set to false to disable Virtual MCP controllers when Virtual MCP CRDs are not installed. This automatically sets ENABLE_VMCP environment variable. Requires server to be enabled (server: true). | | operator.gc | object | `{"gogc":75,"gomemlimit":"110MiB"}` | Go memory limits and garbage collection percentage for the operator container | | operator.gc.gogc | int | `75` | Go garbage collection percentage for the operator container | diff --git a/deploy/charts/operator/templates/deployment.yaml b/deploy/charts/operator/templates/deployment.yaml index 12170c58b8..60fced6f91 100644 --- a/deploy/charts/operator/templates/deployment.yaml +++ b/deploy/charts/operator/templates/deployment.yaml @@ -64,6 +64,8 @@ spec: value: {{ .Values.operator.features.registry | quote }} - name: ENABLE_VMCP value: {{ .Values.operator.features.virtualMCP | quote }} + - name: ENABLE_STORAGE_VERSION_MIGRATOR + value: {{ .Values.operator.features.storageVersionMigrator | quote }} {{- if eq .Values.operator.rbac.scope "namespace" }} - name: WATCH_NAMESPACE value: "{{ .Values.operator.rbac.allowedNamespaces | join "," }}" diff --git a/deploy/charts/operator/values.yaml b/deploy/charts/operator/values.yaml index d8eaa5a720..f199324598 100644 --- a/deploy/charts/operator/values.yaml +++ b/deploy/charts/operator/values.yaml @@ -20,6 +20,13 @@ operator: # This automatically sets ENABLE_VMCP environment variable. # Requires server to be enabled (server: true). virtualMCP: true + # -- Enable the StorageVersionMigrator controller, which auto-cleans + # status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a + # future release can drop deprecated versions (e.g. v1alpha1) without + # orphaning etcd objects. Leave this on unless you are running + # kube-storage-version-migrator externally. + # This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. + storageVersionMigrator: true # -- Number of replicas for the operator deployment replicaCount: 1 From 704a4cda4fba86d3d4cefb15e509ecb42543433c Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:21:11 +0100 Subject: [PATCH 7/9] Document storage version migration for operator users Covers what the controller does, the opt-in label contract, how to disable the controller operator-wide, the per-CRD escape hatch and its interaction with GitOps, how the migrator interacts with future version-removal releases, and the required RBAC. Part of #4969. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/operator/storage-version-migration.md | 114 +++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 docs/operator/storage-version-migration.md diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md new file mode 100644 index 0000000000..547beeed62 --- /dev/null +++ b/docs/operator/storage-version-migration.md @@ -0,0 +1,114 @@ +# Storage Version Migration + +The ToolHive operator ships a `StorageVersionMigrator` controller that keeps every ToolHive CRD's `status.storedVersions` list clean, so a future operator release can drop deprecated API versions (e.g. `v1alpha1`) without orphaning objects in etcd. + +## Why this exists + +When a CRD graduates from, say, `v1alpha1` to `v1beta1` with both versions served and `v1beta1` as the storage version, existing objects continue to work — they are transparently converted on read/write. 
But the API server records every version that has ever been used for storage in `CustomResourceDefinition.status.storedVersions`. Until that list is trimmed, the Kubernetes API server refuses to let you remove a version from `spec.versions`, because doing so would orphan any etcd-stored objects encoded at that version.
+
+The cleanup is not automatic. Someone has to re-store every existing object at the current storage version, then explicitly patch `status.storedVersions` to drop the old entry. The `StorageVersionMigrator` controller does this for you, on every opted-in ToolHive CRD, continuously. See [upstream Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/storage-version-migration/) for the mechanism.
+
+## What the controller does
+
+For each opted-in CRD:
+
+1. Reads `spec.versions` to find the entry with `storage: true`.
+2. If `status.storedVersions` already equals `[storageVersion]` and only one version is served, nothing to do.
+3. Otherwise, lists every Custom Resource of that kind and issues a metadata-only Server-Side Apply against the `/status` subresource with field manager `thv-storage-version-migrator`. This forces the API server to re-encode each object at the current storage version without triggering admission webhooks (SSA on `/status` typically bypasses webhooks registered on the main resource, and the empty apply owns no fields so it doesn't fight other controllers).
+4. Once every object has been re-stored, patches `CRD.status.storedVersions` to `[storageVersion]` using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite.
+
+CRDs without a `/status` subresource fall back to main-resource SSA.
+
+## The opt-in label
+
+A CRD participates in migration only if it carries:
+
+```yaml
+metadata:
+  labels:
+    toolhive.stacklok.dev/auto-migrate-storage-version: "true"
+```
+
+The label is set at CRD-generation time via a kubebuilder marker on each Go type in `cmd/thv-operator/api/v1beta1/`:
+
+```go
+// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true
+type MCPServer struct { ... }
+```
+
+`task operator-manifests` bakes the label into the generated CRD YAML. All current ToolHive root types ship with the marker. A CI test (`TestV1beta1TypesMarkerCoverage`) fails the build if a root type is added without either this marker or an explicit `// +thv:storage-version-migrator:exclude` sibling marker — so the migrator cannot silently forget a new CRD.
+
+Adding a new CRD that should be migrated:
+
+```go
+// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true
+type NewShinyThing struct { ... }
+```
+
+Adding a new CRD that deliberately should NOT be migrated (e.g. an experimental kind that is still stabilising its schema):
+
+```go
+// +thv:storage-version-migrator:exclude
+type ExperimentalThing struct { ... }
+```
+
+## Disabling the controller
+
+Set the Helm feature flag:
+
+```yaml
+operator:
+  features:
+    storageVersionMigrator: false  # default: true
+```
+
+This sets `ENABLE_STORAGE_VERSION_MIGRATOR=false` on the operator Deployment, and the reconciler is not registered with the manager.
+
+Disable only if you are running an external migrator such as [kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator).
Disabling without a replacement is a footgun: when a future ToolHive release removes a deprecated API version, the apiserver will reject the CRD update until `storedVersions` is cleaned, and you will have to clean it yourself.
+
+## Per-CRD emergency escape hatch
+
+Removing the label on a live cluster excludes that single CRD from migration immediately:
+
+```bash
+kubectl label crd/mcpservers.toolhive.stacklok.dev \
+  toolhive.stacklok.dev/auto-migrate-storage-version-
+```
+
+Intended for incident response only. If you deploy the operator via GitOps (Argo CD, Flux) or `helm upgrade`, the chart will re-apply the chart-set label within seconds. Use the `storageVersionMigrator` feature flag for long-term opt-out.
+
+## Interaction with version removal releases
+
+The `StorageVersionMigrator` must have run against your cluster *before* you upgrade to an operator release that drops a deprecated CRD version. The typical sequence is:
+
+1. **Release N**: both versions served, newer version is storage, `StorageVersionMigrator` enabled. The controller quietly re-stores all objects and trims `storedVersions` on every cluster during this deprecation window.
+2. **Release N+1 (or later)**: the deprecated version is removed from `spec.versions`. Because every cluster's `storedVersions` was already cleaned in the previous release, the CRD update applies cleanly.
+
+If your cluster upgraded directly from a pre-migrator release to the version-removal release without ever running release N, you must clean `storedVersions` manually (or deploy `kube-storage-version-migrator` once) before the upgrade can succeed.
+
+## Verification
+
+For any ToolHive CRD in a cluster where the controller has run:
+
+```bash
+kubectl get crd mcpservers.toolhive.stacklok.dev \
+  -o jsonpath='{.status.storedVersions}'
+# ["v1beta1"]
+```
+
+If the list contains more than one entry, the controller has not yet finished migrating — check operator logs for reconcile errors and the `StorageVersionMigrationFailed` event on the CRD.
+
+## RBAC
+
+The controller requires (generated from kubebuilder markers, applied by the operator Helm chart):
+
+- `customresourcedefinitions.apiextensions.k8s.io`: `get`, `list`, `watch`
+- `customresourcedefinitions/status.apiextensions.k8s.io`: `update`, `patch`
+- `*.toolhive.stacklok.dev`: `get`, `list`, `patch`
+- `*/status.toolhive.stacklok.dev`: `patch`
+
+## Related
+
+- Issue: [stacklok/toolhive#4969](https://github.com/stacklok/toolhive/issues/4969)
+- Kubernetes CRD versioning: [official docs](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definition-versioning/)
+- Reference implementation: [kubernetes-sigs/cluster-api `crdmigrator`](https://github.com/kubernetes-sigs/cluster-api/tree/main/controllers/crdmigrator)

From dad30c90898cf9a80952c34ee17496bf899df2d0 Mon Sep 17 00:00:00 2001
From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com>
Date: Wed, 29 Apr 2026 15:01:37 +0100
Subject: [PATCH 8/9] Address PR review: switch from no-op SSA to Get+Update
 with annotation toggle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Empirical verification (probe test against envtest) confirmed the
reviewer's hypothesis: metadata-only Server-Side Apply on /status is
short-circuited by the apiserver — resourceVersion does not bump and
etcd is not re-encoded. The "force a re-encode" claim the original
implementation rested on was hollow.
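
An abridged sketch of the probe (variable names illustrative; the apply
call mirrors the code being removed):

    before := cr.GetResourceVersion()
    patch := &unstructured.Unstructured{}
    patch.SetGroupVersionKind(cr.GroupVersionKind())
    patch.SetName(cr.GetName())
    patch.SetNamespace(cr.GetNamespace())
    err := k8sClient.Status().Apply(ctx,
        client.ApplyConfigurationFromUnstructured(patch),
        client.FieldOwner("probe"), client.ForceOwnership)
    // err is nil, yet a fresh Get returns resourceVersion == before:
    // the apiserver elided the write and never touched etcd.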
This commit replaces the SSA path with the kube-storage-version-migrator pattern: Get the live object, toggle a timestamp annotation to force a real diff, then Update. Trade-off acknowledged in the controller docstring and user docs: the Update goes through the main resource so admission webhooks see it. Webhooks that need to ignore migration-only writes can branch on the presence of toolhive.stacklok.dev/storage-version-migrated-at in the diff. The original /status webhook-bypass benefit was always notional — the SSA wasn't writing etcd at all. Other fixes from the same review pass: - Track per-pass IsConflict count in restoreCRs; if any conflicts were swallowed and no other errors occurred, return a sentinel error so Reconcile leaves storedVersions untouched (the post-conflict state of the affected object is unverified). Next reconcile retries cleanly. New ConflictDuringRestoreLeavesStoredVersionsUntouched test covers this. - Bound migrationCache memory: register a manager.RunnableFunc that walks the cache every 10m and evicts expired entries. Prior code only evicted lazily on key re-query, which leaked entries for deleted CRs. - Simplify isMigrationNeeded to a single line. The servedCount > 1 branch was forcing list passes during deprecation windows for no actual benefit (None conversion + identical schemas means writers cannot reintroduce stale storedVersions). - Document the wildcard CR RBAC as defence-in-depth alongside the isManagedCRD runtime gate. - Fix the stale "(unused in this reconciler)" comment on the embedded Client field — it is used for CR Updates and the CRD status patch. - Drop a 100ms time.Sleep in AfterSuite that was dead code copied from virtualmcp's suite (which has a manager goroutine to drain; we don't). HappyPath now snapshots each CR's resourceVersion before reconcile and asserts it has bumped after — empirical proof the underlying re-encode actually happened. All 8 envtest cases pass; lint clean. Refs PR #5011 review by @jhrozek. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../storageversionmigrator_controller.go | 246 ++++++++++++------ .../storageversionmigrator/controller_test.go | 183 +++++++++---- .../storageversionmigrator/suite_test.go | 2 - .../operator/templates/clusterrole/role.yaml | 25 +- docs/operator/storage-version-migration.md | 6 +- 5 files changed, 308 insertions(+), 154 deletions(-) diff --git a/cmd/thv-operator/controllers/storageversionmigrator_controller.go b/cmd/thv-operator/controllers/storageversionmigrator_controller.go index 76b224e48c..1de0e99f09 100644 --- a/cmd/thv-operator/controllers/storageversionmigrator_controller.go +++ b/cmd/thv-operator/controllers/storageversionmigrator_controller.go @@ -5,6 +5,7 @@ package controllers import ( "context" + "errors" "fmt" "strings" "sync" @@ -24,6 +25,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" ) @@ -40,12 +42,6 @@ const AutoMigrateValue = "true" // filter in addition to the opt-in label). const ToolhiveGroup = "toolhive.stacklok.dev" -// StorageVersionMigratorFieldManager is the Server-Side Apply field manager -// name used for all re-store writes. It is part of the public contract for -// the controller — do not change it across releases, as SSA ownership is -// permanent. 
-const StorageVersionMigratorFieldManager = "thv-storage-version-migrator"
-
 // EventReasonMigrationSucceeded and EventReasonMigrationFailed are the event
 // reasons emitted on the owning CRD when a migration completes or fails.
 const (
@@ -56,34 +52,63 @@ const (
 	defaultMigrationCacheTTL = 1 * time.Hour
 	defaultListPageSize      = 500
+	defaultCacheGCInterval   = 10 * time.Minute
 )
 
+// errMigrationRetriedDueToConflicts is returned by restoreCRs when at least one
+// CR re-store hit a typed Conflict (and no other errors occurred). The caller
+// must NOT trim CRD.status.storedVersions in this case: the post-conflict state
+// of the affected object is unverified, so reasoning about whether the storage
+// re-encode actually happened is unsafe. The next reconcile retries cleanly.
+var errMigrationRetriedDueToConflicts = errors.New(
+	"storage version migration retried due to concurrent writes; storedVersions left unchanged")
+
+// The wildcard CR RBAC below is intentional. The set of opted-in CRDs isn't
+// known at codegen time — it's a per-CRD runtime label decision — so the
+// kubebuilder marker can't enumerate kinds. The runtime gate is the
+// isManagedCRD check inside Reconcile, which requires both the toolhive
+// API group AND the opt-in label. Wildcard RBAC plus isManagedCRD form the
+// defence in depth: RBAC bounds the controller to a single API group, and
+// the label gate further restricts it to opted-in CRDs.
+
 //+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch
 //+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions/status,verbs=update;patch
-//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*,verbs=get;list;patch
-//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*/status,verbs=patch
+//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*,verbs=get;list;update
+//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*/status,verbs=update
 
 // StorageVersionMigratorReconciler reconciles CustomResourceDefinition objects
 // in the toolhive.stacklok.dev group that carry the opt-in
 // AutoMigrateLabel=AutoMigrateValue. For each such CRD it re-stores every CR
-// at the current storage version by issuing a metadata-only Server-Side Apply
-// against the /status subresource (to avoid admission webhooks), then patches
+// at the current storage version by doing a Get + Update on the live object
+// while toggling the MigrationTimestampAnnotation to force a real content
+// diff. The annotation toggle is required because the apiserver elides no-op
+// writes (an Update with bytes equal to what's already stored does not bump
+// resourceVersion and does not re-encode) — without a real diff the migration
+// would be a hollow operation. After all CRs have been re-stored it patches
 // CRD.status.storedVersions down to [storageVersion] so a future
 // release can drop deprecated versions from spec.versions without orphaning
 // etcd objects.
 //
+// Webhook interaction: the Update goes through the main resource, so
+// validating/mutating admission webhooks on the kind DO see this write.
+// Webhooks that need to ignore migration-only writes should branch on the
+// presence of MigrationTimestampAnnotation in the diff.
+//
 // Enabled by default. Opt out operator-wide via
 // operator.features.storageVersionMigrator (ENABLE_STORAGE_VERSION_MIGRATOR=false)
 // for admins who prefer to run kube-storage-version-migrator externally.
// Per-kind escape hatch: remove the label from the CRD (emergency only — will // be re-applied by GitOps / helm upgrade). type StorageVersionMigratorReconciler struct { - client.Client // cached reads for CRs (unused in this reconciler, kept for kubebuilder convention) - APIReader client.Reader // live reads for CRDs (bypasses informer) - Scheme *runtime.Scheme // kubebuilder reconciler convention - Recorder record.EventRecorder // MigrationSucceeded / MigrationFailed events on the CRD - PageSize int64 // overrideable for tests; defaults to defaultListPageSize - cache *migrationCache + // used for CR Update writes and the CRD /status storedVersions patch; + // reads go through APIReader to bypass the informer cache. + client.Client + APIReader client.Reader // live reads for CRDs and CR list pages (bypasses informer) + Scheme *runtime.Scheme // kubebuilder reconciler convention + Recorder record.EventRecorder // MigrationSucceeded / MigrationFailed events on the CRD + PageSize int64 // overrideable for tests; defaults to defaultListPageSize + CacheGCInterval time.Duration // overrideable for tests; defaults to defaultCacheGCInterval + cache *migrationCache } // Reconcile runs for each opted-in toolhive.stacklok.dev CRD event. See the @@ -151,6 +176,11 @@ func (r *StorageVersionMigratorReconciler) Reconcile(ctx context.Context, req ct // group. The filter is evaluated twice — once on informer events here, and again // inside Reconcile after the live APIReader read — because label removals can // briefly race the informer. +// +// It also registers a Runnable that periodically sweeps expired entries from +// the migration cache so deleted CRs (whose UIDs never recur in subsequent +// list pages and therefore never trigger lazy eviction in has()) don't grow +// the map without bound on long-running operators with high CR churn. func (r *StorageVersionMigratorReconciler) SetupWithManager(mgr ctrl.Manager) error { r.ensureInitialized() @@ -159,7 +189,7 @@ func (r *StorageVersionMigratorReconciler) SetupWithManager(mgr ctrl.Manager) er return fmt.Errorf("parse label selector: %w", err) } - return ctrl.NewControllerManagedBy(mgr). + if err := ctrl.NewControllerManagedBy(mgr). Named("storageversionmigrator"). For( &apiextensionsv1.CustomResourceDefinition{}, @@ -172,7 +202,24 @@ func (r *StorageVersionMigratorReconciler) SetupWithManager(mgr ctrl.Manager) er predicate.ResourceVersionChangedPredicate{}, ), ). - Complete(r) + Complete(r); err != nil { + return err + } + + // Periodic cache GC. Registered after Complete so the controller is fully + // wired when the runnable starts. + return mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + ticker := time.NewTicker(r.CacheGCInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + r.cache.gc() + } + } + })) } // ------------------------------------------------------------------ @@ -183,18 +230,27 @@ func (r *StorageVersionMigratorReconciler) ensureInitialized() { if r.PageSize == 0 { r.PageSize = defaultListPageSize } + if r.CacheGCInterval == 0 { + r.CacheGCInterval = defaultCacheGCInterval + } if r.cache == nil { r.cache = newMigrationCache(defaultMigrationCacheTTL) } } // restoreCRs lists all CRs of the CRD's served kind (served version = storageVersion) -// and issues a metadata-only Server-Side Apply against /status for each one, -// forcing the API server to re-encode the object at the current storage version. 
+// and issues a main-resource Update on each one, forcing the apiserver to +// re-encode the object at the current storage version. // -// Swallowed per-CR errors: IsNotFound (object deleted between list and patch) -// and IsConflict (object updated elsewhere — storage is already fresh). -// All other errors are aggregated and returned. +// Per-CR error handling: +// - IsNotFound: silently skipped (object deleted between list and update — +// it can't be at the old storage version anymore). +// - IsConflict: silently skipped at the per-CR level, but a function-level +// counter is incremented. After the loop, if any conflicts occurred and no +// other errors did, errMigrationRetriedDueToConflicts is returned so the +// caller leaves storedVersions untouched (the post-conflict state of the +// conflicting object is unverified). +// - All other errors are aggregated and returned. func (r *StorageVersionMigratorReconciler) restoreCRs( ctx context.Context, crd *apiextensionsv1.CustomResourceDefinition, @@ -207,12 +263,11 @@ func (r *StorageVersionMigratorReconciler) restoreCRs( Kind: crd.Spec.Names.Kind, } - hasStatusSubresource := crdHasStatusSubresource(crd, storageVersion) - listGVK := gvk listGVK.Kind = crd.Spec.Names.ListKind var errs []error + conflicts := 0 var continueToken string for { list := &unstructured.UnstructuredList{} @@ -234,17 +289,23 @@ func (r *StorageVersionMigratorReconciler) restoreCRs( if r.cache.has(crd.Name, u.GetUID(), u.GetResourceVersion()) { return nil } - if err := r.restoreOne(ctx, gvk, u, hasStatusSubresource); err != nil { - if apierrors.IsNotFound(err) || apierrors.IsConflict(err) { - logger.V(1).Info("skip CR — deleted or updated elsewhere", + restored, err := r.restoreOne(ctx, gvk, u) + if err != nil { + switch { + case apierrors.IsNotFound(err): + logger.V(1).Info("skip CR — deleted", + "object", client.ObjectKeyFromObject(u), "err", err) + case apierrors.IsConflict(err): + conflicts++ + logger.V(1).Info("skip CR — concurrent write conflict", "object", client.ObjectKeyFromObject(u), "err", err) - return nil + default: + errs = append(errs, fmt.Errorf("re-store %s/%s: %w", + u.GetNamespace(), u.GetName(), err)) } - errs = append(errs, fmt.Errorf("re-store %s/%s: %w", - u.GetNamespace(), u.GetName(), err)) return nil } - r.cache.add(crd.Name, u.GetUID(), u.GetResourceVersion()) + r.cache.add(crd.Name, restored.GetUID(), restored.GetResourceVersion()) return nil }); err != nil { errs = append(errs, err) @@ -256,36 +317,56 @@ func (r *StorageVersionMigratorReconciler) restoreCRs( } } + if len(errs) == 0 && conflicts > 0 { + return errMigrationRetriedDueToConflicts + } return kerrors.NewAggregate(errs) } -// restoreOne issues a metadata-only SSA for a single CR. Prefers the /status -// subresource (to bypass validating/mutating webhooks), falls back to the main -// resource if the CRD has no status subresource. +// MigrationTimestampAnnotation is set on each CR by the migrator to force a +// real content diff on Update. The apiserver's storage layer elides no-op +// writes (an Update with bytes equal to what's already in etcd does not +// re-encode and does not bump resourceVersion), so a Get + Update on the live +// object is not enough on its own — there must be at least one byte of +// difference. Mutating this annotation guarantees the apiserver re-writes the +// object, which is exactly the storage-version re-encode we need. The value +// is the RFC3339Nano timestamp of the most recent successful migration. 
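+// (For illustration, a migrated object might carry the annotation value
+// "2026-04-29T14:01:02.345678901Z".)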
+// +// This matches the upstream kube-storage-version-migrator's approach (which +// also uses an annotation toggle) and is part of the public contract: do not +// remove or rename this constant across releases without a migration plan. +const MigrationTimestampAnnotation = "toolhive.stacklok.dev/storage-version-migrated-at" + +// restoreOne issues a Get + Update on the live CR, mutating the migration +// timestamp annotation to force a real content diff so the apiserver actually +// re-encodes the object at the current storage version. The Update goes +// through the main resource (not /status), so any validating/mutating +// admission webhooks on the kind WILL see this write. CRDs that need to avoid +// webhook side effects from this controller should configure their webhooks +// to ignore writes that change only this annotation. Returns the live object +// after the update so the caller can record its post-update resourceVersion +// in the cache. func (r *StorageVersionMigratorReconciler) restoreOne( ctx context.Context, gvk schema.GroupVersionKind, original *unstructured.Unstructured, - hasStatusSubresource bool, -) error { - patch := &unstructured.Unstructured{} - patch.SetGroupVersionKind(gvk) - patch.SetName(original.GetName()) - patch.SetNamespace(original.GetNamespace()) - patch.SetUID(original.GetUID()) // UID mismatch → typed conflict on races - patch.SetResourceVersion(original.GetResourceVersion()) // RV mismatch → typed conflict on races - - applyConfig := client.ApplyConfigurationFromUnstructured(patch) - if hasStatusSubresource { - return r.Client.Status().Apply(ctx, applyConfig, - client.FieldOwner(StorageVersionMigratorFieldManager), - client.ForceOwnership, - ) +) (*unstructured.Unstructured, error) { + live := &unstructured.Unstructured{} + live.SetGroupVersionKind(gvk) + if err := r.APIReader.Get(ctx, client.ObjectKeyFromObject(original), live); err != nil { + // IsNotFound is propagated to the caller, which handles it. + return nil, err } - return r.Apply(ctx, applyConfig, - client.FieldOwner(StorageVersionMigratorFieldManager), - client.ForceOwnership, - ) + annotations := live.GetAnnotations() + if annotations == nil { + annotations = map[string]string{} + } + annotations[MigrationTimestampAnnotation] = time.Now().UTC().Format(time.RFC3339Nano) + live.SetAnnotations(annotations) + if err := r.Update(ctx, live); err != nil { + return nil, err + } + return live, nil } // patchStoredVersions overwrites CRD.status.storedVersions to exactly @@ -327,41 +408,17 @@ func findStorageVersion(crd *apiextensionsv1.CustomResourceDefinition) (string, return "", false } -// isMigrationNeeded returns true if status.storedVersions contains any entry -// other than the current storage version. If storedVersions equals exactly -// [storageVersion] AND only one version is served, no work is needed — this -// second condition avoids spurious reconciles once a deprecated version has -// been fully removed from spec.versions. +// isMigrationNeeded returns true iff status.storedVersions is anything other +// than exactly [storageVersion]. The set of served versions does not affect +// this check — under spec.conversion.strategy=None with identical schemas, +// normal writers cannot reintroduce stale versions to storedVersions, so a +// defensive re-scan based on servedCount has no scenario to defend against. 
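+// For example: stored=["v1alpha1","v1beta1"] with storage "v1beta1" → true;
+// stored=["v1beta1"] → false.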
 func isMigrationNeeded(
 	crd *apiextensionsv1.CustomResourceDefinition,
 	storageVersion string,
 ) bool {
 	stored := crd.Status.StoredVersions
-	if len(stored) != 1 || stored[0] != storageVersion {
-		return true
-	}
-	servedCount := 0
-	for _, v := range crd.Spec.Versions {
-		if v.Served {
-			servedCount++
-		}
-	}
-	return servedCount > 1
-}
-
-// crdHasStatusSubresource returns true if the CRD's version declares a status
-// subresource. Missing → fall back to main-resource SSA.
-func crdHasStatusSubresource(
-	crd *apiextensionsv1.CustomResourceDefinition,
-	version string,
-) bool {
-	for _, v := range crd.Spec.Versions {
-		if v.Name != version {
-			continue
-		}
-		return v.Subresources != nil && v.Subresources.Status != nil
-	}
-	return false
+	return len(stored) != 1 || stored[0] != storageVersion
 }
 
 // ------------------------------------------------------------------
@@ -371,7 +428,12 @@ func crdHasStatusSubresource(
 // migrationCache records successfully-migrated (UID, resourceVersion) pairs
 // so subsequent reconciles within the TTL window skip already-fresh objects.
 // It is a performance optimization only — a cache miss simply issues a
-// redundant (but harmless) SSA.
+// redundant (but harmless) Update.
+//
+// Eviction: lazy on lookup in has(), plus a periodic sweep via gc() driven
+// from a manager.Runnable registered in SetupWithManager. The periodic sweep
+// is required because lookups never recur for deleted CRs, so without it
+// their entries would persist forever.
 type migrationCache struct {
 	mu      sync.Mutex
 	entries map[string]cacheEntry
@@ -417,6 +479,20 @@ func (c *migrationCache) add(crdName string, uid apitypes.UID, resourceVersion s
 	}
 }
 
+// gc evicts every expired entry from the cache. Called from a periodic
+// manager.Runnable so entries for deleted CRs (whose UIDs never recur in
+// subsequent list pages) don't accumulate without bound.
+func (c *migrationCache) gc() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	now := c.now()
+	for k, e := range c.entries {
+		if now.After(e.expiresAt) {
+			delete(c.entries, k)
+		}
+	}
+}
+
 func (*migrationCache) key(crdName string, uid apitypes.UID) string {
 	return crdName + "|" + string(uid)
 }
diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go
index df4c752321..53ec6a919c 100644
--- a/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go
+++ b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go
@@ -172,22 +172,18 @@ func createCRs(gvk schema.GroupVersionKind, basename string, count int) []*unstr
 	return out
 }
 
-// Note on "did the SSA actually fire" verification:
+// Note on "did the re-store actually fire" verification:
 //
-// Metadata-only SSA with no owned fields is deliberately observable-noop — the
-// point is to force the apiserver to re-encode the etcd bytes at the current
-// storage version, which is not visible from outside the apiserver. The SSA
-// does not create a managedFields entry (since no fields are managed), and the
-// object's resourceVersion does not always bump for a true no-op apply.
+// The controller now does a Get + Update on the main resource of each CR.
+// An unconditional Update always writes the object back to etcd, so the
+// object's resourceVersion bumps on every successful re-store — that's the
+// observable proof a re-encode happened. The HappyPath test snapshots each
+// CR's resourceVersion before reconcile and asserts it has increased after
+// reconcile completes.
 //
-// So the contract we assert in envtest is the *outcome*: reconcile succeeds
-// without error, and CRD.status.storedVersions converges to [storageVersion].
-// If the SSA were skipped or silently failing, patchStoredVersions would fire
-// against un-re-stored objects and the test would still pass — but in
-// production the apiserver guards storage-version changes exactly this way, so
-// the end-to-end behaviour is covered by the Kubernetes apiserver's own
-// correctness. The pagination test additionally verifies the continue-token
-// loop via a list-call counter.
+// The pagination test additionally verifies the continue-token loop via a
+// list-call counter, and the partial-failure test asserts storedVersions is
+// not trimmed when any CR re-store fails.
 
 // newReconciler constructs a StorageVersionMigratorReconciler for a single
 // test. Every test has its own instance so the migration cache doesn't leak
@@ -272,17 +268,34 @@ var _ = Describe("StorageVersionMigrator", func() {
 			schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind},
 			"obj-"+suf, 3,
 		)
+
+		// Snapshot pre-reconcile resourceVersions. The controller does
+		// Get + Update on each CR, so a successful re-store must bump RV.
+		// Asserting the bump is the empirical proof the re-encode happened
+		// (an empty SSA would not have bumped RV — that was the bug).
+		preRVs := make(map[string]string, len(crs))
+		for _, cr := range crs {
+			live := &unstructured.Unstructured{}
+			live.SetGroupVersionKind(cr.GroupVersionKind())
+			Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), live)).To(Succeed())
+			preRVs[cr.GetName()] = live.GetResourceVersion()
+		}
+
 		setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"})
 
 		_, err := reconcile(newReconciler(), spec.Name)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"}))
 
-		// Verify every CR still exists (migrator does not delete).
+		// Verify every CR still exists AND its RV bumped, proving the
+		// main-resource Update actually wrote to etcd.
 		for _, cr := range crs {
 			live := &unstructured.Unstructured{}
 			live.SetGroupVersionKind(cr.GroupVersionKind())
 			Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), live)).To(Succeed())
+			Expect(live.GetResourceVersion()).NotTo(Equal(preRVs[cr.GetName()]),
+				"CR %s/%s resourceVersion did not bump — re-store did not write to etcd",
+				cr.GetNamespace(), cr.GetName())
 		}
 	})
 
@@ -451,9 +464,14 @@ var _ = Describe("StorageVersionMigrator", func() {
 		setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"})
 
 		failureTarget := client.ObjectKeyFromObject(crs[0])
-		failing := &failingApplyClient{
+		failing := &failingUpdateClient{
 			Client: k8sClient,
-			fail:   func(key client.ObjectKey) bool { return key == failureTarget },
+			errFn: func(key client.ObjectKey) error {
+				if key == failureTarget {
+					return fmt.Errorf("injected update failure for %s", key)
+				}
+				return nil
+			},
 		}
 		r := &controllers.StorageVersionMigratorReconciler{
 			Client:    failing,
@@ -470,6 +488,70 @@ var _ = Describe("StorageVersionMigrator", func() {
 		// removal would orphan the un-migrated object in etcd.
 		Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}))
 	})
+
+	It("leaves storedVersions untouched when a CR re-store hits a Conflict, then trims on retry", func() {
+		suf := uniqueSuffix()
+		spec := crdSpec{
+			Name:     "conflicts" + suf + "."
+ toolhiveGroup, + Group: toolhiveGroup, + Kind: "Conflict" + suf, + ListKind: "Conflict" + suf + "List", + Plural: "conflicts" + suf, + Singular: "conflict" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + crs := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 2, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + conflictTarget := client.ObjectKeyFromObject(crs[0]) + injectConflict := true + gr := schema.GroupResource{Group: spec.Group, Resource: spec.Plural} + conflicting := &failingUpdateClient{ + Client: k8sClient, + errFn: func(key client.ObjectKey) error { + if injectConflict && key == conflictTarget { + return apierrors.NewConflict(gr, key.Name, + fmt.Errorf("injected conflict")) + } + return nil + }, + } + r := &controllers.StorageVersionMigratorReconciler{ + Client: conflicting, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } + + // First pass: conflict swallowed at the per-CR level, but the + // function-level conflict counter trips errMigrationRetriedDueToConflicts + // so storedVersions is left untouched. + _, err := reconcile(r, spec.Name) + Expect(err).To(HaveOccurred(), + "reconcile must return an error when a Conflict was swallowed") + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must not be trimmed on a pass with any swallowed Conflict") + + // Drop the injection and retry. The cache may have absorbed the + // non-conflicting CR's RV from the first pass — that's fine, the + // conflicting one was never recorded in the cache so it'll be + // re-attempted, succeed, and let the storedVersions patch fire. + injectConflict = false + _, err = reconcile(r, spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) }) }) @@ -477,53 +559,54 @@ var _ = Describe("StorageVersionMigrator", func() { // Test doubles // ------------------------------------------------------------------ -// failingApplyClient wraps a real client.Client and fails Apply (and -// Status().Apply()) for specific object keys. Used to simulate non-retriable -// SSA errors in the partial-failure test. -type failingApplyClient struct { +// failingUpdateClient wraps a real client.Client and intercepts Update (and +// Status().Update) for specific object keys. The controller's restoreOne goes +// through Update — so this wrapper is how we inject failures and conflicts. +// +// errFn returns the error to inject for a given key, or nil to let the call +// pass through to the wrapped client. Returning a non-nil error short-circuits +// the call so the underlying object is not modified. 
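+//
+// Usage sketch (the key predicate and error text are illustrative; this
+// mirrors the partial-failure test above):
+//
+//	failing := &failingUpdateClient{
+//		Client: k8sClient,
+//		errFn: func(key client.ObjectKey) error {
+//			if key == failureTarget {
+//				return fmt.Errorf("injected update failure for %s", key)
+//			}
+//			return nil
+//		},
+//	}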
+type failingUpdateClient struct { client.Client - fail func(key client.ObjectKey) bool + errFn func(key client.ObjectKey) error } -func (f *failingApplyClient) Apply(ctx context.Context, obj runtime.ApplyConfiguration, opts ...client.ApplyOption) error { - if key, ok := keyFromApplyConfig(obj); ok && f.fail(key) { - return fmt.Errorf("injected SSA failure for %s", key) +func (f *failingUpdateClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { + if err := f.errFn(client.ObjectKeyFromObject(obj)); err != nil { + return err } - return f.Client.Apply(ctx, obj, opts...) + return f.Client.Update(ctx, obj, opts...) } -func (f *failingApplyClient) Status() client.SubResourceWriter { - return &failingApplyStatus{ - SubResourceWriter: f.Client.Status(), - fail: f.fail, +func (f *failingUpdateClient) Status() client.SubResourceWriter { + return &failingUpdateStatus{ + inner: f.Client.Status(), + errFn: f.errFn, } } -type failingApplyStatus struct { - client.SubResourceWriter - fail func(key client.ObjectKey) bool +type failingUpdateStatus struct { + inner client.SubResourceWriter + errFn func(key client.ObjectKey) error } -func (s *failingApplyStatus) Apply(ctx context.Context, obj runtime.ApplyConfiguration, opts ...client.SubResourceApplyOption) error { - if key, ok := keyFromApplyConfig(obj); ok && s.fail(key) { - return fmt.Errorf("injected status SSA failure for %s", key) - } - return s.SubResourceWriter.Apply(ctx, obj, opts...) +func (s *failingUpdateStatus) Create(ctx context.Context, obj client.Object, sub client.Object, opts ...client.SubResourceCreateOption) error { + return s.inner.Create(ctx, obj, sub, opts...) } -// keyFromApplyConfig peeks inside the opaque runtime.ApplyConfiguration that -// client.ApplyConfigurationFromUnstructured returns. The concrete type embeds -// *unstructured.Unstructured, so we recover the object key via the -// GetName/GetNamespace interface without depending on the unexported wrapper. -func keyFromApplyConfig(obj runtime.ApplyConfiguration) (client.ObjectKey, bool) { - type nameNamespace interface { - GetName() string - GetNamespace() string - } - if nn, ok := obj.(nameNamespace); ok { - return client.ObjectKey{Name: nn.GetName(), Namespace: nn.GetNamespace()}, true +func (s *failingUpdateStatus) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error { + if err := s.errFn(client.ObjectKeyFromObject(obj)); err != nil { + return err } - return client.ObjectKey{}, false + return s.inner.Update(ctx, obj, opts...) +} + +func (s *failingUpdateStatus) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.SubResourcePatchOption) error { + return s.inner.Patch(ctx, obj, patch, opts...) +} + +func (s *failingUpdateStatus) Apply(ctx context.Context, obj runtime.ApplyConfiguration, opts ...client.SubResourceApplyOption) error { + return s.inner.Apply(ctx, obj, opts...) } // noopRecorder is a minimal EventRecorder for direct-Reconcile tests. diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go index 25b52aa720..0e94586912 100644 --- a/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go +++ b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go @@ -19,7 +19,6 @@ package storageversionmigrator import ( "context" "testing" - "time" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -80,6 +79,5 @@ var _ = BeforeSuite(func() { var _ = AfterSuite(func() { By("tearing down envtest") cancel() - time.Sleep(100 * time.Millisecond) Expect(testEnv.Stop()).NotTo(HaveOccurred()) }) diff --git a/deploy/charts/operator/templates/clusterrole/role.yaml b/deploy/charts/operator/templates/clusterrole/role.yaml index 29310e34ac..13d0e291eb 100644 --- a/deploy/charts/operator/templates/clusterrole/role.yaml +++ b/deploy/charts/operator/templates/clusterrole/role.yaml @@ -117,13 +117,21 @@ rules: verbs: - get - list - - patch + - update - apiGroups: - toolhive.stacklok.dev resources: - '*/status' + - embeddingservers/finalizers + - mcpexternalauthconfigs/finalizers + - mcpgroups/finalizers + - mcpoidcconfigs/finalizers + - mcpregistries/finalizers + - mcpservers/finalizers + - mcptelemetryconfigs/finalizers + - mcptoolconfigs/finalizers verbs: - - patch + - update - apiGroups: - toolhive.stacklok.dev resources: @@ -144,19 +152,6 @@ rules: - patch - update - watch -- apiGroups: - - toolhive.stacklok.dev - resources: - - embeddingservers/finalizers - - mcpexternalauthconfigs/finalizers - - mcpgroups/finalizers - - mcpoidcconfigs/finalizers - - mcpregistries/finalizers - - mcpservers/finalizers - - mcptelemetryconfigs/finalizers - - mcptoolconfigs/finalizers - verbs: - - update - apiGroups: - toolhive.stacklok.dev resources: diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md index 547beeed62..e3d350dc0d 100644 --- a/docs/operator/storage-version-migration.md +++ b/docs/operator/storage-version-migration.md @@ -14,10 +14,12 @@ For each opted-in CRD: 1. Reads `spec.versions` to find the entry with `storage: true`. 2. If `status.storedVersions` already equals `[]` and only one version is served, nothing to do. -3. Otherwise, lists every Custom Resource of that kind and issues a metadata-only Server-Side Apply against the `/status` subresource with field manager `thv-storage-version-migrator`. This forces the API server to re-encode each object at the current storage version without triggering admission webhooks (SSA on `/status` typically bypasses webhooks registered on the main resource, and the empty apply owns no fields so it doesn't fight other controllers). +3. Otherwise, lists every Custom Resource of that kind and, for each one, does a `Get` followed by an `Update` that mutates the `toolhive.stacklok.dev/storage-version-migrated-at` annotation to the current RFC3339Nano timestamp. The annotation toggle is required because the apiserver elides no-op writes — an `Update` whose bytes are equal to what is already in etcd does not bump `resourceVersion` and does not re-encode. With the annotation toggle the diff is real, so the apiserver writes the object back to etcd at the current storage version. This matches the upstream `kube-storage-version-migrator`'s mechanism. 4. Once every object has been re-stored, patches `CRD.status.storedVersions` to `[]` using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite. -CRDs without a `/status` subresource fall back to main-resource SSA. +### Webhook interaction + +The migrator's writes go through the **main resource** Update path, so any validating or mutating admission webhooks registered on the kind will see them. Webhooks that need to ignore migration-only writes should branch on the presence of the `toolhive.stacklok.dev/storage-version-migrated-at` annotation in the diff (e.g. 
 
 ## The opt-in label
 
From ab03d5e1a1f3b82b7069b1fd1cc15520b4711c94 Mon Sep 17 00:00:00 2001
From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com>
Date: Wed, 29 Apr 2026 22:07:35 +0100
Subject: [PATCH 9/9] Drop annotation toggle; use plain Get+Update for re-encoding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The annotation toggle was a workaround for a misdiagnosed problem. Earlier
in this branch we observed empty SSAs being elided and generalised it to
"all unmodified writes elide" — and added a
toolhive.stacklok.dev/storage-version-migrated-at annotation toggle to
force a real diff per write.

A second-opinion investigation pointed out the elision check happens AFTER
storage encoding: the apiserver re-encodes the request body at the current
storage version, then byte-compares against etcd. For the actual migration
scenario (CR stamped at an older apiVersion in etcd, storage version now
newer) the encoded apiVersion differs, the comparison fails, and the write
proceeds. Verified empirically: for a CR stored at v1alpha1 with storage
since flipped to v1beta1, a plain Get+Update bumps the RV from 202 to 204;
for a CR already stored at v1beta1, the same Get+Update is elided cleanly.
Both behaviours are correct — the elision is exactly the
no-migration-needed case.

Changes:
- restoreOne: plain Get + Update of the unmodified live object. No
  annotation, no timestamp, no MigrationTimestampAnnotation constant.
- Controller docstring rewritten to describe the actual mechanism
  (encode-then-compare elision) and cite ksvm#65 for posterity.
- HappyPath envtest loosened: per-CR resourceVersion bump assertions are
  dropped (already-clean CRs correctly elide; checking RV bumps on them was
  an artefact of the annotation-induced writes). The storedVersions
  convergence assertion remains as the contract test.
- New cross-version envtest "re-encodes CRs that are stored at a prior
  storage version" exercises the real migration scenario: install CRD with
  v1alpha1 storage, create CR, flip storage to v1beta1, run reconciler,
  assert storedVersions trims and the CR's RV bumped (proving the apiserver
  actually rewrote etcd).
- User docs updated to drop annotation references and describe the real
  mechanism. Reference implementation citation switched from
  cluster-api/crdmigrator to kube-storage-version-migrator (the actual
  upstream SIG tool we share semantics with).
- FallsBackWhenNoStatusSubresource test removed: the SSA path it exercised
  no longer exists. Plain Update on a CRD without /status is just a normal
  Update; covered by the happy-path spec.

8/8 envtest cases pass. Lint clean. No RBAC drift (verbs already correct).

Refs PR #5011 review by @jhrozek.
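The core of the second opinion, as a sketch (illustrative pseudo-Go, not the
actual apiserver source; the helper names are made up):

    // The request body is re-encoded at the CURRENT storage version first;
    // only then is it byte-compared against the existing etcd record.
    encoded := encodeAtStorageVersion(obj) // stamps the current apiVersion
    if bytes.Equal(encoded, existingEtcdValue) {
        return existing, nil // elide: already at the storage version
    }
    return writeAndBumpRV(encoded) // differs: re-encode in etcd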
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../storageversionmigrator_controller.go      |  65 ++----
 .../storageversionmigrator/controller_test.go | 205 ++++++++++++------
 docs/operator/storage-version-migration.md    |  12 +-
 3 files changed, 170 insertions(+), 112 deletions(-)

diff --git a/cmd/thv-operator/controllers/storageversionmigrator_controller.go b/cmd/thv-operator/controllers/storageversionmigrator_controller.go
index 1de0e99f09..10c147c6b3 100644
--- a/cmd/thv-operator/controllers/storageversionmigrator_controller.go
+++ b/cmd/thv-operator/controllers/storageversionmigrator_controller.go
@@ -79,20 +79,20 @@ var errMigrationRetriedDueToConflicts = errors.New(
 // StorageVersionMigratorReconciler reconciles CustomResourceDefinition objects
 // in the toolhive.stacklok.dev group that carry the opt-in
 // AutoMigrateLabel=AutoMigrateValue. For each such CRD it re-stores every CR
-// at the current storage version by doing a Get + Update on the live object
-// while toggling the MigrationTimestampAnnotation to force a real content
-// diff. The annotation toggle is required because the apiserver elides no-op
-// writes (an Update with bytes equal to what's already stored does not bump
-// resourceVersion and does not re-encode) — without a real diff the migration
-// would be a hollow operation. After all CRs have been re-stored it patches
+// at the current storage version by doing a Get + Update on the live object.
+// The Update is a full PUT of the unmodified object; the apiserver re-encodes
+// the request body at the current storage version, then compares the
+// resulting bytes to what's in etcd. When the CR was originally stored at a
+// different version (the actual migration scenario) the bytes carry a
+// different apiVersion stamp than etcd's record, the comparison fails, and
+// the write proceeds — re-encoding the object at the current storage
+// version. When the CR is already at the current storage version, the bytes
+// match and the apiserver harmlessly elides the write — there was nothing to
+// migrate. After all CRs have been processed it patches
 // CRD.status.storedVersions down to [storageVersion] so a future
 // release can drop deprecated versions from spec.versions without orphaning
-// etcd objects.
-//
-// Webhook interaction: the Update goes through the main resource, so
-// validating/mutating admission webhooks on the kind DO see this write.
-// Webhooks that need to ignore migration-only writes should branch on the
-// presence of MigrationTimestampAnnotation in the diff.
+// etcd objects. See https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65
+// for the upstream maintainers' explanation of this mechanism.
 //
 // Enabled by default. Opt out operator-wide via
 // operator.features.storageVersionMigrator (ENABLE_STORAGE_VERSION_MIGRATOR=false)
@@ -323,29 +323,18 @@ func (r *StorageVersionMigratorReconciler) restoreCRs(
 	return kerrors.NewAggregate(errs)
 }
 
-// MigrationTimestampAnnotation is set on each CR by the migrator to force a
-// real content diff on Update. The apiserver's storage layer elides no-op
-// writes (an Update with bytes equal to what's already in etcd does not
-// re-encode and does not bump resourceVersion), so a Get + Update on the live
-// object is not enough on its own — there must be at least one byte of
-// difference. Mutating this annotation guarantees the apiserver re-writes the
-// object, which is exactly the storage-version re-encode we need. The value
-// is the RFC3339Nano timestamp of the most recent successful migration.
-//
-// This matches the upstream kube-storage-version-migrator's approach (which
-// also uses an annotation toggle) and is part of the public contract: do not
-// remove or rename this constant across releases without a migration plan.
-const MigrationTimestampAnnotation = "toolhive.stacklok.dev/storage-version-migrated-at"
-
-// restoreOne issues a Get + Update on the live CR, mutating the migration
-// timestamp annotation to force a real content diff so the apiserver actually
-// re-encodes the object at the current storage version. The Update goes
-// through the main resource (not /status), so any validating/mutating
-// admission webhooks on the kind WILL see this write. CRDs that need to avoid
-// webhook side effects from this controller should configure their webhooks
-// to ignore writes that change only this annotation. Returns the live object
-// after the update so the caller can record its post-update resourceVersion
-// in the cache.
+// restoreOne issues a plain Get + Update on the live CR. The apiserver
+// re-encodes the request body at the current storage version and compares
+// it to etcd's record; when the CR was originally stored at a different
+// apiVersion the bytes differ, the write proceeds, and etcd is re-encoded
+// at the current storage version. When the CR is already at the current
+// storage version the bytes match and the apiserver harmlessly elides the
+// write — there was nothing to migrate. The Update goes through the main
+// resource, so validating/mutating admission webhooks on the kind see this
+// request as part of normal admission flow; only requests that actually
+// persist produce downstream state changes. Returns the live object after
+// the update so the caller can record its post-update resourceVersion in
+// the cache.
 func (r *StorageVersionMigratorReconciler) restoreOne(
 	ctx context.Context,
 	gvk schema.GroupVersionKind,
@@ -357,12 +346,6 @@ func (r *StorageVersionMigratorReconciler) restoreOne(
 		// IsNotFound is propagated to the caller, which handles it.
 		return nil, err
 	}
-	annotations := live.GetAnnotations()
-	if annotations == nil {
-		annotations = map[string]string{}
-	}
-	annotations[MigrationTimestampAnnotation] = time.Now().UTC().Format(time.RFC3339Nano)
-	live.SetAnnotations(annotations)
 	if err := r.Update(ctx, live); err != nil {
 		return nil, err
 	}
 	return live, nil
 }
diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go
index 53ec6a919c..35c318a27d 100644
--- a/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go
+++ b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go
@@ -174,12 +174,15 @@ func createCRs(gvk schema.GroupVersionKind, basename string, count int) []*unstr
 
 // Note on "did the re-store actually fire" verification:
 //
-// The controller now does a Get + Update on the main resource of each CR.
-// An unconditional Update always writes the object back to etcd, so the
-// object's resourceVersion bumps on every successful re-store — that's the
-// observable proof a re-encode happened. The HappyPath test snapshots each
-// CR's resourceVersion before reconcile and asserts it has increased after
-// reconcile completes.
+// The controller does a plain Get + Update on each CR. When the CR is already
+// at the current storage version the apiserver freshly re-encodes the request
+// body, sees it matches etcd byte-for-byte, and elides the write — that's
+// correct behaviour, not a controller bug, and it means the per-CR RV does
+// not bump for an already-clean CR. The dedicated cross-version test
+// ("re-encodes CRs that are stored at a prior storage version") proves the
+// migration mechanism actually works for objects stored at older versions:
+// it stores a CR at v1alpha1, flips storage to v1beta1, and asserts the CR's
+// RV bumps after reconcile.
 //
 // The pagination test additionally verifies the continue-token loop via a
 // list-call counter, and the partial-failure test asserts storedVersions is
@@ -245,7 +248,7 @@ var _ = Describe("StorageVersionMigrator", func() {
 		Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"}))
 	})
 
-	It("migrates storedVersions from [v1alpha1,v1beta1] to [v1beta1] and re-stores all CRs", func() {
+	It("migrates storedVersions from [v1alpha1,v1beta1] to [v1beta1]", func() {
 		suf := uniqueSuffix()
 		spec := crdSpec{
 			Name:     "happies" + suf + "." + toolhiveGroup,
@@ -264,39 +267,147 @@ var _ = Describe("StorageVersionMigrator", func() {
 		installCRD(spec)
 		DeferCleanup(func() { deleteCRD(spec.Name) })
 
-		crs := createCRs(
+		// The CRs created here are already stored at v1beta1 (the
+		// current storage version), so the apiserver will elide each
+		// per-CR Update — its freshly-encoded body matches etcd
+		// byte-for-byte. That's correct apiserver behaviour, not a
+		// controller bug, so this test does NOT assert per-CR
+		// resourceVersion bumps. The cross-version test below proves
+		// the migration mechanism actually re-encodes objects that were
+		// stored at an older apiVersion.
+		createCRs(
 			schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind},
 			"obj-"+suf, 3,
 		)
 
-		// Snapshot pre-reconcile resourceVersions. The controller does
-		// Get + Update on each CR, so a successful re-store must bump RV.
-		// Asserting the bump is the empirical proof the re-encode happened
-		// (an empty SSA would not have bumped RV — that was the bug).
-		preRVs := make(map[string]string, len(crs))
-		for _, cr := range crs {
-			live := &unstructured.Unstructured{}
-			live.SetGroupVersionKind(cr.GroupVersionKind())
-			Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), live)).To(Succeed())
-			preRVs[cr.GetName()] = live.GetResourceVersion()
-		}
-
 		setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"})
 
 		_, err := reconcile(newReconciler(), spec.Name)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"}))
+	})
 
-		// Verify every CR still exists AND its RV bumped, proving the
-		// main-resource Update actually wrote to etcd.
-		for _, cr := range crs {
-			live := &unstructured.Unstructured{}
-			live.SetGroupVersionKind(cr.GroupVersionKind())
-			Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), live)).To(Succeed())
-			Expect(live.GetResourceVersion()).NotTo(Equal(preRVs[cr.GetName()]),
-				"CR %s/%s resourceVersion did not bump — re-store did not write to etcd",
-				cr.GetNamespace(), cr.GetName())
-		}
-	})
+	// Load-bearing proof of the migration mechanism: a CR stored at
+	// v1alpha1, after the storage version has flipped to v1beta1, must
+	// have its resourceVersion bumped by reconcile — that's the
+	// observable evidence the apiserver actually re-encoded the etcd
+	// document.
See the upstream confirmation at + // https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65. + It("re-encodes CRs that are stored at a prior storage version", func() { + suf := uniqueSuffix() + crdName := "crossvers" + suf + "." + toolhiveGroup + kind := "CrossVer" + suf + plural := "crossvers" + suf + + versionSchema := func() *apiextensionsv1.CustomResourceValidation { + return &apiextensionsv1.CustomResourceValidation{ + OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ + Type: "object", + Properties: map[string]apiextensionsv1.JSONSchemaProps{ + "spec": {Type: "object", XPreserveUnknownFields: ptrBool(true)}, + "status": {Type: "object", XPreserveUnknownFields: ptrBool(true)}, + }, + }, + } } + + // Step 1: install CRD with v1alpha1 as the storage version so + // CRs created next are written to etcd as v1alpha1 bytes. + crd := &apiextensionsv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{ + Name: crdName, + Labels: map[string]string{migrateLabel: migrateValue}, + }, + Spec: apiextensionsv1.CustomResourceDefinitionSpec{ + Group: toolhiveGroup, + Names: apiextensionsv1.CustomResourceDefinitionNames{ + Kind: kind, + ListKind: kind + "List", + Plural: plural, + Singular: "crossver" + suf, + }, + Scope: apiextensionsv1.NamespaceScoped, + Versions: []apiextensionsv1.CustomResourceDefinitionVersion{ + {Name: "v1alpha1", Served: true, Storage: true, Schema: versionSchema()}, + {Name: "v1beta1", Served: true, Storage: false, Schema: versionSchema()}, + }, + }, + } + Expect(k8sClient.Create(ctx, crd)).To(Succeed()) + DeferCleanup(func() { deleteCRD(crdName) }) + + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return false + } + for _, c := range live.Status.Conditions { + if c.Type == apiextensionsv1.Established && c.Status == apiextensionsv1.ConditionTrue { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue()) + + // Step 2: create one CR — etcd writes apiVersion: v1alpha1 bytes. + cr := createCRs( + schema.GroupVersionKind{Group: toolhiveGroup, Version: "v1alpha1", Kind: kind}, + "obj-"+suf, 1, + )[0] + + // Step 3: flip storage to v1beta1. + Eventually(func() error { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return err + } + orig := live.DeepCopy() + for i := range live.Spec.Versions { + live.Spec.Versions[i].Storage = (live.Spec.Versions[i].Name == "v1beta1") + } + return k8sClient.Patch(ctx, live, client.MergeFrom(orig)) + }, time.Second*10, time.Millisecond*200).Should(Succeed()) + + // Confirm the storage flip settled before proceeding. + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return false + } + for _, v := range live.Spec.Versions { + if v.Name == "v1beta1" && v.Storage { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue()) + + // Step 4: storedVersions reflects the historical v1alpha1 entry. + setStoredVersions(crdName, []string{"v1alpha1", "v1beta1"}) + + // Step 5: snapshot RV before reconcile. 
+		preLive := &unstructured.Unstructured{}
+		preLive.SetGroupVersionKind(schema.GroupVersionKind{
+			Group: toolhiveGroup, Version: "v1beta1", Kind: kind,
+		})
+		Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), preLive)).To(Succeed())
+		preRV := preLive.GetResourceVersion()
+
+		// Step 6: reconcile.
+		_, err := reconcile(newReconciler(), crdName)
+		Expect(err).NotTo(HaveOccurred())
+
+		// Step 7: storedVersions trimmed.
+		Expect(getStoredVersions(crdName)).To(Equal([]string{"v1beta1"}))
+
+		// Step 8: empirical proof — RV bumped because the cross-version
+		// Update actually wrote etcd.
+		postLive := &unstructured.Unstructured{}
+		postLive.SetGroupVersionKind(preLive.GroupVersionKind())
+		Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), postLive)).To(Succeed())
+		Expect(postLive.GetResourceVersion()).NotTo(Equal(preRV),
+			"CR %s/%s resourceVersion did not bump (pre=%s post=%s) — cross-version re-store did not write to etcd",
+			cr.GetNamespace(), cr.GetName(), preRV, postLive.GetResourceVersion())
+	})
 
 	It("skips CRDs in foreign API groups", func() {
@@ -353,42 +464,6 @@ var _ = Describe("StorageVersionMigrator", func() {
 			"storedVersions must be untouched for unlabelled CRDs")
 	})
 
-	It("falls back to main-resource SSA when the storage version has no /status subresource", func() {
-		suf := uniqueSuffix()
-		spec := crdSpec{
-			Name:              "nostatus" + suf + "." + toolhiveGroup,
-			Group:             toolhiveGroup,
-			Kind:              "NoStatus" + suf,
-			ListKind:          "NoStatus" + suf + "List",
-			Plural:            "nostatus" + suf,
-			Singular:          "nostatus" + suf,
-			Labelled:          true,
-			HasStatusOnStored: false,
-			Versions: []versionSpec{
-				{Name: "v1alpha1", Served: true, Storage: false},
-				{Name: "v1beta1", Served: true, Storage: true},
-			},
-		}
-		installCRD(spec)
-		DeferCleanup(func() { deleteCRD(spec.Name) })
-
-		cr := createCRs(
-			schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind},
-			"obj-"+suf, 1,
-		)[0]
-		setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"})
-
-		_, err := reconcile(newReconciler(), spec.Name)
-		Expect(err).NotTo(HaveOccurred(),
-			"reconcile must succeed via main-resource SSA when no /status subresource exists")
-		Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"}))
-
-		// CR still exists (migrator doesn't delete).
-		live := &unstructured.Unstructured{}
-		live.SetGroupVersionKind(cr.GroupVersionKind())
-		Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), live)).To(Succeed())
-	})
-
 	It("handles pagination across multiple list pages", func() {
 		suf := uniqueSuffix()
 		spec := crdSpec{
diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md
index e3d350dc0d..828ddc1831 100644
--- a/docs/operator/storage-version-migration.md
+++ b/docs/operator/storage-version-migration.md
@@ -14,12 +14,12 @@ For each opted-in CRD:
 
 1. Reads `spec.versions` to find the entry with `storage: true`.
 2. If `status.storedVersions` already equals `[<storage version>]` and only one version is served, nothing to do.
-3. Otherwise, lists every Custom Resource of that kind and, for each one, does a `Get` followed by an `Update` that mutates the `toolhive.stacklok.dev/storage-version-migrated-at` annotation to the current RFC3339Nano timestamp. The annotation toggle is required because the apiserver elides no-op writes — an `Update` whose bytes are equal to what is already in etcd does not bump `resourceVersion` and does not re-encode. With the annotation toggle the diff is real, so the apiserver writes the object back to etcd at the current storage version. This matches the upstream `kube-storage-version-migrator`'s mechanism.
-4. Once every object has been re-stored, patches `CRD.status.storedVersions` to `[<storage version>]` using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite.
+3. Otherwise, lists every Custom Resource of that kind and, for each one, does a plain `Get` followed by an `Update` of the unmodified live object. The apiserver re-encodes the request body at the current storage version and compares it to what is in etcd. When the CR was originally stored at a different `apiVersion` (the actual migration scenario) the bytes carry a different stamp than etcd's record, the comparison fails, and the write proceeds — re-encoding the object at the current storage version. When the CR is already at the current storage version, the bytes match and the apiserver harmlessly elides the write — there was nothing to migrate. This matches the upstream `kube-storage-version-migrator`'s mechanism, confirmed at [kubernetes-sigs/kube-storage-version-migrator#65](https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65).
+4. Once every object has been processed, patches `CRD.status.storedVersions` to `[<storage version>]` using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite.
 
 ### Webhook interaction
 
-The migrator's writes go through the **main resource** Update path, so any validating or mutating admission webhooks registered on the kind will see them. Webhooks that need to ignore migration-only writes should branch on the presence of the `toolhive.stacklok.dev/storage-version-migrated-at` annotation in the diff (e.g. allow the write if that annotation is the only change). An earlier design issued a metadata-only Server-Side Apply against `/status` to bypass webhooks, but the apiserver short-circuits empty SSAs without writing etcd, so that approach was incorrect. See PR #5011 review for the empirical evidence.
+The migrator's writes go through the **main resource** Update path, so any validating or mutating admission webhooks registered on the kind see each request as part of normal admission flow; admission runs before the storage layer's elision check, so webhook invocations do scale with every reconcile pass × every CR. Only the requests the apiserver actually persists (CRs whose etcd bytes carry an older `apiVersion` stamp) produce watch events and downstream state changes, so persisted churn is bounded by "objects actually being migrated".
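+
+For illustration, each per-CR re-store condenses to a read followed by a write of the unmodified object — a sketch of what the controller's `restoreOne` does, assuming a controller-runtime client (`c`, `gvk`, and `key` are placeholders):
+
+```go
+import (
+	"context"
+
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+func restoreOne(ctx context.Context, c client.Client, gvk schema.GroupVersionKind, key client.ObjectKey) error {
+	live := &unstructured.Unstructured{}
+	live.SetGroupVersionKind(gvk)
+	if err := c.Get(ctx, key, live); err != nil {
+		return err
+	}
+	// No mutation here: the apiserver re-encodes the body at the current
+	// storage version and elides the write only when the resulting bytes
+	// already match etcd.
+	return c.Update(ctx, live)
+}
+```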
## The opt-in label @@ -106,11 +106,11 @@ The controller requires (generated from kubebuilder markers, applied by the oper - `customresourcedefinitions.apiextensions.k8s.io`: `get`, `list`, `watch` - `customresourcedefinitions/status.apiextensions.k8s.io`: `update`, `patch` -- `*.toolhive.stacklok.dev`: `get`, `list`, `patch` -- `*/status.toolhive.stacklok.dev`: `patch` +- `*.toolhive.stacklok.dev`: `get`, `list`, `update` +- `*/status.toolhive.stacklok.dev`: `update` ## Related - Issue: [stacklok/toolhive#4969](https://github.com/stacklok/toolhive/issues/4969) - Kubernetes CRD versioning: [official docs](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definition-versioning/) -- Reference implementation: [kubernetes-sigs/cluster-api `crdmigrator`](https://github.com/kubernetes-sigs/cluster-api/tree/main/controllers/crdmigrator) +- Reference implementation: [kubernetes-sigs/kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator) (the upstream SIG API Machinery tool — our controller uses the same Get+Update mechanism, scoped to ToolHive CRDs and triggered automatically rather than via per-migration `StorageVersionMigration` resources)