diff --git a/api/v1alpha1/pattern_types.go b/api/v1alpha1/pattern_types.go index ac36d0f46..b93e5859b 100644 --- a/api/v1alpha1/pattern_types.go +++ b/api/v1alpha1/pattern_types.go @@ -206,6 +206,10 @@ type PatternStatus struct { AnalyticsUUID string `json:"analyticsUUID,omitempty"` // +operator-sdk:csv:customresourcedefinitions:type=status LocalCheckoutPath string `json:"path,omitempty"` + // +operator-sdk:csv:customresourcedefinitions:type=status + // DeletionPhase tracks the current phase of pattern deletion + // Values: "" (not deleting), "DeletingSpokeApps" (phase 1: delete apps from spoke), "DeletingHubApps" (phase 2: delete apps from hub) + DeletionPhase PatternDeletionPhase `json:"deletionPhase,omitempty"` } // See: https://book.kubebuilder.io/reference/markers/crd.html @@ -262,6 +266,14 @@ const ( Suspended PatternConditionType = "Suspended" ) +type PatternDeletionPhase string + +const ( + InitializeDeletion PatternDeletionPhase = "" + DeletingSpokeApps PatternDeletionPhase = "DeletingSpokeApps" + DeletingHubApps PatternDeletionPhase = "DeletingHubApps" +) + func init() { SchemeBuilder.Register(&Pattern{}, &PatternList{}) } diff --git a/config/crd/bases/gitops.hybrid-cloud-patterns.io_patterns.yaml b/config/crd/bases/gitops.hybrid-cloud-patterns.io_patterns.yaml index 2af229e86..ed03e8cbb 100644 --- a/config/crd/bases/gitops.hybrid-cloud-patterns.io_patterns.yaml +++ b/config/crd/bases/gitops.hybrid-cloud-patterns.io_patterns.yaml @@ -224,6 +224,11 @@ spec: - type type: object type: array + deletionPhase: + description: |- + DeletionPhase tracks the current phase of pattern deletion + Values: "" (not deleting), "DeletingSpokeApps" (phase 1: delete apps from spoke), "DeletingHubApps" (phase 2: delete apps from hub) + type: string lastError: description: Last error encountered by the pattern type: string diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 352a6c967..f135bb68f 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ 
-44,6 +44,13 @@ rules: - list - patch - update +- apiGroups: + - cluster.open-cluster-management.io + resources: + - managedclusters + verbs: + - delete + - list - apiGroups: - config.openshift.io resources: @@ -104,6 +111,12 @@ rules: - list - patch - update +- apiGroups: + - view.open-cluster-management.io + resources: + - managedclusterviews + verbs: + - create --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role diff --git a/internal/controller/acm.go b/internal/controller/acm.go index 006d92592..f03243a34 100644 --- a/internal/controller/acm.go +++ b/internal/controller/acm.go @@ -22,6 +22,7 @@ import ( "fmt" "log" + kerrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" ) @@ -64,3 +65,69 @@ func haveACMHub(r *PatternReconciler) bool { } return true } + +// listManagedClusters lists all ManagedCluster resources (excluding local-cluster) +// Returns a list of cluster names and an error +func (r *PatternReconciler) listManagedClusters(ctx context.Context) ([]string, error) { + gvrMC := schema.GroupVersionResource{ + Group: "cluster.open-cluster-management.io", + Version: "v1", + Resource: "managedclusters", + } + + // ManagedCluster is a cluster-scoped resource, so no namespace needed + mcList, err := r.dynamicClient.Resource(gvrMC).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list ManagedClusters: %w", err) + } + + var clusterNames []string + for _, item := range mcList.Items { + name := item.GetName() + // Exclude local-cluster (hub cluster) + if name != "local-cluster" { + clusterNames = append(clusterNames, name) + } + } + + return clusterNames, nil +} + +// deleteManagedClusters deletes all ManagedCluster resources (excluding local-cluster) +// Returns the number of clusters deleted and an error +func (r *PatternReconciler) deleteManagedClusters(ctx context.Context) (int, error) { + gvrMC := schema.GroupVersionResource{ + Group: 
"cluster.open-cluster-management.io", + Version: "v1", + Resource: "managedclusters", + } + + // ManagedCluster is a cluster-scoped resource, so no namespace needed + mcList, err := r.dynamicClient.Resource(gvrMC).List(ctx, metav1.ListOptions{}) + if err != nil { + return 0, fmt.Errorf("failed to list ManagedClusters: %w", err) + } + + deletedCount := 0 + for _, item := range mcList.Items { + name := item.GetName() + // Exclude local-cluster (hub cluster) + if name == "local-cluster" { + continue + } + + // Delete the managed cluster + err := r.dynamicClient.Resource(gvrMC).Delete(ctx, name, metav1.DeleteOptions{}) + if err != nil { + // If already deleted, that's fine + if kerrors.IsNotFound(err) { + continue + } + return deletedCount, fmt.Errorf("failed to delete ManagedCluster %q: %w", name, err) + } + log.Printf("Deleted ManagedCluster: %q", name) + deletedCount++ + } + + return deletedCount, nil +} diff --git a/internal/controller/argo.go b/internal/controller/argo.go index a25fa0e20..627773e6a 100644 --- a/internal/controller/argo.go +++ b/internal/controller/argo.go @@ -21,6 +21,7 @@ import ( "fmt" "log" "os" + "slices" "strconv" "strings" @@ -425,9 +426,16 @@ func newApplicationParameters(p *api.Pattern) []argoapi.HelmParameter { } } if !p.DeletionTimestamp.IsZero() { + // Determine deletePattern value based on deletion phase + // Phase 1 (deletingSpokeApps): deletePattern = "2" (delete apps from spoke) + // Phase 2 (deletingHubApps): deletePattern = "1" (delete apps from hub) + deletePatternValue := "2" // default to spoke deletion + if p.Status.DeletionPhase == api.DeletingHubApps { + deletePatternValue = "1" + } parameters = append(parameters, argoapi.HelmParameter{ Name: "global.deletePattern", - Value: "1", + Value: deletePatternValue, ForceString: true, }) } @@ -961,3 +969,39 @@ func updateHelmParameter(goal api.PatternParameter, actual []argoapi.HelmParamet } return false } + +// syncApplicationWithPrune syncs the application with prune and force 
options if such a sync is not already in progress. +// Returns true when such a sync is already in progress or has just been requested by this call; false is only returned together with a non-nil error +func syncApplicationWithPrune(client argoclient.Interface, app *argoapi.Application) (bool, error) { + if app.Operation != nil && app.Operation.Sync != nil && app.Operation.Sync.Prune && slices.Contains(app.Operation.Sync.SyncOptions, "Force=true") { + return true, nil + } + + app.Operation = &argoapi.Operation{ + Sync: &argoapi.SyncOperation{ + Prune: true, + SyncOptions: []string{"Force=true"}, + }, + } + + _, err := client.ArgoprojV1alpha1().Applications(app.Namespace).Update(context.Background(), app, metav1.UpdateOptions{}) + if err != nil { + return false, fmt.Errorf("failed to sync application %q with prune: %w", app.Name, err) + } + + return true, nil +} + +// getChildApplications lists, across all namespaces, the applications labelled app.kubernetes.io/instance=<parentApp.Name>, i.e. the child applications owned by the app-of-apps parentApp +func getChildApplications(client argoclient.Interface, parentApp *argoapi.Application) ([]argoapi.Application, error) { + listOptions := metav1.ListOptions{ + LabelSelector: fmt.Sprintf("app.kubernetes.io/instance=%s", parentApp.Name), + } + + appList, err := client.ArgoprojV1alpha1().Applications("").List(context.Background(), listOptions) + if err != nil { + return nil, fmt.Errorf("failed to list child applications of %s: %w", parentApp.Name, err) + } + + return appList.Items, nil +} diff --git a/internal/controller/pattern_controller.go b/internal/controller/pattern_controller.go index a7230d8b8..d98967328 100644 --- a/internal/controller/pattern_controller.go +++ b/internal/controller/pattern_controller.go @@ -17,9 +17,14 @@ limitations under the License. 
package controllers import ( + "bytes" "context" + "crypto/tls" + "encoding/json" "fmt" + "io" "log" + "net/http" "os" "path/filepath" "strings" @@ -88,6 +93,8 @@ type PatternReconciler struct { //+kubebuilder:rbac:groups=operator.openshift.io,resources="openshiftcontrollermanagers",resources=openshiftcontrollermanagers,verbs=get;list //+kubebuilder:rbac:groups="",resources=secrets,verbs=get;create;update;watch //+kubebuilder:rbac:groups="route.openshift.io",namespace=vp-gitea,resources=routes;routes/custom-host,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups="view.open-cluster-management.io",resources=managedclusterviews,verbs=create +//+kubebuilder:rbac:groups="cluster.open-cluster-management.io",resources=managedclusters,verbs=list;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -162,7 +169,14 @@ func (r *PatternReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // -- GitOps Subscription targetSub, _ := newSubscriptionFromConfigMap(r.fullClient) - _ = controllerutil.SetOwnerReference(qualifiedInstance, targetSub, r.Scheme) + operatorConfigMap, err := GetOperatorConfigmap() + if err == nil { + if err := controllerutil.SetOwnerReference(operatorConfigMap, targetSub, r.Scheme); err != nil { + return r.actionPerformed(qualifiedInstance, "error setting owner of gitops subscription", err) + } + } else { + return r.actionPerformed(qualifiedInstance, "error getting operator configmap", err) + } sub, _ := getSubscription(r.olmClient, targetSub.Name) if sub == nil { @@ -176,7 +190,20 @@ func (r *PatternReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return r.actionPerformed(qualifiedInstance, "update gitops subscription", errSub) } } else { - logOnce("The gitops subscription is not owned by us, leaving untouched") + // Historically the subscription was owned by the pattern, not the operator. 
If this is the case, + // we update the owner reference to the operator itself. + if err := controllerutil.RemoveOwnerReference(qualifiedInstance, sub, r.Scheme); err == nil { + if err := controllerutil.SetOwnerReference(operatorConfigMap, sub, r.Scheme); err != nil { + return r.actionPerformed(qualifiedInstance, "error setting patterns operator owner reference of gitops subscription", err) + } + // Persist the updated ownerReferences on the Subscription + if _, err := r.olmClient.OperatorsV1alpha1().Subscriptions(SubscriptionNamespace).Update(context.Background(), sub, metav1.UpdateOptions{}); err != nil { + return r.actionPerformed(qualifiedInstance, "error updating gitops subscription owner references", err) + } + return r.actionPerformed(qualifiedInstance, "updated patterns operator owner reference of gitops subscription", nil) + } else { + logOnce("The gitops subscription is not owned by us, leaving untouched") + } } logOnce("subscription found") @@ -496,6 +523,102 @@ func (r *PatternReconciler) applyDefaults(input *api.Pattern) (*api.Pattern, err return output, nil } +func (r *PatternReconciler) updateDeletionPhase(instance *api.Pattern, phase api.PatternDeletionPhase) error { + log.Printf("Updating deletion phase to '%s'", phase) + instance.Status.DeletionPhase = phase + if err := r.Client.Status().Update(context.TODO(), instance); err != nil { + return fmt.Errorf("failed to update deletion phase: %w", err) + } + + // Re-fetch to get updated status + if err := r.Get(context.TODO(), client.ObjectKeyFromObject(instance), instance); err != nil { + return fmt.Errorf("failed to re-fetch pattern after phase update: %w", err) + } + + return nil +} + +func (r *PatternReconciler) deleteSpokeApps(instance *api.Pattern, targetApp, app *argoapi.Application, namespace string) error { + log.Printf("Deletion phase: %s - checking if all child applications are gone from spoke", api.DeletingSpokeApps) + + // Update application with deletePattern=2 to trigger spoke deletion + 
if changed, _ := updateApplication(r.argoClient, targetApp, app, namespace); changed { + return fmt.Errorf("updated application %q for spoke deletion", app.Name) + } + if app.Status.Sync.Status == argoapi.SyncStatusCodeOutOfSync { + inProgress, err := syncApplicationWithPrune(r.argoClient, app) + if err != nil { + return err + } + if inProgress { + return fmt.Errorf("sync with prune and force is already in progress for application %q", app.Name) + } + } + + childApps, err := getChildApplications(r.argoClient, app) + if err != nil { + return err + } else { + for _, childApp := range childApps { + if _, err := syncApplicationWithPrune(r.argoClient, &childApp); err != nil { + return err + } + } + } + + // Check if all child applications are gone from spoke + allGone, err := r.checkSpokeChildApplicationsGone(instance) + if err != nil { + return fmt.Errorf("error checking child applications: %w", err) + } + + if !allGone { + log.Printf("Waiting for all child applications to be deleted from spoke clusters") + return fmt.Errorf("waiting for child applications to be deleted from spoke clusters") + } + + return nil +} + +func (r *PatternReconciler) deleteHubApps(targetApp, app *argoapi.Application, namespace string) error { + log.Printf("Deletion phase: %s - deleting from hub", api.DeletingHubApps) + + // Delete managed clusters (excluding local-cluster) + // These must be removed before hub deletion can proceed because ACM won't delete properly if they exist + if haveACMHub(r) { + deletedCount, err := r.deleteManagedClusters(context.TODO()) + if err != nil { + return fmt.Errorf("failed to delete managed clusters: %w", err) + } + + if deletedCount > 0 { + log.Printf("Deleted %d managed cluster(s), waiting for them to be fully removed", deletedCount) + return fmt.Errorf("deleted %d managed cluster(s), waiting for removal to complete before proceeding with hub deletion", deletedCount) + } + + // Update application with deletePattern=1 to trigger hub deletion + if changed, _ 
:= updateApplication(r.argoClient, targetApp, app, namespace); changed { + return fmt.Errorf("updated application %q for hub deletion", app.Name) + } + + inProgress, err := syncApplicationWithPrune(r.argoClient, app) + if err != nil { + return err + } + if inProgress { + return fmt.Errorf("sync with prune and force is already in progress for application %q", app.Name) + } + + return fmt.Errorf("waiting for removal of that acm hub") + } + + log.Printf("Removing the application, and cascading to anything instantiated by ArgoCD") + if err := removeApplication(r.argoClient, app.Name, namespace); err != nil { + return err + } + return fmt.Errorf("waiting for application %q to be removed", app.Name) +} + func (r *PatternReconciler) finalizeObject(instance *api.Pattern) error { // Add finalizer when object is created log.Printf("Finalizing pattern object") @@ -524,23 +647,38 @@ func (r *PatternReconciler) finalizeObject(instance *api.Pattern) error { return nil } - if changed, _ := updateApplication(r.argoClient, targetApp, app, ns); changed { - return fmt.Errorf("updated application %q for removal", app.Name) + // Initialize deletion phase if not set + if qualifiedInstance.Status.DeletionPhase == api.InitializeDeletion { + log.Printf("Initializing deletion phase") + if haveACMHub(r) { + if err := r.updateDeletionPhase(qualifiedInstance, api.DeletingSpokeApps); err != nil { + return err + } + } else { + if err := r.updateDeletionPhase(qualifiedInstance, api.DeletingHubApps); err != nil { + return err + } + } } - if haveACMHub(r) { - return fmt.Errorf("waiting for removal of that acm hub") - } + // Phase 1: Delete applications from spoke clusters + if qualifiedInstance.Status.DeletionPhase == api.DeletingSpokeApps { + if err := r.deleteSpokeApps(qualifiedInstance, targetApp, app, ns); err != nil { + return err + } - if app.Status.Sync.Status == argoapi.SyncStatusCodeOutOfSync { - return fmt.Errorf("application %q is still %s", app.Name, argoapi.SyncStatusCodeOutOfSync) + 
log.Printf("All child applications are gone, transitioning to %s phase", api.DeletingHubApps) + if err := r.updateDeletionPhase(qualifiedInstance, api.DeletingHubApps); err != nil { + return err + } } - log.Printf("Removing the application, and cascading to anything instantiated by ArgoCD") - if err := removeApplication(r.argoClient, app.Name, ns); err != nil { - return err + // Phase 2: Delete applications from hub + if qualifiedInstance.Status.DeletionPhase == api.DeletingHubApps { + if err := r.deleteHubApps(targetApp, app, ns); err != nil { + return err + } } - return fmt.Errorf("waiting for application %q to be removed", app.Name) } return nil @@ -604,7 +742,8 @@ func (r *PatternReconciler) onReconcileErrorWithRequeue(p *api.Pattern, reason s } if duration != nil { log.Printf("Requeueing\n") - return reconcile.Result{RequeueAfter: *duration}, err + // Return nil error when we have a duration to avoid exponential backoff + return reconcile.Result{RequeueAfter: *duration}, nil } return reconcile.Result{}, err } @@ -698,6 +837,141 @@ func (r *PatternReconciler) updatePatternCRDetails(input *api.Pattern) (bool, er return false, nil } +// checkSpokeChildApplicationsGone checks if all child applications (excluding the app-of-apps) are gone from spoke clusters +// The operator runs on the hub cluster and needs to check spoke clusters through ACM Search Service +// Returns true if all child applications are gone, false otherwise +func (r *PatternReconciler) checkSpokeChildApplicationsGone(p *api.Pattern) (bool, error) { + + // Running locally: use localhost with env var set to "https://localhost:4010/searchapi/graphql" and port-forward + // User should run: kubectl port-forward -n open-cluster-management svc/search-search-api 4010:4010 + searchURL := os.Getenv("ACM_SEARCH_API_URL") + if searchURL == "" { + searchNamespace := "open-cluster-management" // Default namespace for ACM + searchURL = 
fmt.Sprintf("https://search-search-api.%s.svc.cluster.local:4010/searchapi/graphql", searchNamespace) + } + + token := os.Getenv("ACM_SEARCH_API_TOKEN") + if token == "" { + var tokenBytes []byte + var err error + + tokenPath := "/run/secrets/kubernetes.io/serviceaccount/token" + + if tokenBytes, err = os.ReadFile(tokenPath); err != nil { + return false, fmt.Errorf("failed to read serviceaccount token: %w", err) + } + token = string(tokenBytes) + } + + // Build GraphQL query to search for Applications + // Filter out local-cluster apps and app of apps (based on namespace) + query := map[string]any{ + "operationName": "searchResult", + "query": "query searchResult($input: [SearchInput]) { searchResult: search(input: $input) { items related { kind items } } }", + "variables": map[string]any{ + "input": []map[string]any{ + { + "filters": []map[string]any{ + { + "property": "apigroup", + "values": []string{"argoproj.io"}, + }, + { + "property": "kind", + "values": []string{"Application"}, + }, + { + "property": "cluster", + "values": []string{"!local-cluster"}, + }, + { + "property": "namespace", + "values": []string{fmt.Sprintf("!%s", getClusterWideArgoNamespace())}, + }, + }, + "relatedKinds": []string{"Application"}, + "limit": 20000, + }, + }, + }, + } + + // Marshal query to JSON + queryJSON, err := json.Marshal(query) + if err != nil { + return false, fmt.Errorf("failed to marshal GraphQL query: %w", err) + } + + // Create HTTP request + req, err := http.NewRequest("POST", searchURL, bytes.NewBuffer(queryJSON)) + if err != nil { + return false, fmt.Errorf("failed to create HTTP request: %w", err) + } + + // Set headers + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + // Create HTTP client + // Use insecure TLS (self-signed certs) + client := &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, 
+ }, + }, + } + + // Make the request + resp, err := client.Do(req) + if err != nil { + return false, fmt.Errorf("failed to make HTTP request to search service: %w", err) + } + defer resp.Body.Close() + + // Read response body + body, err := io.ReadAll(resp.Body) + if err != nil { + return false, fmt.Errorf("failed to read response body: %w", err) + } + + // Check HTTP status + if resp.StatusCode != http.StatusOK { + return false, fmt.Errorf("search service returned status %d: %s", resp.StatusCode, string(body)) + } + + // Parse JSON response; only the name, namespace and cluster of each returned item are decoded + type SearchAPIResponse struct { + Data struct { + SearchResult []struct { + Items []struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + Cluster string `json:"cluster"` + } `json:"items"` + } `json:"searchResult"` + } `json:"data"` + } + var searchResponse SearchAPIResponse + if err := json.Unmarshal(body, &searchResponse); err != nil { + return false, fmt.Errorf("failed to parse JSON response: %w", err) + } + + var remoteAppNames []string + if searchResult := searchResponse.Data.SearchResult; len(searchResult) > 0 { + for _, item := range searchResult[0].Items { + remoteAppNames = append(remoteAppNames, fmt.Sprintf("%s/%s in %s", item.Namespace, item.Name, item.Cluster)) + } + } + + if len(remoteAppNames) != 0 { + return false, fmt.Errorf("spoke cluster apps still exist: %s", remoteAppNames) + } + + return true, nil +} + func (r *PatternReconciler) authGitFromSecret(namespace, secret string) (map[string][]byte, error) { tokenSecret, err := r.fullClient.CoreV1().Secrets(namespace).Get(context.TODO(), secret, metav1.GetOptions{}) if err != nil { diff --git a/internal/controller/utils.go b/internal/controller/utils.go index 7ad533a10..4b98e8565 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -40,6 +40,7 @@ import ( kerrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" + ctrl 
"sigs.k8s.io/controller-runtime" configv1 "github.com/openshift/api/config/v1" ) @@ -404,3 +405,17 @@ func IsCommonSlimmed(patternPath string) bool { } return true } + +// GetOperatorConfigmap gets the configmap for the Patterns Operator (used as an owner reference for the operator itself). It builds a new clientset from ctrl.GetConfig() on every call; errors are wrapped with %w so callers can inspect the cause. +func GetOperatorConfigmap() (*corev1.ConfigMap, error) { + config, err := ctrl.GetConfig() + if err != nil { + return nil, fmt.Errorf("failed to get config: %w", err) + } + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to call NewForConfig: %w", err) + } + + return clientset.CoreV1().ConfigMaps(OperatorNamespace).Get(context.Background(), OperatorConfigMap, metav1.GetOptions{}) +}