Skip to content

Commit 596fce3

Browse files
authored
feat: vmdi missing image self heal (#63)
* feat: added two new fields to cr and generated boilerplate * feat: add checks to see if we have a missing artifact in provisioner * refactor: rename fields to be more descriptive * feat: add new values to config to handle retrying totally failed syncs * feat: wired up * refactor: moved to time based failure * refactor: let library check on sync * refactor: remove other unneeded fields * refactor: formatting * chore: generate manifests again * fix: messy names and duplicate env var targeting * feat: added more condition reasons * feat: added new phase for visibility into retry loop * feat: handling new phase in controller and service code * chore: regenerate chart to include new phase * fix: wait 10 seconds before checking sync status again * chore: regenerate resources after adding LastFailureTime to spec * fix: resolve finalizer conflict and now respecting backoff behavior * chore: regenerate the chart * fix: pr feedback * fix: resolve nit * fix: consistent names * fix: better boolean name * fix: 10 as concurrency limit on sync. Average number of labs per sync is ~3 so lets us do 4 labs
1 parent 7360299 commit 596fce3

9 files changed

Lines changed: 163 additions & 82 deletions

File tree

api/v1alpha1/vmdiskimage_types.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,24 @@ const (
3434

3535
// Condition Reasons
3636
const (
37-
ReasonResourceCreationFailed string = "ResourceCreationFailed"
38-
ReasonResouceUpdateFailed string = "ResourceUpdateFailed"
39-
ReasonQueued string = "Queued"
40-
ReasonSyncing string = "Syncing"
41-
ReasonRetryLimitExceeded string = "RetryLimitExceeded"
42-
ReasonSynced string = "Synced"
37+
ReasonResourceCreationFailed string = "ResourceCreationFailed"
38+
ReasonResouceUpdateFailed string = "ResourceUpdateFailed"
39+
ReasonQueued string = "Queued"
40+
ReasonSyncing string = "Syncing"
41+
ReasonRetryLimitExceeded string = "RetryLimitExceeded"
42+
ReasonMissingSourceArtifact string = "MissingSourceArtifact"
43+
ReasonSyncAttemptDurationExceeded string = "SyncAttemptDurationExceeded"
44+
ReasonUnknownSyncFailure string = "UnknownSyncFailure"
45+
ReasonSynced string = "Synced"
4346
)
4447

4548
// CRD phases
4649
const (
47-
PhaseQueued string = "Queued"
48-
PhaseSyncing string = "Syncing"
49-
PhaseReady string = "Ready"
50-
PhaseFailed string = "Failed"
50+
PhaseQueued string = "Queued"
51+
PhaseSyncing string = "Syncing"
52+
PhaseReady string = "Ready"
53+
PhaseRetryableFailure string = "RetryableFailure"
54+
PhaseFailed string = "Failed"
5155
)
5256

5357
// VMDiskImage Labels
@@ -94,7 +98,7 @@ type VMDiskImageStatus struct {
9498
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
9599
// Important: Run "make" to regenerate code after modifying this file
96100

97-
// +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed
101+
// +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed;RetryableFailure
98102
Phase string `json:"phase"`
99103

100104
// A human-readable message providing more details about the current phase.
@@ -103,7 +107,8 @@ type VMDiskImageStatus struct {
103107
// Conditions of the VMDiskImage resource.
104108
Conditions []metav1.Condition `json:"conditions,omitempty"`
105109

106-
FailureCount int `json:"failureCount,omitempty"`
110+
FailureCount int `json:"failureCount,omitempty"`
111+
LastFailureTime *metav1.Time `json:"lastFailureTime,omitempty"`
107112
}
108113

109114
// +kubebuilder:object:root=true

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ spec:
137137
type: array
138138
failureCount:
139139
type: integer
140+
lastFailureTime:
141+
format: date-time
142+
type: string
140143
message:
141144
description: A human-readable message providing more details about the current phase.
142145
type: string
@@ -146,6 +149,7 @@ spec:
146149
- Syncing
147150
- Ready
148151
- Failed
152+
- RetryableFailure
149153
type: string
150154
required:
151155
- phase

config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ spec:
140140
type: array
141141
failureCount:
142142
type: integer
143+
lastFailureTime:
144+
format: date-time
145+
type: string
143146
message:
144147
description: A human-readable message providing more details about
145148
the current phase.
@@ -150,6 +153,7 @@ spec:
150153
- Syncing
151154
- Ready
152155
- Failed
156+
- RetryableFailure
153157
type: string
154158
required:
155159
- phase

dist/install.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ spec:
148148
type: array
149149
failureCount:
150150
type: integer
151+
lastFailureTime:
152+
format: date-time
153+
type: string
151154
message:
152155
description: A human-readable message providing more details about
153156
the current phase.
@@ -158,6 +161,7 @@ spec:
158161
- Syncing
159162
- Ready
160163
- Failed
164+
- RetryableFailure
161165
type: string
162166
required:
163167
- phase

internal/vm-disk-image/config/vmdi-controller-config-reader.go

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,38 +6,44 @@ import (
66
)
77

88
const (
9-
defaultConcurrency = 10 // TODO: We will need to tune this default
10-
defaultRetryLimit = 2
11-
defaultBackoffDuration = 10 * time.Second
12-
defaultMaxSyncDuration = 1 * time.Hour
9+
defaultConcurrency = 10
10+
defaultMaxBackoffDelay = 1 * time.Hour
11+
defaultMaxSyncDuration = 12 * time.Hour
12+
defaultMaxSyncAttemptRetries = 3
13+
defaultMaxSyncAttemptDuration = 1 * time.Hour
1314
)
1415

1516
type VMDiskImageControllerConfig struct {
16-
Concurrency int
17-
RetryLimit int
18-
RetryBackoffDuration time.Duration
19-
MaxSyncDuration time.Duration
17+
Concurrency int
18+
MaxBackoffDelay time.Duration
19+
MaxSyncDuration time.Duration
20+
MaxSyncAttemptDuration time.Duration
21+
MaxSyncAttemptRetries int
2022
}
2123

2224
// This function will allow us to get the required config variables from the environment.
2325
// Locally this is your "env" and in production these values will come from a configmap
2426
func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
2527
// The max amount of VMDIs we can have syncing at one time.
26-
concurrency := corecfg.GetIntEnvOrDefault("CONCURRENCY", defaultConcurrency)
28+
concurrency := corecfg.GetIntEnvOrDefault("MAX_VMDI_SYNC_CONCURRENCY", defaultConcurrency)
2729

28-
// How many times we will retry a failed sync.
29-
retryLimit := corecfg.GetIntEnvOrDefault("RETRY_LIMIT", defaultRetryLimit)
30+
// The longest we will ever wait to retry.
31+
maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay)
3032

31-
// How long we want to wait before trying to resync a failed VMDI.
32-
retryBackoffDuration := corecfg.GetDurationEnvOrDefault("RETRY_BACKOFF_DURATION", defaultBackoffDuration)
33+
// How long we will try to run a sync before we fail it forever.
34+
maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration)
3335

3436
// How long we will let a VMDI sit in syncing status.
35-
maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration)
37+
maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration)
38+
39+
// How many times we will retry on a given attempt.
40+
maxSyncAttemptRetries := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries)
3641

3742
return VMDiskImageControllerConfig{
38-
Concurrency: concurrency,
39-
RetryLimit: retryLimit,
40-
RetryBackoffDuration: retryBackoffDuration,
41-
MaxSyncDuration: maxSyncDuration,
43+
Concurrency: concurrency,
44+
MaxBackoffDelay: maxBackoffDelay,
45+
MaxSyncAttemptDuration: maxAttemptDuration,
46+
MaxSyncAttemptRetries: maxSyncAttemptRetries,
47+
MaxSyncDuration: maxSyncDuration,
4248
}
4349
}

internal/vm-disk-image/controller/controller.go

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,9 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
7575
return r.VMDiskImageOrchestrator.DeleteResource(ctx, &VMDiskImage)
7676
}
7777

78-
resourceHasFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer)
79-
if resourceHasFinalizer {
80-
err := r.AddControllerFinalizer(ctx, &VMDiskImage)
81-
if err != nil {
82-
return r.HandleResourceUpdateError(ctx, &VMDiskImage, err, "Failed to add finalizer to our resource")
83-
}
84-
78+
resourceMissingFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer)
79+
if resourceMissingFinalizer {
80+
return r.AddControllerFinalizer(ctx, &VMDiskImage)
8581
}
8682

8783
currentPhase := VMDiskImage.Status.Phase
@@ -93,6 +89,8 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
9389
return r.AttemptSyncingOfResource(ctx, &VMDiskImage)
9490
case crdv1.PhaseSyncing:
9591
return r.TransitonFromSyncing(ctx, &VMDiskImage)
92+
case crdv1.PhaseRetryableFailure:
93+
return r.AttemptRetry(ctx, &VMDiskImage)
9694
case crdv1.PhaseReady, crdv1.PhaseFailed:
9795
return ctrl.Result{}, nil
9896
default:
@@ -112,18 +110,18 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {
112110

113111
resourceGenerator := &vmdi.Generator{}
114112
vmdiProvisioner := vmdi.K8sVMDIProvisioner{
115-
Client: client,
116-
ResourceGenerator: resourceGenerator,
117-
MaxSyncDuration: config.MaxSyncDuration,
118-
RetryLimit: config.RetryLimit,
113+
Client: client,
114+
ResourceGenerator: resourceGenerator,
115+
MaxSyncAttemptDuration: config.MaxSyncAttemptDuration,
116+
MaxSyncAttemptRetries: config.MaxSyncAttemptRetries,
119117
}
120118
orchestrator := vmdi.Orchestrator{
121-
Client: client,
122-
Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
123-
Provisioner: vmdiProvisioner,
124-
RetryLimit: config.RetryLimit,
125-
RetryBackoff: config.RetryBackoffDuration,
126-
SyncLimit: config.Concurrency,
119+
Client: client,
120+
Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
121+
Provisioner: vmdiProvisioner,
122+
MaxRetryBackoff: config.MaxBackoffDelay,
123+
MaxSyncTime: config.MaxSyncDuration,
124+
ConcurrentSyncLimit: config.Concurrency,
127125
}
128126
reconciler := &VMDiskImageReconciler{
129127
Scheme: mgr.GetScheme(),

0 commit comments

Comments
 (0)