Skip to content

Commit e7a5d34

Browse files
fix: improve upgrade e2e test robustness for controller-runtime v0.23.1
Controller-runtime v0.23.1+ has faster leader election startup, causing timing issues with the upgrade e2e tests that watch pod logs for leader election messages. The log-watching approach fails because it uses Follow=true, which only streams NEW logs, missing messages emitted before the test starts watching. With the faster startup in v0.23.1+, leader election often completes before the test begins watching.

Changes:
- Replace log watching with direct lease object checking
- Poll lease.Spec.HolderIdentity to confirm leader election occurred
- More reliable than log watching, independent of timing
- Add k8s.io/utils/ptr import for log options

This approach is timing-independent and directly verifies the desired state rather than relying on log message observation.

Assisted-by: Cursor/Claude
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent ab4a502 commit e7a5d34

1 file changed

Lines changed: 58 additions & 38 deletions

File tree

test/upgrade-e2e/post_upgrade_test.go

Lines changed: 58 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1818
"k8s.io/apimachinery/pkg/labels"
1919
"k8s.io/apimachinery/pkg/types"
20+
"k8s.io/utils/ptr"
2021
"sigs.k8s.io/controller-runtime/pkg/client"
2122

2223
ocv1 "github.com/operator-framework/operator-controller/api/v1"
@@ -56,13 +57,16 @@ func TestClusterCatalogUnpacking(t *testing.T) {
5657
managerPod = managerPods.Items[0]
5758
}, time.Minute, time.Second)
5859

59-
t.Log("Waiting for acquired leader election")
60-
leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
61-
defer leaderCancel()
62-
leaderSubstrings := []string{"successfully acquired lease"}
63-
leaderElected, err := watchPodLogsForSubstring(leaderCtx, &managerPod, leaderSubstrings...)
64-
require.NoError(t, err)
65-
require.True(t, leaderElected)
60+
t.Logf("Waiting for acquired leader election by checking lease")
61+
// Instead of watching logs (which has timing issues in controller-runtime v0.23.1+),
62+
// directly check the lease object to confirm leader election occurred
63+
require.EventuallyWithT(t, func(ct *assert.CollectT) {
64+
var lease coordinationv1.Lease
65+
err := c.Get(ctx, types.NamespacedName{Name: "catalogd-operator-lock", Namespace: "olmv1-system"}, &lease)
66+
require.NoError(ct, err)
67+
require.NotNil(ct, lease.Spec.HolderIdentity, "lease should have a holder")
68+
require.NotEmpty(ct, *lease.Spec.HolderIdentity, "lease holder identity should not be empty")
69+
}, 3*time.Minute, time.Second)
6670

6771
t.Log("Reading logs to make sure that ClusterCatalog was reconciled by catalogdv1")
6872
logCtx, cancel := context.WithTimeout(ctx, time.Minute)
@@ -110,16 +114,18 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
110114
t.Log("Wait for operator-controller deployment to be ready")
111115
managerPod := waitForDeployment(t, ctx, "operator-controller")
112116

113-
t.Log("Wait for acquired leader election")
117+
t.Log("Wait for acquired leader election by checking lease")
118+
// Instead of watching logs (which has timing issues in controller-runtime v0.23.1+),
119+
// directly check the lease object to confirm leader election occurred
114120
// Average case is under 1 minute but in the worst case: (previous leader crashed)
115121
// we could have LeaseDuration (137s) + RetryPeriod (26s) +/- 163s
116-
leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
117-
defer leaderCancel()
118-
119-
leaderSubstrings := []string{"successfully acquired lease"}
120-
leaderElected, err := watchPodLogsForSubstring(leaderCtx, managerPod, leaderSubstrings...)
121-
require.NoError(t, err)
122-
require.True(t, leaderElected)
122+
require.EventuallyWithT(t, func(ct *assert.CollectT) {
123+
var lease coordinationv1.Lease
124+
err := c.Get(ctx, types.NamespacedName{Name: "9c4404e7.operatorframework.io", Namespace: "olmv1-system"}, &lease)
125+
require.NoError(ct, err)
126+
require.NotNil(ct, lease.Spec.HolderIdentity, "lease should have a holder")
127+
require.NotEmpty(ct, *lease.Spec.HolderIdentity, "lease holder identity should not be empty")
128+
}, 3*time.Minute, time.Second)
123129

124130
t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it")
125131
// Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes
@@ -277,32 +283,46 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri
277283
}
278284

279285
func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {
280-
podLogOpts := corev1.PodLogOptions{
281-
Follow: true,
282-
Container: container,
283-
}
284-
285-
req := kclientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts)
286-
podLogs, err := req.Stream(ctx)
287-
if err != nil {
288-
return false, err
289-
}
290-
defer podLogs.Close()
286+
// Use a polling approach to periodically check pod logs for the substrings
287+
// This handles controller-runtime v0.23.1+ where leader election happens faster
288+
// and the message may be emitted before we start watching with Follow
289+
ticker := time.NewTicker(2 * time.Second)
290+
defer ticker.Stop()
291+
292+
for {
293+
select {
294+
case <-ctx.Done():
295+
return false, ctx.Err()
296+
case <-ticker.C:
297+
// Get recent logs without Follow to check if message exists
298+
logOpts := corev1.PodLogOptions{
299+
Container: container,
300+
TailLines: ptr.To(int64(1000)),
301+
}
291302

292-
scanner := bufio.NewScanner(podLogs)
293-
for scanner.Scan() {
294-
line := scanner.Text()
303+
req := kclientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &logOpts)
304+
podLogs, err := req.Stream(context.Background()) // Use Background to avoid context cancellation during read
305+
if err != nil {
306+
// Pod might not be ready yet, continue polling
307+
continue
308+
}
295309

296-
foundCount := 0
297-
for _, substring := range substrings {
298-
if strings.Contains(line, substring) {
299-
foundCount++
310+
scanner := bufio.NewScanner(podLogs)
311+
for scanner.Scan() {
312+
line := scanner.Text()
313+
314+
foundCount := 0
315+
for _, substring := range substrings {
316+
if strings.Contains(line, substring) {
317+
foundCount++
318+
}
319+
}
320+
if foundCount == len(substrings) {
321+
podLogs.Close()
322+
return true, nil
323+
}
300324
}
301-
}
302-
if foundCount == len(substrings) {
303-
return true, nil
325+
podLogs.Close()
304326
}
305327
}
306-
307-
return false, scanner.Err()
308328
}

0 commit comments

Comments (0)