Skip to content

Commit 666a3cf

Browse files
chore(ci): Add test to check if user changes are preserved
Add a test to ensure that OLM is not reverting user changes like kubectl rollout restart. Assisted-by: Cursor/Claude
1 parent 25704ad commit 666a3cf

2 files changed

Lines changed: 315 additions & 0 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
Feature: Rollout Restart User Changes
  # Verifies that user-added pod template annotations persist after OLM reconciliation.
  # Fixes: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392

  Background:
    Given OLM is available
    And ClusterCatalog "test" serves bundles
    And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}

  # TODO(Fix Boxcutter): It does not work with boxcutter
  # but to properly fix we need SSA first. So, this one is blocked
  # by https://github.com/operator-framework/operator-controller/pull/2515
  Scenario: User-initiated deployment changes persist after OLM reconciliation
    When ClusterExtension is applied
    """
    apiVersion: olm.operatorframework.io/v1
    kind: ClusterExtension
    metadata:
      name: ${NAME}
    spec:
      namespace: ${TEST_NAMESPACE}
      serviceAccount:
        name: olm-sa
      source:
        sourceType: Catalog
        catalog:
          packageName: test
          selector:
            matchLabels:
              "olm.operatorframework.io/metadata.name": test-catalog
    """
    Then ClusterExtension is rolled out
    And ClusterExtension is available
    And resource "deployment/test-operator" is installed
    And deployment "test-operator" is ready
    # Simulate the user action from the upstream issue: a manual rollout restart.
    When user performs rollout restart on "deployment/test-operator"
    Then deployment "test-operator" rollout completes successfully
    And resource "deployment/test-operator" has restart annotation
    # Wait for OLM reconciliation cycle (runs every 10 seconds)
    And I wait for "30" seconds
    # Verify user changes persisted after OLM reconciliation
    Then deployment "test-operator" rollout is still successful
    And resource "deployment/test-operator" has restart annotation
    And deployment "test-operator" has expected number of ready replicas

test/e2e/steps/steps.go

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"os/exec"
1515
"path/filepath"
1616
"reflect"
17+
"strconv"
1718
"strings"
1819
"time"
1920

@@ -88,6 +89,13 @@ func RegisterSteps(sc *godog.ScenarioContext) {
8889
sc.Step(`^(?i)resource apply fails with error msg containing "([^"]+)"$`, ResourceApplyFails)
8990
sc.Step(`^(?i)resource "([^"]+)" is eventually restored$`, ResourceRestored)
9091
sc.Step(`^(?i)resource "([^"]+)" matches$`, ResourceMatches)
92+
sc.Step(`^(?i)user performs rollout restart on "([^"]+)"$`, UserPerformsRolloutRestart)
93+
sc.Step(`^(?i)resource "([^"]+)" has restart annotation$`, ResourceHasRestartAnnotation)
94+
sc.Step(`^(?i)deployment "([^"]+)" is ready$`, DeploymentIsReady)
95+
sc.Step(`^(?i)deployment "([^"]+)" rollout completes successfully$`, DeploymentRolloutCompletesSuccessfully)
96+
sc.Step(`^(?i)I wait for "([^"]+)" seconds$`, WaitForSeconds)
97+
sc.Step(`^(?i)deployment "([^"]+)" rollout is still successful$`, DeploymentRolloutIsStillSuccessful)
98+
sc.Step(`^(?i)deployment "([^"]+)" has expected number of ready replicas$`, DeploymentHasExpectedReadyReplicas)
9199

92100
sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in test namespace$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
93101
sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in \${TEST_NAMESPACE}$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
@@ -1288,3 +1296,266 @@ func latestActiveRevisionForExtension(extName string) (*ocv1.ClusterExtensionRev
12881296

12891297
return latest, nil
12901298
}
1299+
1300+
// UserPerformsRolloutRestart simulates a user running "kubectl rollout restart deployment/<name>".
1301+
// This adds a restart annotation to trigger a rolling restart of pods.
1302+
// This is used to test the generic fix - OLM should not undo ANY user-added annotations.
1303+
// In OLMv0, OLM would undo this change. In OLMv1, it should stay because kubectl owns it.
1304+
// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
1305+
func UserPerformsRolloutRestart(ctx context.Context, resourceName string) error {
1306+
sc := scenarioCtx(ctx)
1307+
resourceName = substituteScenarioVars(resourceName, sc)
1308+
1309+
kind, deploymentName, ok := strings.Cut(resourceName, "/")
1310+
if !ok {
1311+
return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
1312+
}
1313+
1314+
if kind != "deployment" {
1315+
return fmt.Errorf("only deployment resources are supported for restart annotation, got: %s", kind)
1316+
}
1317+
1318+
// Run kubectl rollout restart to add the restart annotation.
1319+
// This is the real command users run, so we test actual user behavior.
1320+
out, err := k8sClient("rollout", "restart", resourceName, "-n", sc.namespace)
1321+
if err != nil {
1322+
return fmt.Errorf("failed to rollout restart %s: %w", resourceName, err)
1323+
}
1324+
1325+
logger.V(1).Info("Rollout restart initiated", "deployment", deploymentName, "output", out)
1326+
1327+
return nil
1328+
}
1329+
1330+
// ResourceHasRestartAnnotation checks that a deployment has a restart annotation.
1331+
// This confirms that user changes stay in place after OLM runs.
1332+
// The fix is generic and works for ANY user-added annotations on pod templates.
1333+
func ResourceHasRestartAnnotation(ctx context.Context, resourceName string) error {
1334+
sc := scenarioCtx(ctx)
1335+
resourceName = substituteScenarioVars(resourceName, sc)
1336+
1337+
kind, deploymentName, ok := strings.Cut(resourceName, "/")
1338+
if !ok {
1339+
return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
1340+
}
1341+
1342+
if kind != "deployment" {
1343+
return fmt.Errorf("only deployment resources are supported for restart annotation check, got: %s", kind)
1344+
}
1345+
1346+
// Look for the restart annotation added by "kubectl rollout restart"
1347+
restartAnnotationKey := "kubectl.kubernetes.io/restartedAt"
1348+
1349+
// Keep checking until the annotation appears (it may not show up immediately)
1350+
// This is better than checking only once
1351+
var annotationValue string
1352+
waitFor(ctx, func() bool {
1353+
out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1354+
"-o", fmt.Sprintf("jsonpath={.spec.template.metadata.annotations['%s']}", restartAnnotationKey))
1355+
if err != nil {
1356+
return false
1357+
}
1358+
// If the annotation exists and has a value, it stayed in place
1359+
if out == "" {
1360+
return false
1361+
}
1362+
annotationValue = out
1363+
return true
1364+
})
1365+
1366+
logger.V(1).Info("Restart annotation found", "deployment", deploymentName, "restartedAt", annotationValue)
1367+
return nil
1368+
}
1369+
1370+
// DeploymentIsReady checks that a deployment is ready with all pods running.
1371+
func DeploymentIsReady(ctx context.Context, deploymentName string) error {
1372+
sc := scenarioCtx(ctx)
1373+
deploymentName = substituteScenarioVars(deploymentName, sc)
1374+
1375+
waitFor(ctx, func() bool {
1376+
// Check if deployment is ready
1377+
out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1378+
"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
1379+
if err != nil {
1380+
return false
1381+
}
1382+
return out == "True"
1383+
})
1384+
1385+
logger.V(1).Info("Deployment is ready", "deployment", deploymentName)
1386+
return nil
1387+
}
1388+
1389+
// DeploymentRolloutCompletesSuccessfully waits for the rollout to finish.
1390+
// This checks that new pods were created and are running.
1391+
func DeploymentRolloutCompletesSuccessfully(ctx context.Context, deploymentName string) error {
1392+
sc := scenarioCtx(ctx)
1393+
deploymentName = substituteScenarioVars(deploymentName, sc)
1394+
1395+
// Use kubectl rollout status to wait until done.
1396+
// This makes sure new pods are created and running.
1397+
// Timeout is 7m to account for startup probes (test-operator has 5min startup probe)
1398+
out, err := k8sClient("rollout", "status", "deployment/"+deploymentName, "-n", sc.namespace, "--timeout=7m")
1399+
if err != nil {
1400+
return fmt.Errorf("deployment rollout failed: %w, output: %s", err, out)
1401+
}
1402+
1403+
logger.V(1).Info("Deployment rollout completed", "deployment", deploymentName, "status", out)
1404+
1405+
// Check deployment status
1406+
available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1407+
"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
1408+
if err != nil {
1409+
return fmt.Errorf("failed to check deployment availability: %w", err)
1410+
}
1411+
if available != "True" {
1412+
return fmt.Errorf("deployment %s is not available", deploymentName)
1413+
}
1414+
1415+
progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1416+
"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
1417+
if err != nil {
1418+
return fmt.Errorf("failed to check deployment progressing: %w", err)
1419+
}
1420+
if progressing != "True" {
1421+
return fmt.Errorf("deployment %s is not progressing correctly", deploymentName)
1422+
}
1423+
1424+
return nil
1425+
}
1426+
1427+
// WaitForSeconds waits for the given number of seconds.
1428+
// This gives OLM time to run its checks between test steps.
1429+
//
1430+
// Note: We wait for a fixed time (not checking in a loop) because we need to make sure
1431+
// OLM actually runs (it runs every 10 seconds). We want to check that user changes stay
1432+
// AFTER OLM runs. If we just checked in a loop, we wouldn't know if OLM ran yet.
1433+
func WaitForSeconds(ctx context.Context, seconds string) error {
1434+
sec, err := strconv.Atoi(seconds)
1435+
if err != nil {
1436+
return fmt.Errorf("invalid seconds value %s: %w", seconds, err)
1437+
}
1438+
1439+
if sec <= 0 {
1440+
return fmt.Errorf("seconds value must be greater than 0, got %d", sec)
1441+
}
1442+
1443+
logger.V(1).Info("Waiting for reconciliation", "seconds", sec)
1444+
1445+
timer := time.NewTimer(time.Duration(sec) * time.Second)
1446+
defer timer.Stop()
1447+
1448+
select {
1449+
case <-timer.C:
1450+
logger.V(1).Info("Wait complete")
1451+
return nil
1452+
case <-ctx.Done():
1453+
return fmt.Errorf("wait canceled: %w", ctx.Err())
1454+
}
1455+
}
1456+
1457+
// verifyDeploymentReplicaStatus checks if a deployment has the right number of ready pods.
1458+
func verifyDeploymentReplicaStatus(deploymentName, namespace string) (string, string, error) {
1459+
readyReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", namespace,
1460+
"-o", "jsonpath={.status.readyReplicas}")
1461+
if err != nil {
1462+
return "", "", fmt.Errorf("failed to get ready replicas: %w", err)
1463+
}
1464+
1465+
replicas, err := k8sClient("get", "deployment", deploymentName, "-n", namespace,
1466+
"-o", "jsonpath={.spec.replicas}")
1467+
if err != nil {
1468+
return "", "", fmt.Errorf("failed to get desired replicas: %w", err)
1469+
}
1470+
1471+
// Normalize empty jsonpath results (when readyReplicas is omitted for 0 replicas)
1472+
readyReplicas = strings.TrimSpace(readyReplicas)
1473+
replicas = strings.TrimSpace(replicas)
1474+
if readyReplicas == "" {
1475+
readyReplicas = "0"
1476+
}
1477+
if replicas == "" {
1478+
replicas = "0"
1479+
}
1480+
1481+
// Compare as integers to avoid false negatives
1482+
readyInt, err := strconv.Atoi(readyReplicas)
1483+
if err != nil {
1484+
return readyReplicas, replicas, fmt.Errorf("invalid ready replicas value %q: %w", readyReplicas, err)
1485+
}
1486+
replicasInt, err := strconv.Atoi(replicas)
1487+
if err != nil {
1488+
return readyReplicas, replicas, fmt.Errorf("invalid desired replicas value %q: %w", replicas, err)
1489+
}
1490+
1491+
if readyInt != replicasInt {
1492+
return readyReplicas, replicas, fmt.Errorf("deployment %s has %d ready replicas but expected %d",
1493+
deploymentName, readyInt, replicasInt)
1494+
}
1495+
1496+
return readyReplicas, replicas, nil
1497+
}
1498+
1499+
// DeploymentRolloutIsStillSuccessful checks that the rollout is still working.
1500+
// This makes sure OLM didn't undo the user's "kubectl rollout restart" command.
1501+
// It checks that new pods are still running.
1502+
func DeploymentRolloutIsStillSuccessful(ctx context.Context, deploymentName string) error {
1503+
sc := scenarioCtx(ctx)
1504+
deploymentName = substituteScenarioVars(deploymentName, sc)
1505+
1506+
// Check deployment status
1507+
available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1508+
"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
1509+
if err != nil {
1510+
return fmt.Errorf("failed to check deployment availability: %w", err)
1511+
}
1512+
if available != "True" {
1513+
return fmt.Errorf("deployment %s is not available - rollout was undone", deploymentName)
1514+
}
1515+
1516+
// Check that the deployment is still working correctly
1517+
progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1518+
"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
1519+
if err != nil {
1520+
return fmt.Errorf("failed to check deployment progressing: %w", err)
1521+
}
1522+
if progressing != "True" {
1523+
return fmt.Errorf("deployment %s is not working - rollout may have been undone", deploymentName)
1524+
}
1525+
1526+
// Check that the right number of pods are ready (rollout finished and wasn't stopped)
1527+
readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
1528+
if err != nil {
1529+
return fmt.Errorf("%w - rollout may have been undone", err)
1530+
}
1531+
1532+
logger.V(1).Info("Deployment rollout is still successful", "deployment", deploymentName,
1533+
"readyReplicas", readyReplicas, "desiredReplicas", replicas)
1534+
1535+
return nil
1536+
}
1537+
1538+
// DeploymentHasExpectedReadyReplicas checks that the deployment has the right number of ready pods.
1539+
// This makes sure the rollout finished successfully and pods are running.
1540+
func DeploymentHasExpectedReadyReplicas(ctx context.Context, deploymentName string) error {
1541+
sc := scenarioCtx(ctx)
1542+
deploymentName = substituteScenarioVars(deploymentName, sc)
1543+
1544+
// Check that the right number of pods are ready
1545+
readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
1546+
if err != nil {
1547+
return err
1548+
}
1549+
1550+
// Also check that there are no unavailable pods
1551+
unavailableReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1552+
"-o", "jsonpath={.status.unavailableReplicas}")
1553+
if err == nil && unavailableReplicas != "" && unavailableReplicas != "0" {
1554+
return fmt.Errorf("deployment %s has %s unavailable pods", deploymentName, unavailableReplicas)
1555+
}
1556+
1557+
logger.V(1).Info("Deployment has expected ready replicas", "deployment", deploymentName,
1558+
"readyReplicas", readyReplicas, "desiredReplicas", replicas)
1559+
1560+
return nil
1561+
}

0 commit comments

Comments
 (0)