Skip to content

Commit 2be7113

Browse files
chore(ci): Add test to check if user changes are preserved
Add a test to ensure that OLM is not reverting user changes like kubectl rollout restart. Assisted-by: Cursor/Claude
1 parent 25704ad commit 2be7113

2 files changed

Lines changed: 300 additions & 0 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
Feature: Rollout Restart User Changes
  # Verifies that user-added pod template annotations persist after OLM reconciliation.
  # Fixes: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392

  Background:
    Given OLM is available
    And ClusterCatalog "test" serves bundles
    And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}

  # TODO(Fix Boxcutter): It does not work with boxcutter
  # but to properly fix we need SSA first. So, this one is blocked
  # by https://github.com/operator-framework/operator-controller/pull/2515
  Scenario: User-initiated deployment changes persist after OLM reconciliation
    When ClusterExtension is applied
    """
    apiVersion: olm.operatorframework.io/v1
    kind: ClusterExtension
    metadata:
      name: ${NAME}
    spec:
      namespace: ${TEST_NAMESPACE}
      serviceAccount:
        name: olm-sa
      source:
        sourceType: Catalog
        catalog:
          packageName: test
          selector:
            matchLabels:
              "olm.operatorframework.io/metadata.name": test-catalog
    """
    Then ClusterExtension is rolled out
    And ClusterExtension is available
    And resource "deployment/test-operator" is installed
    And deployment "test-operator" is ready
    When user performs rollout restart on "deployment/test-operator"
    Then deployment "test-operator" rollout completes successfully
    And deployment "test-operator" has restart annotation
    # Wait for OLM reconciliation cycle (runs every 10 seconds)
    And I wait for "30" seconds
    # Verify user changes persisted after OLM reconciliation
    Then deployment "test-operator" rollout is still successful
    And deployment "test-operator" has restart annotation
    And deployment "test-operator" has expected number of ready replicas

test/e2e/steps/steps.go

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"os/exec"
1515
"path/filepath"
1616
"reflect"
17+
"strconv"
1718
"strings"
1819
"time"
1920

@@ -88,6 +89,13 @@ func RegisterSteps(sc *godog.ScenarioContext) {
8889
sc.Step(`^(?i)resource apply fails with error msg containing "([^"]+)"$`, ResourceApplyFails)
8990
sc.Step(`^(?i)resource "([^"]+)" is eventually restored$`, ResourceRestored)
9091
sc.Step(`^(?i)resource "([^"]+)" matches$`, ResourceMatches)
92+
sc.Step(`^(?i)user performs rollout restart on "([^"]+)"$`, UserPerformsRolloutRestart)
93+
sc.Step(`^(?i)deployment "([^"]+)" has restart annotation$`, ResourceHasRestartAnnotation)
94+
sc.Step(`^(?i)deployment "([^"]+)" is ready$`, DeploymentIsReady)
95+
sc.Step(`^(?i)deployment "([^"]+)" rollout completes successfully$`, DeploymentRolloutCompletesSuccessfully)
96+
sc.Step(`^(?i)I wait for "([^"]+)" seconds$`, WaitForSeconds)
97+
sc.Step(`^(?i)deployment "([^"]+)" rollout is still successful$`, DeploymentRolloutIsStillSuccessful)
98+
sc.Step(`^(?i)deployment "([^"]+)" has expected number of ready replicas$`, DeploymentHasExpectedReadyReplicas)
9199

92100
sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in test namespace$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
93101
sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in \${TEST_NAMESPACE}$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
@@ -1288,3 +1296,251 @@ func latestActiveRevisionForExtension(extName string) (*ocv1.ClusterExtensionRev
12881296

12891297
return latest, nil
12901298
}
1299+
1300+
// UserPerformsRolloutRestart simulates a user running "kubectl rollout restart deployment/<name>".
1301+
// This adds a restart annotation to trigger a rolling restart of pods.
1302+
// This is used to test the generic fix - OLM should not undo ANY user-added annotations.
1303+
// In OLMv0, OLM would undo this change. In OLMv1, it should stay because kubectl owns it.
1304+
// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
1305+
func UserPerformsRolloutRestart(ctx context.Context, resourceName string) error {
1306+
sc := scenarioCtx(ctx)
1307+
resourceName = substituteScenarioVars(resourceName, sc)
1308+
1309+
kind, deploymentName, ok := strings.Cut(resourceName, "/")
1310+
if !ok {
1311+
return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
1312+
}
1313+
1314+
if kind != "deployment" {
1315+
return fmt.Errorf("only deployment resources are supported for restart annotation, got: %s", kind)
1316+
}
1317+
1318+
// Run kubectl rollout restart to add the restart annotation.
1319+
// This is the real command users run, so we test actual user behavior.
1320+
out, err := k8sClient("rollout", "restart", resourceName, "-n", sc.namespace)
1321+
if err != nil {
1322+
return fmt.Errorf("failed to rollout restart %s: %w", resourceName, err)
1323+
}
1324+
1325+
logger.V(1).Info("Rollout restart initiated", "deployment", deploymentName, "output", out)
1326+
1327+
return nil
1328+
}
1329+
1330+
// ResourceHasRestartAnnotation checks that a deployment has a restart annotation.
1331+
// This confirms that user changes stay in place after OLM runs.
1332+
func ResourceHasRestartAnnotation(ctx context.Context, deploymentName string) error {
1333+
sc := scenarioCtx(ctx)
1334+
deploymentName = substituteScenarioVars(deploymentName, sc)
1335+
1336+
restartAnnotationKey := "kubectl.kubernetes.io/restartedAt"
1337+
1338+
var annotationValue string
1339+
waitFor(ctx, func() bool {
1340+
out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1341+
"-o", fmt.Sprintf("jsonpath={.spec.template.metadata.annotations['%s']}", restartAnnotationKey))
1342+
if err != nil {
1343+
return false
1344+
}
1345+
// If the annotation exists and has a value, it stayed in place
1346+
if out == "" {
1347+
return false
1348+
}
1349+
annotationValue = out
1350+
return true
1351+
})
1352+
1353+
logger.V(1).Info("Restart annotation found", "deployment", deploymentName, "restartedAt", annotationValue)
1354+
return nil
1355+
}
1356+
1357+
// DeploymentIsReady checks that a deployment is ready with all pods running.
1358+
func DeploymentIsReady(ctx context.Context, deploymentName string) error {
1359+
sc := scenarioCtx(ctx)
1360+
deploymentName = substituteScenarioVars(deploymentName, sc)
1361+
1362+
waitFor(ctx, func() bool {
1363+
// Check if deployment is ready
1364+
out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1365+
"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
1366+
if err != nil {
1367+
return false
1368+
}
1369+
return out == "True"
1370+
})
1371+
1372+
logger.V(1).Info("Deployment is ready", "deployment", deploymentName)
1373+
return nil
1374+
}
1375+
1376+
// DeploymentRolloutCompletesSuccessfully waits for the rollout to finish.
1377+
// This checks that new pods were created and are running.
1378+
func DeploymentRolloutCompletesSuccessfully(ctx context.Context, deploymentName string) error {
1379+
sc := scenarioCtx(ctx)
1380+
deploymentName = substituteScenarioVars(deploymentName, sc)
1381+
1382+
timeoutArg := fmt.Sprintf("--timeout=%s", timeout)
1383+
out, err := k8sClient("rollout", "status", "deployment/"+deploymentName, "-n", sc.namespace, timeoutArg)
1384+
if err != nil {
1385+
return fmt.Errorf("deployment rollout failed: %w, output: %s", err, out)
1386+
}
1387+
1388+
logger.V(1).Info("Deployment rollout completed", "deployment", deploymentName, "status", out)
1389+
1390+
// Check deployment status
1391+
available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1392+
"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
1393+
if err != nil {
1394+
return fmt.Errorf("failed to check deployment availability: %w", err)
1395+
}
1396+
if available != "True" {
1397+
return fmt.Errorf("deployment %s is not available", deploymentName)
1398+
}
1399+
1400+
progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1401+
"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
1402+
if err != nil {
1403+
return fmt.Errorf("failed to check deployment progressing: %w", err)
1404+
}
1405+
if progressing != "True" {
1406+
return fmt.Errorf("deployment %s is not progressing correctly", deploymentName)
1407+
}
1408+
1409+
return nil
1410+
}
1411+
1412+
// WaitForSeconds waits for the given number of seconds.
1413+
// This gives OLM time to run its checks between test steps.
1414+
//
1415+
// Note: We wait for a fixed time (not checking in a loop) because we need to make sure
1416+
// OLM actually runs (it runs every 10 seconds). We want to check that user changes stay
1417+
// AFTER OLM runs. If we just checked in a loop, we wouldn't know if OLM ran yet.
1418+
func WaitForSeconds(ctx context.Context, seconds string) error {
1419+
sec, err := strconv.Atoi(seconds)
1420+
if err != nil {
1421+
return fmt.Errorf("invalid seconds value %s: %w", seconds, err)
1422+
}
1423+
1424+
if sec <= 0 {
1425+
return fmt.Errorf("seconds value must be greater than 0, got %d", sec)
1426+
}
1427+
1428+
logger.V(1).Info("Waiting for reconciliation", "seconds", sec)
1429+
1430+
timer := time.NewTimer(time.Duration(sec) * time.Second)
1431+
defer timer.Stop()
1432+
1433+
select {
1434+
case <-timer.C:
1435+
logger.V(1).Info("Wait complete")
1436+
return nil
1437+
case <-ctx.Done():
1438+
return fmt.Errorf("wait canceled: %w", ctx.Err())
1439+
}
1440+
}
1441+
1442+
// verifyDeploymentReplicaStatus checks if a deployment has the right number of ready pods.
1443+
func verifyDeploymentReplicaStatus(deploymentName, namespace string) (string, string, error) {
1444+
readyReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", namespace,
1445+
"-o", "jsonpath={.status.readyReplicas}")
1446+
if err != nil {
1447+
return "", "", fmt.Errorf("failed to get ready replicas: %w", err)
1448+
}
1449+
1450+
replicas, err := k8sClient("get", "deployment", deploymentName, "-n", namespace,
1451+
"-o", "jsonpath={.spec.replicas}")
1452+
if err != nil {
1453+
return "", "", fmt.Errorf("failed to get desired replicas: %w", err)
1454+
}
1455+
1456+
// Normalize empty jsonpath results (when readyReplicas is omitted for 0 replicas)
1457+
readyReplicas = strings.TrimSpace(readyReplicas)
1458+
replicas = strings.TrimSpace(replicas)
1459+
if readyReplicas == "" {
1460+
readyReplicas = "0"
1461+
}
1462+
if replicas == "" {
1463+
replicas = "0"
1464+
}
1465+
1466+
// Compare as integers to avoid false negatives
1467+
readyInt, err := strconv.Atoi(readyReplicas)
1468+
if err != nil {
1469+
return readyReplicas, replicas, fmt.Errorf("invalid ready replicas value %q: %w", readyReplicas, err)
1470+
}
1471+
replicasInt, err := strconv.Atoi(replicas)
1472+
if err != nil {
1473+
return readyReplicas, replicas, fmt.Errorf("invalid desired replicas value %q: %w", replicas, err)
1474+
}
1475+
1476+
if readyInt != replicasInt {
1477+
return readyReplicas, replicas, fmt.Errorf("deployment %s has %d ready replicas but expected %d",
1478+
deploymentName, readyInt, replicasInt)
1479+
}
1480+
1481+
return readyReplicas, replicas, nil
1482+
}
1483+
1484+
// DeploymentRolloutIsStillSuccessful checks that the rollout is still working.
1485+
// This makes sure OLM didn't undo the user's "kubectl rollout restart" command.
1486+
// It checks that new pods are still running.
1487+
func DeploymentRolloutIsStillSuccessful(ctx context.Context, deploymentName string) error {
1488+
sc := scenarioCtx(ctx)
1489+
deploymentName = substituteScenarioVars(deploymentName, sc)
1490+
1491+
// Check deployment status
1492+
available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1493+
"-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
1494+
if err != nil {
1495+
return fmt.Errorf("failed to check deployment availability: %w", err)
1496+
}
1497+
if available != "True" {
1498+
return fmt.Errorf("deployment %s is not available - rollout was undone", deploymentName)
1499+
}
1500+
1501+
// Check that the deployment is still working correctly
1502+
progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1503+
"-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
1504+
if err != nil {
1505+
return fmt.Errorf("failed to check deployment progressing: %w", err)
1506+
}
1507+
if progressing != "True" {
1508+
return fmt.Errorf("deployment %s is not working - rollout may have been undone", deploymentName)
1509+
}
1510+
1511+
// Check that the right number of pods are ready (rollout finished and wasn't stopped)
1512+
readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
1513+
if err != nil {
1514+
return fmt.Errorf("%w - rollout may have been undone", err)
1515+
}
1516+
1517+
logger.V(1).Info("Deployment rollout is still successful", "deployment", deploymentName,
1518+
"readyReplicas", readyReplicas, "desiredReplicas", replicas)
1519+
1520+
return nil
1521+
}
1522+
1523+
// DeploymentHasExpectedReadyReplicas checks that the deployment has the right number of ready pods.
1524+
// This makes sure the rollout finished successfully and pods are running.
1525+
func DeploymentHasExpectedReadyReplicas(ctx context.Context, deploymentName string) error {
1526+
sc := scenarioCtx(ctx)
1527+
deploymentName = substituteScenarioVars(deploymentName, sc)
1528+
1529+
// Check that the right number of pods are ready
1530+
readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
1531+
if err != nil {
1532+
return err
1533+
}
1534+
1535+
// Also check that there are no unavailable pods
1536+
unavailableReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
1537+
"-o", "jsonpath={.status.unavailableReplicas}")
1538+
if err == nil && unavailableReplicas != "" && unavailableReplicas != "0" {
1539+
return fmt.Errorf("deployment %s has %s unavailable pods", deploymentName, unavailableReplicas)
1540+
}
1541+
1542+
logger.V(1).Info("Deployment has expected ready replicas", "deployment", deploymentName,
1543+
"readyReplicas", readyReplicas, "desiredReplicas", replicas)
1544+
1545+
return nil
1546+
}

0 commit comments

Comments
 (0)