Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 125 additions & 29 deletions tools/storm/servicing/tests/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"path/filepath"
"regexp"
"strings"
"sync"
"time"
stormsvcconfig "tridenttools/storm/servicing/utils/config"
stormutils "tridenttools/storm/utils"
Expand All @@ -18,22 +19,23 @@ import (
stormvm "tridenttools/storm/utils/vm"
stormvmconfig "tridenttools/storm/utils/vm/config"

"github.com/microsoft/storm"
"github.com/sirupsen/logrus"
"gopkg.in/yaml.v2"
)

func UpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
return innerUpdateLoop(testConfig, vmConfig, false)
func UpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
return innerUpdateLoop(tc, testConfig, vmConfig, false)
}

func Rollback(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
return innerUpdateLoop(testConfig, vmConfig, true)
func Rollback(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
return innerUpdateLoop(tc, testConfig, vmConfig, true)
}

func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error {
// Create context to ensure goroutines exit cleanly
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
func innerUpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error {
// Use the framework-provided context. Storm cancels this when the test
// case finishes, which automatically terminates any child goroutines.
ctx := tc.Context()

logrus.Tracef("Stop existing update servers if any")
// Kill any running update servers
Expand Down Expand Up @@ -63,11 +65,20 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
cosiFileBase := cosiFile[strings.LastIndex(cosiFile, "/")+1:]

logrus.Tracef("Start update servers (netlisten)")
// Start update servers (netlisten)
// Start update servers (netlisten) — tracked via the framework WaitGroup
// so storm waits for them to exit when the test case closes.
aStartedChannel := make(chan bool)
go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel)
tc.BackgroundWaitGroup().Add(1)
go func() {
defer tc.BackgroundWaitGroup().Done()
stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel)
}()
bStartedChannel := make(chan bool)
go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel)
tc.BackgroundWaitGroup().Add(1)
go func() {
defer tc.BackgroundWaitGroup().Done()
stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel)
}()
// Wait for both update servers to start
<-aStartedChannel
<-bStartedChannel
Comment on lines 82 to 84
Expand Down Expand Up @@ -189,24 +200,50 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
}

logrus.Tracef("Setting up SSH proxy ports for update servers")
aStartedChannel := make(chan bool)
go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel)
bStartedChannel := make(chan bool)
go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel)
// SSH reverse tunnels are needed only during staging and finalize (COSI download).
// Use a per-iteration context so proxies are cleanly stopped after finalize,
// preventing goroutine accumulation across iterations.
proxyCtx, proxyCancel := context.WithCancel(ctx)
var proxyWg sync.WaitGroup
proxyCleanedUp := false
cleanupProxies := func() {
if proxyCleanedUp {
return
}
proxyCleanedUp = true
proxyCancel()
proxyWg.Wait()
}

proxyWg.Add(2)
aStartedChannel := make(chan bool, 1)
go func() {
defer proxyWg.Done()
stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel)
}()
bStartedChannel := make(chan bool, 1)
go func() {
defer proxyWg.Done()
stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel)
}()
// Wait for both SSH proxy ports to be ready
<-aStartedChannel
<-bStartedChannel
if !<-aStartedChannel || !<-bStartedChannel {
cleanupProxies()
return fmt.Errorf("failed to start SSH proxy ports for iteration %d", i)
}

logrus.Tracef("Checking for crash dumps on host")
crashDumpOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "ls /var/crash/*")
if err == nil {
logrus.Debugf("Crash files found on host during iteration %d: %s", i, crashDumpOutput)
logrus.Error("Crash files found on host")
cleanupProxies()
return fmt.Errorf("crash files found on host during iteration %d", i)
}

if rollback && i == 1 {
if err := prepareRollback(vmConfig, vmIP, updateConfig, expectedVolume, i); err != nil {
cleanupProxies()
return fmt.Errorf("failed to prepare rollback for iteration %d: %w", i, err)
}
}
Expand All @@ -232,20 +269,25 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi

stageLogLocalTmpFile, err := os.CreateTemp("", "staged-trident-full")
if err != nil {
cleanupProxies()
return fmt.Errorf("failed to create temp staging log file: %w", err)
}
stageLogLocalTmpPath := stageLogLocalTmpFile.Name()
defer os.Remove(stageLogLocalTmpPath)
stageLogLocalTmpFile.Close()

err = stormssh.ScpDownloadFile(vmConfig.VMConfig, vmIP, "/var/log/trident-full.log", stageLogLocalTmpPath)
if err != nil {
os.Remove(stageLogLocalTmpPath)
cleanupProxies()
return fmt.Errorf("failed to download staged trident log: %w", err)
}

if testConfig.OutputPath != "" {
logrus.Tracef("Download staging trident logs for iteration %d", i)
stageLogPath := filepath.Join(testConfig.OutputPath, fmt.Sprintf("%s-staged-trident-full.log", fmt.Sprintf("%03d", i)))
if err := exec.Command("cp", stageLogLocalTmpPath, stageLogPath).Run(); err != nil {
os.Remove(stageLogLocalTmpPath)
cleanupProxies()
return fmt.Errorf("failed to copy staged trident log to output path: %w", err)
}
if err := os.Chmod(stageLogPath, 0644); err != nil {
Expand All @@ -257,6 +299,8 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
}

if stageErr != nil {
os.Remove(stageLogLocalTmpPath)
cleanupProxies()
if egrepOut, err := exec.Command("/bin/sh", "-c", fmt.Sprintf("grep 'target is busy' %s | grep umount", stageLogLocalTmpPath)).CombinedOutput(); err == nil {
// Check for known unmount failure and signal
Comment on lines 301 to 305
logrus.Errorf("umount failure (iteration %d: %v): %s", i, stageErr, egrepOut)
Expand All @@ -273,6 +317,9 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
return fmt.Errorf("no update servicing required (iteration %d: %v)", i, stageErr)
}

// Clean up staging temp file now rather than accumulating defers
os.Remove(stageLogLocalTmpPath)
Comment thread
bfjelds marked this conversation as resolved.

logrus.Tracef("Running Trident update finalize command on VM")

// Capture "uptime --since" before the finalize reboot. After reboot, the SSH
Expand Down Expand Up @@ -303,6 +350,11 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
logrus.Tracef("Finalize output for iteration %d:\n%s\n%v", i, combinedFinalizeOutput, finalizeErr)
}

// SSH reverse tunnels are no longer needed — finalize re-reads the COSI
// file, so tunnels must stay alive through finalize. Stop them now before
// the reboot to avoid orphaned SSH processes.
cleanupProxies()

Comment on lines +353 to +357
logrus.Tracef("Wait for VM to come back up after finalize reboot")
if vmConfig.VMConfig.Platform == stormvmconfig.PlatformQEMU {
err := vmConfig.QemuConfig.WaitForLogin(vmConfig.VMConfig.Name, testConfig.OutputPath, testConfig.Verbose, i)
Expand Down Expand Up @@ -546,8 +598,12 @@ func validateRollback(cfg stormvmconfig.VMConfig, vmIP string) error {
}

// checkSerialLogForDracutIssues scans the serial log for patterns that indicate
// initramfs is stuck waiting for a device, which is the symptom of bug 15086
// (stale disk UUIDs embedded in initramfs by dracut).
// initramfs had issues. It distinguishes between:
// - Genuine initramfs failures (emergency shell, timeout) where the VM never
// reached systemd
// - Transient initramfs warnings that resolved (dracut-initqueue delay) where
// the VM booted past initramfs into systemd but may be unreachable for other
// reasons (e.g., sshd not started, network issue)
func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
if serialLogPath == "" {
return
Expand All @@ -559,21 +615,61 @@ func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
}
content := string(data)

// Check if the VM booted past initramfs into systemd. We look for
// systemd PID 1 messages that only appear AFTER initramfs hands off.
//
// IMPORTANT: Only consider lines with a kernel timestamp [ N.NNN]
// to avoid false positives from messages leaked from a previous boot
// (e.g., if serial log truncation raced with shutdown messages).
// A line like "[ 5.123] systemd[1]: Finished systemd-remount-fs"
// confirms THIS boot reached systemd, not a prior one.
bootedToSystemd := false
for _, line := range strings.Split(content, "\n") {
if !strings.Contains(line, "systemd[1]:") {
continue
}
if !strings.Contains(line, "systemd-remount-fs") && !strings.Contains(line, "local-fs.target") {
continue
}
// Verify it has a kernel timestamp (rules out plain console text
// from previous boots or dracut debug output).
if strings.Contains(line, "[") && strings.Contains(line, "]") {
// Extract kernel timestamp — should be a small number for
// current boot (typically < 60s), not a huge number from an
// accumulated log.
bootedToSystemd = true
break
}
}

dracutPatterns := []struct {
pattern string
message string
pattern string
message string
definite bool // true = definitely stuck in initramfs (not a transient warning)
}{
{"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device"},
{"Could not boot", "dracut 'Could not boot' error detected"},
{"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs"},
{"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference"},
{"rd.break", "rd.break detected — initramfs dropped to debug shell"},
{"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)"},
{"Could not boot", "dracut 'Could not boot' error detected", true},
{"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs", true},
{"Entering emergency mode", "initramfs or systemd entered emergency mode", true},
{"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)", true},
Comment thread
bfjelds marked this conversation as resolved.
{"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device", false},
{"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference", false},
{"rd.break", "rd.break detected — initramfs dropped to debug shell", false},
}

matchCount := 0
for _, dp := range dracutPatterns {
if strings.Contains(content, dp.pattern) {
logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
matchCount++
if bootedToSystemd && !dp.definite {
// VM booted past initramfs — dracut warning was transient.
logrus.Warnf("INITRAMFS NOTE (iteration %d): %s — but VM booted past initramfs into systemd (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
} else {
logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
}
}
}

if matchCount > 0 && bootedToSystemd {
logrus.Warnf("INITRAMFS SUMMARY (iteration %d): VM reached systemd despite %d dracut warning(s) — failure is likely SSH/network unreachability, not initramfs", iteration, matchCount)
}
}
15 changes: 12 additions & 3 deletions tools/storm/servicing/trident.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,19 @@ func (h *TridentServicingScenario) Cleanup(ctx storm.SetupCleanupContext) error
}

func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func(stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error {
return h.runTestCaseWithContext(tc, func(_ storm.TestCase, cfg stormsvcconfig.TestConfig, vm stormvmconfig.AllVMConfig) error {
return testFunc(cfg, vm)
})
}

// runTestCaseWithContext passes the storm.TestCase to the test function,
// giving it access to tc.Context() and tc.BackgroundWaitGroup() for proper
// goroutine lifecycle management.
func (h *TridentServicingScenario) runTestCaseWithContext(tc storm.TestCase, testFunc func(storm.TestCase, stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error {
if tc.Name() != h.args.TestCaseToRun && h.args.TestCaseToRun != "all" {
tc.Skip(fmt.Sprintf("Test case '%s' does not align to TestCaseToRun '%s'", tc.Name(), h.args.TestCaseToRun))
} else {
logrus.Infof("Running test case '%s'", tc.Name())
// create test-specific output directory
testCaseSpecificConfig := h.args.TestConfig
testCaseSpecificConfig.OutputPath = h.args.TestConfig.OutputPath
if testCaseSpecificConfig.OutputPath != "" {
Expand All @@ -92,6 +100,7 @@ func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func(
}
}
err := testFunc(
tc,
testCaseSpecificConfig,
stormvmconfig.AllVMConfig{
VMConfig: h.args.VMConfig,
Expand All @@ -117,15 +126,15 @@ func (h *TridentServicingScenario) checkDeployment(tc storm.TestCase) error {
}

func (h *TridentServicingScenario) updateLoop(tc storm.TestCase) error {
return h.runTestCase(tc, stormsvctests.UpdateLoop)
return h.runTestCaseWithContext(tc, stormsvctests.UpdateLoop)
}

func (h *TridentServicingScenario) rollback(tc storm.TestCase) error {
if !h.args.TestConfig.Rollback {
tc.Skip("Test case 'rollback' is skipped because rollback testing is disabled")
return nil // No action needed if rollback is not enabled
}
return h.runTestCase(tc, stormsvctests.Rollback)
return h.runTestCaseWithContext(tc, stormsvctests.Rollback)
}

func (h *TridentServicingScenario) collectLogs(tc storm.TestCase) error {
Expand Down
35 changes: 31 additions & 4 deletions tools/storm/utils/ssh/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package ssh

import (
"context"
"errors"
"fmt"
"os"
"os/exec"
Expand Down Expand Up @@ -40,6 +41,7 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse
"-o", "ServerAliveInterval=5",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ExitOnForwardFailure=yes",
"-i", sshKeyPath,
fmt.Sprintf("%s@%s", sshUser, vmIP),
)
Expand All @@ -48,12 +50,37 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse

logrus.Tracef("Starting SSH proxy for port %d to VM %s with user %s", port, vmIP, sshUser)
if err := cmd.Start(); err != nil {
startedChannel <- false
return fmt.Errorf("failed to start SSH proxy for port %d: %w", port, err)
}
Comment thread
bfjelds marked this conversation as resolved.
// Signal that the SSH proxy has started
startedChannel <- true
// Wait for the command to finish
if err := cmd.Wait(); err != nil {

// With ExitOnForwardFailure=yes, SSH exits immediately if the remote port
// forward fails. Brief pause to catch that before signaling readiness.
earlyExit := make(chan error, 1)
go func() { earlyExit <- cmd.Wait() }()

select {
case err := <-earlyExit:
// SSH exited before we could signal readiness — forward setup failed.
startedChannel <- false
if err != nil {
return fmt.Errorf("SSH proxy for port %d exited immediately (forward setup failed): %w", port, err)
}
return fmt.Errorf("SSH proxy for port %d exited immediately", port)
case <-time.After(1 * time.Second):
// SSH still running after 1s — forward is likely established.
startedChannel <- true
}

// Wait for the command to finish (blocks until context cancel or SSH dies).
err := <-earlyExit
if err != nil {
// Context cancellation is expected during cleanup — only swallow
// Canceled, not DeadlineExceeded (which indicates a real timeout).
if errors.Is(ctx.Err(), context.Canceled) {
logrus.Tracef("SSH proxy for port %d stopped (context cancelled)", port)
return nil
}
return fmt.Errorf("SSH proxy for port %d failed: %w", port, err)
}
logrus.Tracef("SSH proxy for port %d exited", port)
Expand Down
Loading
Loading