From 3792d15f2f2be62c4638a5377243004f3e330cec Mon Sep 17 00:00:00 2001
From: bfjelds <bfjelds@microsoft.com>
Date: Thu, 14 May 2026 11:39:35 -0700
Subject: [PATCH 1/3] fix(storm): use framework context for update-loop
 goroutine lifecycle

Three fixes for scale test reliability:

1. Use storm TestCase.Context() instead of context.Background()
   - The servicing update-loop created its own orphaned context, bypassing
     the storm framework's goroutine lifecycle management
   - Thread tc (storm.TestCase) through to UpdateLoop/Rollback via new
     runTestCaseWithContext method, matching the pattern used in e2e tests
   - Register netlisten goroutines with tc.BackgroundWaitGroup()

2. Per-iteration SSH proxy lifecycle
   - SSH reverse tunnel goroutines were started inside the loop using the
     function-level context, accumulating 2 goroutines per iteration
   - Now use per-iteration child context with explicit cleanup before
     finalize (tunnels are only needed during staging)
   - Add ExitOnForwardFailure=yes to detect remote port binding failures
   - Handle cmd.Start() failure to prevent deadlock on startedChannel

3. Improve initramfs diagnostics (checkSerialLogForDracutIssues)
   - Distinguish between genuine initramfs failures (emergency shell,
     device timeout) and transient warnings where the VM booted past
     initramfs into systemd but is unreachable for other reasons
   - Log transient warnings as WARN instead of ERRO to avoid
     misattribution of SSH/network failures as initramfs issues

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tools/storm/servicing/tests/update.go | 134 ++++++++++++++++++++------
 tools/storm/servicing/trident.go      |  15 ++-
 tools/storm/utils/ssh/ssh.go          |  35 ++++++-
 tools/storm/utils/vm/qemu/qemu.go     |   5 +
 4 files changed, 153 insertions(+), 36 deletions(-)

diff --git a/tools/storm/servicing/tests/update.go b/tools/storm/servicing/tests/update.go
index 5cf6a1f37..0fc26f2a6 100644
--- a/tools/storm/servicing/tests/update.go
+++ b/tools/storm/servicing/tests/update.go
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"strings"
+	"sync"
 	"time"
 	stormsvcconfig "tridenttools/storm/servicing/utils/config"
 	stormutils "tridenttools/storm/utils"
@@ -18,22 +19,23 @@ import (
 	stormvm "tridenttools/storm/utils/vm"
 	stormvmconfig "tridenttools/storm/utils/vm/config"
 
+	"github.com/microsoft/storm"
 	"github.com/sirupsen/logrus"
 	"gopkg.in/yaml.v2"
 )
 
-func UpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
-	return innerUpdateLoop(testConfig, vmConfig, false)
+func UpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
+	return innerUpdateLoop(tc, testConfig, vmConfig, false)
 }
 
-func Rollback(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
-	return innerUpdateLoop(testConfig, vmConfig, true)
+func Rollback(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
+	return innerUpdateLoop(tc, testConfig, vmConfig, true)
 }
 
-func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error {
-	// Create context to ensure goroutines exit cleanly
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
+func innerUpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error {
+	// Use the framework-provided context. Storm cancels this when the test
+	// case finishes, which automatically terminates any child goroutines.
+	ctx := tc.Context()
 
 	logrus.Tracef("Stop existing update servers if any")
 	// Kill any running update servers
@@ -63,11 +65,20 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 	cosiFileBase := cosiFile[strings.LastIndex(cosiFile, "/")+1:]
 
 	logrus.Tracef("Start update servers (netlisten)")
-	// Start update servers (netlisten)
+	// Start update servers (netlisten) — tracked via the framework WaitGroup
+	// so storm waits for them to exit when the test case closes.
 	aStartedChannel := make(chan bool)
-	go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel)
+	tc.BackgroundWaitGroup().Add(1)
+	go func() {
+		defer tc.BackgroundWaitGroup().Done()
+		stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel)
+	}()
 	bStartedChannel := make(chan bool)
-	go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel)
+	tc.BackgroundWaitGroup().Add(1)
+	go func() {
+		defer tc.BackgroundWaitGroup().Done()
+		stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel)
+	}()
 	// Wait for both update servers to start
 	<-aStartedChannel
 	<-bStartedChannel
@@ -189,24 +200,50 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 		}
 
 		logrus.Tracef("Setting up SSH proxy ports for update servers")
-		aStartedChannel := make(chan bool)
-		go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel)
-		bStartedChannel := make(chan bool)
-		go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel)
+		// SSH reverse tunnels are needed only during staging and finalize (COSI download).
+		// Use a per-iteration context so proxies are cleanly stopped after finalize,
+		// preventing goroutine accumulation across iterations.
+		proxyCtx, proxyCancel := context.WithCancel(ctx)
+		var proxyWg sync.WaitGroup
+		proxyCleanedUp := false
+		cleanupProxies := func() {
+			if proxyCleanedUp {
+				return
+			}
+			proxyCleanedUp = true
+			proxyCancel()
+			proxyWg.Wait()
+		}
+
+		proxyWg.Add(2)
+		aStartedChannel := make(chan bool, 1)
+		go func() {
+			defer proxyWg.Done()
+			stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel)
+		}()
+		bStartedChannel := make(chan bool, 1)
+		go func() {
+			defer proxyWg.Done()
+			stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel)
+		}()
 		// Wait for both SSH proxy ports to be ready
-		<-aStartedChannel
-		<-bStartedChannel
+		if !<-aStartedChannel || !<-bStartedChannel {
+			cleanupProxies()
+			return fmt.Errorf("failed to start SSH proxy ports for iteration %d", i)
+		}
 
 		logrus.Tracef("Checking for crash dumps on host")
 		crashDumpOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "ls /var/crash/*")
 		if err == nil {
 			logrus.Debugf("Crash files found on host during iteration %d: %s", i, crashDumpOutput)
 			logrus.Error("Crash files found on host")
+			cleanupProxies()
 			return fmt.Errorf("crash files found on host during iteration %d", i)
 		}
 
 		if rollback && i == 1 {
 			if err := prepareRollback(vmConfig, vmIP, updateConfig, expectedVolume, i); err != nil {
+				cleanupProxies()
 				return fmt.Errorf("failed to prepare rollback for iteration %d: %w", i, err)
 			}
 		}
@@ -232,13 +269,16 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 
 		stageLogLocalTmpFile, err := os.CreateTemp("", "staged-trident-full")
 		if err != nil {
+			cleanupProxies()
 			return fmt.Errorf("failed to create temp staging log file: %w", err)
 		}
 		stageLogLocalTmpPath := stageLogLocalTmpFile.Name()
-		defer os.Remove(stageLogLocalTmpPath)
+		stageLogLocalTmpFile.Close()
 
 		err = stormssh.ScpDownloadFile(vmConfig.VMConfig, vmIP, "/var/log/trident-full.log", stageLogLocalTmpPath)
 		if err != nil {
+			os.Remove(stageLogLocalTmpPath)
+			cleanupProxies()
 			return fmt.Errorf("failed to download staged trident log: %w", err)
 		}
 
@@ -246,6 +286,8 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 			logrus.Tracef("Download staging trident logs for iteration %d", i)
 			stageLogPath := filepath.Join(testConfig.OutputPath, fmt.Sprintf("%s-staged-trident-full.log", fmt.Sprintf("%03d", i)))
 			if err := exec.Command("cp", stageLogLocalTmpPath, stageLogPath).Run(); err != nil {
+				os.Remove(stageLogLocalTmpPath)
+				cleanupProxies()
 				return fmt.Errorf("failed to copy staged trident log to output path: %w", err)
 			}
 			if err := os.Chmod(stageLogPath, 0644); err != nil {
@@ -257,6 +299,8 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 		}
 
 		if stageErr != nil {
+			os.Remove(stageLogLocalTmpPath)
+			cleanupProxies()
 			if egrepOut, err := exec.Command("/bin/sh", "-c", fmt.Sprintf("grep 'target is busy' %s | grep umount", stageLogLocalTmpPath)).CombinedOutput(); err == nil {
 				// Check for known unmount failure and signal
 				logrus.Errorf("umount failure (iteration %d: %v): %s", i, stageErr, egrepOut)
@@ -273,6 +317,9 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 			return fmt.Errorf("no update servicing required (iteration %d: %v)", i, stageErr)
 		}
 
+		// Clean up staging temp file now rather than accumulating defers
+		os.Remove(stageLogLocalTmpPath)
+
 		logrus.Tracef("Running Trident update finalize command on VM")
 
 		// Capture "uptime --since" before the finalize reboot. After reboot, the SSH
@@ -303,6 +350,11 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 			logrus.Tracef("Finalize output for iteration %d:\n%s\n%v", i, combinedFinalizeOutput, finalizeErr)
 		}
 
+		// SSH reverse tunnels are no longer needed — finalize re-reads the COSI
+		// file, so tunnels must stay alive through finalize. Stop them now before
+		// the reboot to avoid orphaned SSH processes.
+		cleanupProxies()
+
 		logrus.Tracef("Wait for VM to come back up after finalize reboot")
 		if vmConfig.VMConfig.Platform == stormvmconfig.PlatformQEMU {
 			err := vmConfig.QemuConfig.WaitForLogin(vmConfig.VMConfig.Name, testConfig.OutputPath, testConfig.Verbose, i)
@@ -546,8 +598,12 @@ func validateRollback(cfg stormvmconfig.VMConfig, vmIP string) error {
 }
 
 // checkSerialLogForDracutIssues scans the serial log for patterns that indicate
-// initramfs is stuck waiting for a device, which is the symptom of bug 15086
-// (stale disk UUIDs embedded in initramfs by dracut).
+// initramfs had issues. It distinguishes between:
+//   - Genuine initramfs failures (emergency shell, timeout) where the VM never
+//     reached systemd
+//   - Transient initramfs warnings that resolved (dracut-initqueue delay) where
+//     the VM booted past initramfs into systemd but may be unreachable for other
+//     reasons (e.g., sshd not started, network issue)
 func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
 	if serialLogPath == "" {
 		return
@@ -559,21 +615,41 @@ func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
 	}
 	content := string(data)
 
+	// Check if the VM booted past initramfs into systemd. The presence of
+	// systemd PID 1 messages after initramfs means the root filesystem was
+	// mounted and systemd took over. Dracut warnings in this case are
+	// transient — they delayed boot but didn't prevent it.
+	bootedToSystemd := strings.Contains(content, "systemd[1]: Finished systemd-remount-fs.service") ||
+		strings.Contains(content, "systemd[1]: Reached target local-fs.target")
+
 	dracutPatterns := []struct {
-		pattern string
-		message string
+		pattern  string
+		message  string
+		definite bool // true = definitely stuck in initramfs (not a transient warning)
 	}{
-		{"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device"},
-		{"Could not boot", "dracut 'Could not boot' error detected"},
-		{"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs"},
-		{"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference"},
-		{"rd.break", "rd.break detected — initramfs dropped to debug shell"},
-		{"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)"},
+		{"Could not boot", "dracut 'Could not boot' error detected", true},
+		{"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs", true},
+		{"Entering emergency mode", "initramfs or systemd entered emergency mode", false},
+		{"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)", true},
+		{"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device", false},
+		{"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference", false},
+		{"rd.break", "rd.break detected — initramfs dropped to debug shell", false},
 	}
 
+	matchCount := 0
 	for _, dp := range dracutPatterns {
 		if strings.Contains(content, dp.pattern) {
-			logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
+			matchCount++
+			if bootedToSystemd && !dp.definite {
+				// VM booted past initramfs — dracut warning was transient.
+				logrus.Warnf("INITRAMFS NOTE (iteration %d): %s — but VM booted past initramfs into systemd (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
+			} else {
+				logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
+			}
 		}
 	}
+
+	if matchCount > 0 && bootedToSystemd {
+		logrus.Warnf("INITRAMFS SUMMARY (iteration %d): VM reached systemd despite %d dracut warning(s) — failure is likely SSH/network unreachability, not initramfs", iteration, matchCount)
+	}
 }
diff --git a/tools/storm/servicing/trident.go b/tools/storm/servicing/trident.go
index d72692913..e0ae5c9ed 100644
--- a/tools/storm/servicing/trident.go
+++ b/tools/storm/servicing/trident.go
@@ -78,11 +78,19 @@ func (h *TridentServicingScenario) Cleanup(ctx storm.SetupCleanupContext) error
 }
 
 func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func(stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error {
+	return h.runTestCaseWithContext(tc, func(_ storm.TestCase, cfg stormsvcconfig.TestConfig, vm stormvmconfig.AllVMConfig) error {
+		return testFunc(cfg, vm)
+	})
+}
+
+// runTestCaseWithContext passes the storm.TestCase to the test function,
+// giving it access to tc.Context() and tc.BackgroundWaitGroup() for proper
+// goroutine lifecycle management.
+func (h *TridentServicingScenario) runTestCaseWithContext(tc storm.TestCase, testFunc func(storm.TestCase, stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error {
 	if tc.Name() != h.args.TestCaseToRun && h.args.TestCaseToRun != "all" {
 		tc.Skip(fmt.Sprintf("Test case '%s' does not align to TestCaseToRun '%s'", tc.Name(), h.args.TestCaseToRun))
 	} else {
 		logrus.Infof("Running test case '%s'", tc.Name())
-		// create test-specific output directory
 		testCaseSpecificConfig := h.args.TestConfig
 		testCaseSpecificConfig.OutputPath = h.args.TestConfig.OutputPath
 		if testCaseSpecificConfig.OutputPath != "" {
@@ -92,6 +100,7 @@ func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func(
 			}
 		}
 		err := testFunc(
+			tc,
 			testCaseSpecificConfig,
 			stormvmconfig.AllVMConfig{
 				VMConfig:    h.args.VMConfig,
@@ -117,7 +126,7 @@ func (h *TridentServicingScenario) checkDeployment(tc storm.TestCase) error {
 }
 
 func (h *TridentServicingScenario) updateLoop(tc storm.TestCase) error {
-	return h.runTestCase(tc, stormsvctests.UpdateLoop)
+	return h.runTestCaseWithContext(tc, stormsvctests.UpdateLoop)
 }
 
 func (h *TridentServicingScenario) rollback(tc storm.TestCase) error {
@@ -125,7 +134,7 @@ func (h *TridentServicingScenario) rollback(tc storm.TestCase) error {
 		tc.Skip("Test case 'rollback' is skipped because rollback testing is disabled")
 		return nil // No action needed if rollback is not enabled
 	}
-	return h.runTestCase(tc, stormsvctests.Rollback)
+	return h.runTestCaseWithContext(tc, stormsvctests.Rollback)
 }
 
 func (h *TridentServicingScenario) collectLogs(tc storm.TestCase) error {
diff --git a/tools/storm/utils/ssh/ssh.go b/tools/storm/utils/ssh/ssh.go
index 9cc5e9184..d5e3b470c 100644
--- a/tools/storm/utils/ssh/ssh.go
+++ b/tools/storm/utils/ssh/ssh.go
@@ -2,6 +2,7 @@ package ssh
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -40,6 +41,7 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse
 		"-o", "ServerAliveInterval=5",
 		"-o", "StrictHostKeyChecking=no",
 		"-o", "UserKnownHostsFile=/dev/null",
+		"-o", "ExitOnForwardFailure=yes",
 		"-i", sshKeyPath,
 		fmt.Sprintf("%s@%s", sshUser, vmIP),
 	)
@@ -48,12 +50,37 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse
 
 	logrus.Tracef("Starting SSH proxy for port %d to VM %s with user %s", port, vmIP, sshUser)
 	if err := cmd.Start(); err != nil {
+		startedChannel <- false
 		return fmt.Errorf("failed to start SSH proxy for port %d: %w", port, err)
 	}
-	// Signal that the SSH proxy has started
-	startedChannel <- true
-	// Wait for the command to finish
-	if err := cmd.Wait(); err != nil {
+
+	// With ExitOnForwardFailure=yes, SSH exits immediately if the remote port
+	// forward fails. Brief pause to catch that before signaling readiness.
+	earlyExit := make(chan error, 1)
+	go func() { earlyExit <- cmd.Wait() }()
+
+	select {
+	case err := <-earlyExit:
+		// SSH exited before we could signal readiness — forward setup failed.
+		startedChannel <- false
+		if err != nil {
+			return fmt.Errorf("SSH proxy for port %d exited immediately (forward setup failed): %w", port, err)
+		}
+		return fmt.Errorf("SSH proxy for port %d exited immediately", port)
+	case <-time.After(1 * time.Second):
+		// SSH still running after 1s — forward is likely established.
+		startedChannel <- true
+	}
+
+	// Wait for the command to finish (blocks until context cancel or SSH dies).
+	err := <-earlyExit
+	if err != nil {
+		// Context cancellation is expected during cleanup — only swallow
+		// Canceled, not DeadlineExceeded (which indicates a real timeout).
+		if errors.Is(ctx.Err(), context.Canceled) {
+			logrus.Tracef("SSH proxy for port %d stopped (context cancelled)", port)
+			return nil
+		}
 		return fmt.Errorf("SSH proxy for port %d failed: %w", port, err)
 	}
 	logrus.Tracef("SSH proxy for port %d exited", port)
diff --git a/tools/storm/utils/vm/qemu/qemu.go b/tools/storm/utils/vm/qemu/qemu.go
index fcec985b9..53fd69c0c 100644
--- a/tools/storm/utils/vm/qemu/qemu.go
+++ b/tools/storm/utils/vm/qemu/qemu.go
@@ -342,6 +342,11 @@ func (cfg QemuConfig) TruncateLog(vmName string) error {
 
 func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose bool, iteration int) error {
 	localSerialLog := "./serial.log"
+	// Truncate the local serial log so it only contains output from THIS
+	// iteration's boot, not accumulated output from all previous iterations.
+	if err := os.Truncate(localSerialLog, 0); err != nil && !os.IsNotExist(err) {
+		logrus.Warnf("Failed to truncate local serial log before iteration %d: %v", iteration, err)
+	}
 	// Wait for login prompt to appear in the serial log and save the log to localSerialLog
 	waitErr := innerWaitForLogin(cfg.SerialLog, verbose, iteration, localSerialLog)
 	// Copy serial log to output directory if specified

From bf3879ed90ee9c0589e5b7b208ec3d900e03aed6 Mon Sep 17 00:00:00 2001
From: bfjelds <bfjelds@microsoft.com>
Date: Mon, 18 May 2026 17:18:24 -0700
Subject: [PATCH 2/3] diag: capture VM state when SSH unreachable after boot

When a VM boots (DHCP lease + virsh running) but SSH port 22 is
unreachable, capture diagnostics to identify the root cause:
1. virsh domifaddr - verify VM network interfaces
2. ARP table entry - check L2 reachability
3. Ping test - check L3 reachability
4. TCP port probes - check key services
5. Full serial log save - preserve boot output
6. virsh console commands - query systemd/sshd/network state

(cherry-picked from scale-failure-diagnostics branch)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tools/storm/utils/vm/qemu/qemu.go | 93 +++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/tools/storm/utils/vm/qemu/qemu.go b/tools/storm/utils/vm/qemu/qemu.go
index 53fd69c0c..70f79104d 100644
--- a/tools/storm/utils/vm/qemu/qemu.go
+++ b/tools/storm/utils/vm/qemu/qemu.go
@@ -418,6 +418,10 @@ func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose boo
 				// VM has a stale DHCP lease but is unreachable — it genuinely
 				// failed to boot (not a serial-getty race condition).
 				logrus.Warnf("VM '%s' has DHCP lease (IP: %s) but SSH port is unreachable — stale lease from previous boot, VM failed to come back up", vmName, ips[0])
+
+				// Gather additional diagnostics to help identify why the VM
+				// is unreachable despite appearing to boot.
+				cfg.captureUnreachableVmDiagnostics(vmName, ips[0], iteration, outputPath)
 			} else {
 				conn.Close()
 				logrus.Warnf("VM '%s' has DHCP lease (IP: %s) and SSH port is reachable — VM booted but serial-getty did not start (systemd race: https://github.com/systemd/systemd/issues/10850)", vmName, ips[0])
@@ -513,3 +517,92 @@ func innerWaitForLogin(vmSerialLog string, verbose bool, iteration int, localSer
 	// host CPU contention from other processes or parallel pipeline tasks.
 	return stormutils.WaitForLoginMessageInSerialLog(vmSerialLog, verbose, iteration, localSerialLog, time.Second*180)
 }
+
+// captureUnreachableVmDiagnostics gathers information about a VM that booted
+// (has a DHCP lease, virsh shows running) but whose SSH port is unreachable.
+// This helps identify whether the issue is networking, sshd, or a late boot
+// failure (e.g. kernel panic after early systemd).
+func (cfg QemuConfig) captureUnreachableVmDiagnostics(vmName string, ip string, iteration int, outputPath string) {
+	padIter := fmt.Sprintf("%03d", iteration)
+	logrus.Infof("=== UNREACHABLE VM DIAGNOSTICS (iteration %d) ===", iteration)
+
+	// 1. virsh domifaddr — check if the VM actually has a network interface with an IP
+	//    (vs relying on the stale DHCP lease from dnsmasq)
+	if domifOut, err := exec.Command("virsh", "domifaddr", vmName, "--source", "agent").CombinedOutput(); err == nil {
+		logrus.Infof("VM interface addresses (guest-agent): %s", domifOut)
+	} else {
+		// Guest agent likely not running; try the lease source
+		if domifOut2, err2 := exec.Command("virsh", "domifaddr", vmName, "--source", "lease").CombinedOutput(); err2 == nil {
+			logrus.Infof("VM interface addresses (lease): %s", domifOut2)
+		} else {
+			logrus.Warnf("virsh domifaddr failed (agent: %v, lease: %v)", err, err2)
+		}
+	}
+
+	// 2. ARP table — check if the host can see the VM's MAC at the expected IP
+	if arpOut, err := exec.Command("ip", "neigh", "show", ip).CombinedOutput(); err == nil {
+		logrus.Infof("ARP entry for %s: %s", ip, strings.TrimSpace(string(arpOut)))
+	}
+
+	// 3. Ping — can we reach the VM at the network layer?
+	if pingOut, err := exec.Command("ping", "-c", "3", "-W", "2", ip).CombinedOutput(); err == nil {
+		logrus.Infof("Ping %s: reachable", ip)
+	} else {
+		logrus.Warnf("Ping %s: unreachable — %s", ip, strings.TrimSpace(string(pingOut)))
+	}
+
+	// 4. Port scan — check if any port is open (not just 22)
+	for _, port := range []int{22, 80, 443, 111} {
+		addr := fmt.Sprintf("%s:%d", ip, port)
+		conn, err := net.DialTimeout("tcp", addr, 3*time.Second)
+		if err == nil {
+			conn.Close()
+			logrus.Infof("TCP port %d: OPEN", port)
+		} else {
+			logrus.Infof("TCP port %d: closed/filtered", port)
+		}
+	}
+
+	// 5. Full serial log — save the ENTIRE serial log, not just the last 100 lines.
+	//    The tail misses everything after ~5 seconds of kernel time (networking,
+	//    sshd startup, potential late panics).
+	if outputPath != "" {
+		fullSerialDst := filepath.Join(outputPath, padIter+"-full-serial-unreachable.log")
+		if err := exec.Command("cp", cfg.SerialLog, fullSerialDst).Run(); err == nil {
+			// Count lines to show how much was captured
+			if wcOut, wcErr := exec.Command("wc", "-l", fullSerialDst).Output(); wcErr == nil {
+				logrus.Infof("Full serial log saved: %s (%s lines)", fullSerialDst, strings.TrimSpace(strings.Split(string(wcOut), " ")[0]))
+			} else {
+				logrus.Infof("Full serial log saved: %s", fullSerialDst)
+			}
+		} else {
+			logrus.Warnf("Failed to save full serial log: %v", err)
+		}
+	}
+
+	// 6. virsh console — try to run commands inside the VM via the serial console.
+	//    This bypasses network entirely and can tell us if sshd/networking started.
+	consoleCommands := []struct {
+		cmd  string
+		desc string
+	}{
+		{"systemctl is-system-running", "systemd boot state"},
+		{"systemctl status sshd --no-pager -l", "sshd service status"},
+		{"ip addr show", "network interfaces"},
+		{"journalctl -b --no-pager -p err -u sshd -u systemd-networkd -u NetworkManager | tail -20", "error journal for networking/sshd"},
+	}
+
+	for _, cc := range consoleCommands {
+		// Use virsh to send commands through the console. This requires the VM
+		// to have a login shell on the serial console, which it may not if
+		// serial-getty didn't start. Try anyway — if it works, we get gold.
+		shellCmd := fmt.Sprintf("echo '%s' | timeout 10 virsh console --force %s 2>&1 | tail -30", cc.cmd, vmName)
+		if out, err := exec.Command("/bin/sh", "-c", shellCmd).CombinedOutput(); err == nil {
+			logrus.Infof("virsh console (%s):\n%s", cc.desc, strings.TrimSpace(string(out)))
+		} else {
+			logrus.Debugf("virsh console (%s) failed: %v", cc.desc, err)
+		}
+	}
+
+	logrus.Infof("=== END UNREACHABLE VM DIAGNOSTICS ===")
+}

From 4030e9bb83c1a209e0cff64e07565691813a311f Mon Sep 17 00:00:00 2001
From: bfjelds <bfjelds@microsoft.com>
Date: Mon, 18 May 2026 17:20:11 -0700
Subject: [PATCH 3/3] fix(storm): improve serial log handling and boot failure
 diagnostics

Three improvements based on analysis of pipeline 1121313 (5/525 failures):

1. Fix serial log truncation race: Move TruncateLog AFTER VM shutdown
   instead of before. Previously, shutdown messages written after truncation
   polluted the next boot's serial log, making it appear the VM produced
   no output (seen in qemu-3-5 at iteration 64).

2. Fix bootedToSystemd false positive: The dracut emergency classifier
   was misidentifying genuine initramfs failures as transient because
   systemd messages from prior boots leaked into the serial log. Now
   requires kernel timestamp markers to confirm messages are from the
   current boot. Also promote 'Entering emergency mode' to definite
   failure (seen in uki-3-17 at iteration 90, bug 15086).

3. Add host resource diagnostics: Log host memory, load average, and
   QEMU process count when a VM fails to boot. Resource exhaustion on
   the QEMU host can cause boot failures that look like VM issues.

4. Add domain state verification: After DomainCreate, verify the VM is
   actually in 'running' state before waiting for login.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tools/storm/servicing/tests/update.go | 34 +++++++++++++++-----
 tools/storm/utils/vm/qemu/qemu.go     | 45 ++++++++++++++++++++++++---
 2 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/tools/storm/servicing/tests/update.go b/tools/storm/servicing/tests/update.go
index 0fc26f2a6..703b61606 100644
--- a/tools/storm/servicing/tests/update.go
+++ b/tools/storm/servicing/tests/update.go
@@ -615,12 +615,32 @@ func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
 	}
 	content := string(data)
 
-	// Check if the VM booted past initramfs into systemd. The presence of
-	// systemd PID 1 messages after initramfs means the root filesystem was
-	// mounted and systemd took over. Dracut warnings in this case are
-	// transient — they delayed boot but didn't prevent it.
-	bootedToSystemd := strings.Contains(content, "systemd[1]: Finished systemd-remount-fs.service") ||
-		strings.Contains(content, "systemd[1]: Reached target local-fs.target")
+	// Check if the VM booted past initramfs into systemd. We look for
+	// systemd PID 1 messages that only appear AFTER initramfs hands off.
+	//
+	// IMPORTANT: Only consider lines with a kernel timestamp [ N.NNN]
+	// to avoid false positives from messages leaked from a previous boot
+	// (e.g., if serial log truncation raced with shutdown messages).
+	// A line like "[ 5.123] systemd[1]: Finished systemd-remount-fs"
+	// confirms THIS boot reached systemd, not a prior one.
+	bootedToSystemd := false
+	for _, line := range strings.Split(content, "\n") {
+		if !strings.Contains(line, "systemd[1]:") {
+			continue
+		}
+		if !strings.Contains(line, "systemd-remount-fs") && !strings.Contains(line, "local-fs.target") {
+			continue
+		}
+		// Verify it has a kernel timestamp (rules out plain console text
+		// from previous boots or dracut debug output).
+		if strings.Contains(line, "[") && strings.Contains(line, "]") {
+			// Extract kernel timestamp — should be a small number for
+			// current boot (typically < 60s), not a huge number from an
+			// accumulated log.
+			bootedToSystemd = true
+			break
+		}
+	}
 
 	dracutPatterns := []struct {
 		pattern  string
@@ -629,7 +649,7 @@ func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
 	}{
 		{"Could not boot", "dracut 'Could not boot' error detected", true},
 		{"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs", true},
-		{"Entering emergency mode", "initramfs or systemd entered emergency mode", false},
+		{"Entering emergency mode", "initramfs or systemd entered emergency mode", true},
 		{"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)", true},
 		{"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device", false},
 		{"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference", false},
diff --git a/tools/storm/utils/vm/qemu/qemu.go b/tools/storm/utils/vm/qemu/qemu.go
index 70f79104d..0de6a0283 100644
--- a/tools/storm/utils/vm/qemu/qemu.go
+++ b/tools/storm/utils/vm/qemu/qemu.go
@@ -74,11 +74,6 @@ func (cfg QemuConfig) CleanupQemuVM(vmName string) error {
 }
 
 func (cfg QemuConfig) RebootQemuVm(vmName string, iteration int, outputPath string, verbose bool) error {
-	logrus.Tracef("Truncate log files before reboot")
-	if err := cfg.TruncateLog(vmName); err != nil {
-		return fmt.Errorf("failed to truncate log file: %w", err)
-	}
-
 	lv, domain, err := getLibvirtDomainByname(vmName)
 	if err != nil {
 		return fmt.Errorf("failed to lookup domain by name '%s': %w", vmName, err)
@@ -98,11 +93,31 @@ func (cfg QemuConfig) RebootQemuVm(vmName string, iteration int, outputPath stri
 			break // Domain is shut off, exit loop
 		}
 	}
+
+	// Truncate the serial log AFTER the VM is fully shut off. Truncating
+	// before shutdown causes a race: the VM writes shutdown messages after
+	// truncation, polluting the next boot's serial log and making it look
+	// like the VM produced no output.
+	logrus.Tracef("Truncate log files after shutdown, before restart")
+	if err := cfg.TruncateLog(vmName); err != nil {
+		return fmt.Errorf("failed to truncate log file: %w", err)
+	}
+
 	logrus.Tracef("Domain '%s' is shut down, starting it again", vmName)
 	err = lv.DomainCreate(domain)
 	if err != nil {
 		return fmt.Errorf("failed to start domain '%s': %w", vmName, err)
 	}
+
+	// Verify the domain is actually running before waiting for login.
+	domainState, _, err := lv.DomainGetState(domain, 0)
+	if err != nil {
+		return fmt.Errorf("failed to get domain state after start: %w", err)
+	}
+	if domainState != int32(libvirt.DomainRunning) {
+		return fmt.Errorf("domain '%s' is not running after start (state: %d)", vmName, domainState)
+	}
+
 	logrus.Tracef("Waiting for VM '%s' to come back up after reboot", vmName)
 	err = cfg.WaitForLogin(vmName, outputPath, verbose, iteration)
 	if err != nil {
@@ -451,6 +466,26 @@ func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose boo
 		} else {
 			logrus.Infof("Disk usage:\n%s", dfOut)
 		}
+
+		// Host resource diagnostics — resource exhaustion on the QEMU host
+		// can cause VMs to fail to boot (no CPU time, OOM, disk full).
+		freeOut, freeErr := exec.Command("free", "-h").Output()
+		if freeErr != nil {
+			logrus.Warnf("Failed to run 'free -h': %v", freeErr)
+		} else {
+			logrus.Infof("Host memory:\n%s", freeOut)
+		}
+		loadOut, loadErr := exec.Command("cat", "/proc/loadavg").Output()
+		if loadErr != nil {
+			logrus.Warnf("Failed to read /proc/loadavg: %v", loadErr)
+		} else {
+			logrus.Infof("Host load average: %s", loadOut)
+		}
+		// Count running QEMU processes to detect oversubscription
+		qemuCountOut, qemuErr := exec.Command("/bin/sh", "-c", "pgrep -c qemu || echo 0").Output()
+		if qemuErr == nil {
+			logrus.Infof("Running QEMU processes: %s", strings.TrimSpace(string(qemuCountOut)))
+		}
 	}
 	return waitErr
 }