diff --git a/tools/storm/servicing/tests/update.go b/tools/storm/servicing/tests/update.go
index 5cf6a1f37..703b61606 100644
--- a/tools/storm/servicing/tests/update.go
+++ b/tools/storm/servicing/tests/update.go
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"strings"
+	"sync"
 	"time"
 	stormsvcconfig "tridenttools/storm/servicing/utils/config"
 	stormutils "tridenttools/storm/utils"
@@ -18,22 +19,23 @@ import (
 	stormvm "tridenttools/storm/utils/vm"
 	stormvmconfig "tridenttools/storm/utils/vm/config"
 
+	"github.com/microsoft/storm"
 	"github.com/sirupsen/logrus"
 	"gopkg.in/yaml.v2"
 )
 
-func UpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
-	return innerUpdateLoop(testConfig, vmConfig, false)
+func UpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
+	return innerUpdateLoop(tc, testConfig, vmConfig, false)
 }
 
-func Rollback(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
-	return innerUpdateLoop(testConfig, vmConfig, true)
+func Rollback(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error {
+	return innerUpdateLoop(tc, testConfig, vmConfig, true)
 }
 
-func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error {
-	// Create context to ensure goroutines exit cleanly
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
+func innerUpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error {
+	// Use the framework-provided context. Storm cancels this when the test
+	// case finishes, which automatically terminates any child goroutines.
+	ctx := tc.Context()
 
 	logrus.Tracef("Stop existing update servers if any")
 	// Kill any running update servers
@@ -63,11 +65,20 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 	cosiFileBase := cosiFile[strings.LastIndex(cosiFile, "/")+1:]
 
 	logrus.Tracef("Start update servers (netlisten)")
-	// Start update servers (netlisten)
+	// Start update servers (netlisten) — tracked via the framework WaitGroup
+	// so storm waits for them to exit when the test case closes.
 	aStartedChannel := make(chan bool)
-	go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel)
+	tc.BackgroundWaitGroup().Add(1)
+	go func() {
+		defer tc.BackgroundWaitGroup().Done()
+		stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel)
+	}()
 	bStartedChannel := make(chan bool)
-	go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel)
+	tc.BackgroundWaitGroup().Add(1)
+	go func() {
+		defer tc.BackgroundWaitGroup().Done()
+		stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel)
+	}()
 	// Wait for both update servers to start
 	<-aStartedChannel
 	<-bStartedChannel
@@ -189,24 +200,50 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 		}
 
 		logrus.Tracef("Setting up SSH proxy ports for update servers")
-		aStartedChannel := make(chan bool)
-		go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel)
-		bStartedChannel := make(chan bool)
-		go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel)
+		// SSH reverse tunnels are needed only during staging and finalize (COSI download).
+		// Use a per-iteration context so proxies are cleanly stopped after finalize,
+		// preventing goroutine accumulation across iterations.
+		proxyCtx, proxyCancel := context.WithCancel(ctx)
+		var proxyWg sync.WaitGroup
+		proxyCleanedUp := false
+		cleanupProxies := func() {
+			if proxyCleanedUp {
+				return
+			}
+			proxyCleanedUp = true
+			proxyCancel()
+			proxyWg.Wait()
+		}
+
+		proxyWg.Add(2)
+		aStartedChannel := make(chan bool, 1)
+		go func() {
+			defer proxyWg.Done()
+			stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel)
+		}()
+		bStartedChannel := make(chan bool, 1)
+		go func() {
+			defer proxyWg.Done()
+			stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel)
+		}()
 		// Wait for both SSH proxy ports to be ready
-		<-aStartedChannel
-		<-bStartedChannel
+		if !<-aStartedChannel || !<-bStartedChannel {
+			cleanupProxies()
+			return fmt.Errorf("failed to start SSH proxy ports for iteration %d", i)
+		}
 
 		logrus.Tracef("Checking for crash dumps on host")
 		crashDumpOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "ls /var/crash/*")
 		if err == nil {
 			logrus.Debugf("Crash files found on host during iteration %d: %s", i, crashDumpOutput)
 			logrus.Error("Crash files found on host")
+			cleanupProxies()
 			return fmt.Errorf("crash files found on host during iteration %d", i)
 		}
 
 		if rollback && i == 1 {
 			if err := prepareRollback(vmConfig, vmIP, updateConfig, expectedVolume, i); err != nil {
+				cleanupProxies()
 				return fmt.Errorf("failed to prepare rollback for iteration %d: %w", i, err)
 			}
 		}
@@ -232,13 +269,16 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 
 		stageLogLocalTmpFile, err := os.CreateTemp("", "staged-trident-full")
 		if err != nil {
+			cleanupProxies()
 			return fmt.Errorf("failed to create temp staging log file: %w", err)
 		}
 		stageLogLocalTmpPath := stageLogLocalTmpFile.Name()
-		defer os.Remove(stageLogLocalTmpPath)
+		stageLogLocalTmpFile.Close()
 
 		err = stormssh.ScpDownloadFile(vmConfig.VMConfig, vmIP, "/var/log/trident-full.log", stageLogLocalTmpPath)
 		if err != nil {
+			os.Remove(stageLogLocalTmpPath)
+			cleanupProxies()
 			return fmt.Errorf("failed to download staged trident log: %w", err)
 		}
 
@@ -246,6 +286,8 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 			logrus.Tracef("Download staging trident logs for iteration %d", i)
 			stageLogPath := filepath.Join(testConfig.OutputPath, fmt.Sprintf("%s-staged-trident-full.log", fmt.Sprintf("%03d", i)))
 			if err := exec.Command("cp", stageLogLocalTmpPath, stageLogPath).Run(); err != nil {
+				os.Remove(stageLogLocalTmpPath)
+				cleanupProxies()
 				return fmt.Errorf("failed to copy staged trident log to output path: %w", err)
 			}
 			if err := os.Chmod(stageLogPath, 0644); err != nil {
@@ -257,6 +299,8 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 		}
 
 		if stageErr != nil {
+			os.Remove(stageLogLocalTmpPath)
+			cleanupProxies()
 			if egrepOut, err := exec.Command("/bin/sh", "-c", fmt.Sprintf("grep 'target is busy' %s | grep umount", stageLogLocalTmpPath)).CombinedOutput(); err == nil {
 				// Check for known unmount failure and signal
 				logrus.Errorf("umount failure (iteration %d: %v): %s", i, stageErr, egrepOut)
@@ -273,6 +317,9 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 			return fmt.Errorf("no update servicing required (iteration %d: %v)", i, stageErr)
 		}
 
+		// Clean up staging temp file now rather than accumulating defers
+		os.Remove(stageLogLocalTmpPath)
+
 		logrus.Tracef("Running Trident update finalize command on VM")
 
 		// Capture "uptime --since" before the finalize reboot. After reboot, the SSH
@@ -303,6 +350,11 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi
 			logrus.Tracef("Finalize output for iteration %d:\n%s\n%v", i, combinedFinalizeOutput, finalizeErr)
 		}
 
+		// SSH reverse tunnels are no longer needed — finalize re-reads the COSI
+		// file, so tunnels must stay alive through finalize. Stop them now before
+		// the reboot to avoid orphaned SSH processes.
+		cleanupProxies()
+
 		logrus.Tracef("Wait for VM to come back up after finalize reboot")
 		if vmConfig.VMConfig.Platform == stormvmconfig.PlatformQEMU {
 			err := vmConfig.QemuConfig.WaitForLogin(vmConfig.VMConfig.Name, testConfig.OutputPath, testConfig.Verbose, i)
@@ -546,8 +598,12 @@ func validateRollback(cfg stormvmconfig.VMConfig, vmIP string) error {
 }
 
 // checkSerialLogForDracutIssues scans the serial log for patterns that indicate
-// initramfs is stuck waiting for a device, which is the symptom of bug 15086
-// (stale disk UUIDs embedded in initramfs by dracut).
+// initramfs had issues. It distinguishes between:
+//   - Genuine initramfs failures (emergency shell, timeout) where the VM never
+//     reached systemd
+//   - Transient initramfs warnings that resolved (dracut-initqueue delay) where
+//     the VM booted past initramfs into systemd but may be unreachable for other
+//     reasons (e.g., sshd not started, network issue)
 func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
 	if serialLogPath == "" {
 		return
@@ -559,21 +615,61 @@ func checkSerialLogForDracutIssues(serialLogPath string, iteration int) {
 	}
 	content := string(data)
 
+	// Check if the VM booted past initramfs into systemd. We look for
+	// systemd PID 1 messages that only appear AFTER initramfs hands off.
+	//
+	// IMPORTANT: Only consider lines with a kernel timestamp [ N.NNN]
+	// to avoid false positives from messages leaked from a previous boot
+	// (e.g., if serial log truncation raced with shutdown messages).
+	// A line like "[ 5.123] systemd[1]: Finished systemd-remount-fs"
+	// confirms THIS boot reached systemd, not a prior one.
+	bootedToSystemd := false
+	for _, line := range strings.Split(content, "\n") {
+		if !strings.Contains(line, "systemd[1]:") {
+			continue
+		}
+		if !strings.Contains(line, "systemd-remount-fs") && !strings.Contains(line, "local-fs.target") {
+			continue
+		}
+		// Verify it has a kernel timestamp (rules out plain console text
+		// from previous boots or dracut debug output).
+		if strings.Contains(line, "[") && strings.Contains(line, "]") {
+			// Extract kernel timestamp — should be a small number for
+			// current boot (typically < 60s), not a huge number from an
+			// accumulated log.
+			bootedToSystemd = true
+			break
+		}
+	}
+
 	dracutPatterns := []struct {
-		pattern string
-		message string
+		pattern  string
+		message  string
+		definite bool // true = definitely stuck in initramfs (not a transient warning)
 	}{
-		{"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device"},
-		{"Could not boot", "dracut 'Could not boot' error detected"},
-		{"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs"},
-		{"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference"},
-		{"rd.break", "rd.break detected — initramfs dropped to debug shell"},
-		{"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)"},
+		{"Could not boot", "dracut 'Could not boot' error detected", true},
+		{"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs", true},
+		{"Entering emergency mode", "initramfs or systemd entered emergency mode", true},
+		{"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)", true},
+		{"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device", false},
+		{"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference", false},
+		{"rd.break", "rd.break detected — initramfs dropped to debug shell", false},
 	}
 
+	matchCount := 0
 	for _, dp := range dracutPatterns {
 		if strings.Contains(content, dp.pattern) {
-			logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
+			matchCount++
+			if bootedToSystemd && !dp.definite {
+				// VM booted past initramfs — dracut warning was transient.
+				logrus.Warnf("INITRAMFS NOTE (iteration %d): %s — but VM booted past initramfs into systemd (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
+			} else {
+				logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern)
+			}
 		}
 	}
+
+	if matchCount > 0 && bootedToSystemd {
+		logrus.Warnf("INITRAMFS SUMMARY (iteration %d): VM reached systemd despite %d dracut warning(s) — failure is likely SSH/network unreachability, not initramfs", iteration, matchCount)
+	}
 }
diff --git a/tools/storm/servicing/trident.go b/tools/storm/servicing/trident.go
index d72692913..e0ae5c9ed 100644
--- a/tools/storm/servicing/trident.go
+++ b/tools/storm/servicing/trident.go
@@ -78,11 +78,19 @@ func (h *TridentServicingScenario) Cleanup(ctx storm.SetupCleanupContext) error
 }
 
 func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func(stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error {
+	return h.runTestCaseWithContext(tc, func(_ storm.TestCase, cfg stormsvcconfig.TestConfig, vm stormvmconfig.AllVMConfig) error {
+		return testFunc(cfg, vm)
+	})
+}
+
+// runTestCaseWithContext passes the storm.TestCase to the test function,
+// giving it access to tc.Context() and tc.BackgroundWaitGroup() for proper
+// goroutine lifecycle management.
+func (h *TridentServicingScenario) runTestCaseWithContext(tc storm.TestCase, testFunc func(storm.TestCase, stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error {
 	if tc.Name() != h.args.TestCaseToRun && h.args.TestCaseToRun != "all" {
 		tc.Skip(fmt.Sprintf("Test case '%s' does not align to TestCaseToRun '%s'", tc.Name(), h.args.TestCaseToRun))
 	} else {
 		logrus.Infof("Running test case '%s'", tc.Name())
-		// create test-specific output directory
 		testCaseSpecificConfig := h.args.TestConfig
 		testCaseSpecificConfig.OutputPath = h.args.TestConfig.OutputPath
 		if testCaseSpecificConfig.OutputPath != "" {
@@ -92,6 +100,7 @@ func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func(
 			}
 		}
 		err := testFunc(
+			tc,
 			testCaseSpecificConfig,
 			stormvmconfig.AllVMConfig{
 				VMConfig:    h.args.VMConfig,
@@ -117,7 +126,7 @@ func (h *TridentServicingScenario) checkDeployment(tc storm.TestCase) error {
 }
 
 func (h *TridentServicingScenario) updateLoop(tc storm.TestCase) error {
-	return h.runTestCase(tc, stormsvctests.UpdateLoop)
+	return h.runTestCaseWithContext(tc, stormsvctests.UpdateLoop)
 }
 
 func (h *TridentServicingScenario) rollback(tc storm.TestCase) error {
@@ -125,7 +134,7 @@ func (h *TridentServicingScenario) rollback(tc storm.TestCase) error {
 		tc.Skip("Test case 'rollback' is skipped because rollback testing is disabled")
 		return nil // No action needed if rollback is not enabled
 	}
-	return h.runTestCase(tc, stormsvctests.Rollback)
+	return h.runTestCaseWithContext(tc, stormsvctests.Rollback)
 }
 
 func (h *TridentServicingScenario) collectLogs(tc storm.TestCase) error {
diff --git a/tools/storm/utils/ssh/ssh.go b/tools/storm/utils/ssh/ssh.go
index 9cc5e9184..d5e3b470c 100644
--- a/tools/storm/utils/ssh/ssh.go
+++ b/tools/storm/utils/ssh/ssh.go
@@ -2,6 +2,7 @@ package ssh
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -40,6 +41,7 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse
 		"-o", "ServerAliveInterval=5",
 		"-o", "StrictHostKeyChecking=no",
 		"-o", "UserKnownHostsFile=/dev/null",
+		"-o", "ExitOnForwardFailure=yes",
 		"-i", sshKeyPath,
 		fmt.Sprintf("%s@%s", sshUser, vmIP),
 	)
@@ -48,12 +50,37 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse
 
 	logrus.Tracef("Starting SSH proxy for port %d to VM %s with user %s", port, vmIP, sshUser)
 	if err := cmd.Start(); err != nil {
+		startedChannel <- false
 		return fmt.Errorf("failed to start SSH proxy for port %d: %w", port, err)
 	}
-	// Signal that the SSH proxy has started
-	startedChannel <- true
-	// Wait for the command to finish
-	if err := cmd.Wait(); err != nil {
+
+	// With ExitOnForwardFailure=yes, SSH exits immediately if the remote port
+	// forward fails. Brief pause to catch that before signaling readiness.
+	earlyExit := make(chan error, 1)
+	go func() { earlyExit <- cmd.Wait() }()
+
+	select {
+	case err := <-earlyExit:
+		// SSH exited before we could signal readiness — forward setup failed.
+		startedChannel <- false
+		if err != nil {
+			return fmt.Errorf("SSH proxy for port %d exited immediately (forward setup failed): %w", port, err)
+		}
+		return fmt.Errorf("SSH proxy for port %d exited immediately", port)
+	case <-time.After(1 * time.Second):
+		// SSH still running after 1s — forward is likely established.
+		startedChannel <- true
+	}
+
+	// Wait for the command to finish (blocks until context cancel or SSH dies).
+	err := <-earlyExit
+	if err != nil {
+		// Context cancellation is expected during cleanup — only swallow
+		// Canceled, not DeadlineExceeded (which indicates a real timeout).
+		if errors.Is(ctx.Err(), context.Canceled) {
+			logrus.Tracef("SSH proxy for port %d stopped (context cancelled)", port)
+			return nil
+		}
 		return fmt.Errorf("SSH proxy for port %d failed: %w", port, err)
 	}
 	logrus.Tracef("SSH proxy for port %d exited", port)
diff --git a/tools/storm/utils/vm/qemu/qemu.go b/tools/storm/utils/vm/qemu/qemu.go
index fcec985b9..0de6a0283 100644
--- a/tools/storm/utils/vm/qemu/qemu.go
+++ b/tools/storm/utils/vm/qemu/qemu.go
@@ -74,11 +74,6 @@ func (cfg QemuConfig) CleanupQemuVM(vmName string) error {
 }
 
 func (cfg QemuConfig) RebootQemuVm(vmName string, iteration int, outputPath string, verbose bool) error {
-	logrus.Tracef("Truncate log files before reboot")
-	if err := cfg.TruncateLog(vmName); err != nil {
-		return fmt.Errorf("failed to truncate log file: %w", err)
-	}
-
 	lv, domain, err := getLibvirtDomainByname(vmName)
 	if err != nil {
 		return fmt.Errorf("failed to lookup domain by name '%s': %w", vmName, err)
@@ -98,11 +93,31 @@ func (cfg QemuConfig) RebootQemuVm(vmName string, iteration int, outputPath stri
 			break // Domain is shut off, exit loop
 		}
 	}
+
+	// Truncate the serial log AFTER the VM is fully shut off. Truncating
+	// before shutdown causes a race: the VM writes shutdown messages after
+	// truncation, polluting the next boot's serial log and making it look
+	// like the VM produced no output.
+	logrus.Tracef("Truncate log files after shutdown, before restart")
+	if err := cfg.TruncateLog(vmName); err != nil {
+		return fmt.Errorf("failed to truncate log file: %w", err)
+	}
+
 	logrus.Tracef("Domain '%s' is shut down, starting it again", vmName)
 	err = lv.DomainCreate(domain)
 	if err != nil {
 		return fmt.Errorf("failed to start domain '%s': %w", vmName, err)
 	}
+
+	// Verify the domain is actually running before waiting for login.
+	domainState, _, err := lv.DomainGetState(domain, 0)
+	if err != nil {
+		return fmt.Errorf("failed to get domain state after start: %w", err)
+	}
+	if domainState != int32(libvirt.DomainRunning) {
+		return fmt.Errorf("domain '%s' is not running after start (state: %d)", vmName, domainState)
+	}
+
 	logrus.Tracef("Waiting for VM '%s' to come back up after reboot", vmName)
 	err = cfg.WaitForLogin(vmName, outputPath, verbose, iteration)
 	if err != nil {
@@ -342,6 +357,11 @@ func (cfg QemuConfig) TruncateLog(vmName string) error {
 
 func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose bool, iteration int) error {
 	localSerialLog := "./serial.log"
+	// Truncate the local serial log so it only contains output from THIS
+	// iteration's boot, not accumulated output from all previous iterations.
+	if err := os.Truncate(localSerialLog, 0); err != nil && !os.IsNotExist(err) {
+		logrus.Warnf("Failed to truncate local serial log before iteration %d: %v", iteration, err)
+	}
 	// Wait for login prompt to appear in the serial log and save the log to localSerialLog
 	waitErr := innerWaitForLogin(cfg.SerialLog, verbose, iteration, localSerialLog)
 	// Copy serial log to output directory if specified
@@ -413,6 +433,10 @@ func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose boo
 				// VM has a stale DHCP lease but is unreachable — it genuinely
 				// failed to boot (not a serial-getty race condition).
 				logrus.Warnf("VM '%s' has DHCP lease (IP: %s) but SSH port is unreachable — stale lease from previous boot, VM failed to come back up", vmName, ips[0])
+
+				// Gather additional diagnostics to help identify why the VM
+				// is unreachable despite appearing to boot.
+				cfg.captureUnreachableVmDiagnostics(vmName, ips[0], iteration, outputPath)
 			} else {
 				conn.Close()
 				logrus.Warnf("VM '%s' has DHCP lease (IP: %s) and SSH port is reachable — VM booted but serial-getty did not start (systemd race: https://github.com/systemd/systemd/issues/10850)", vmName, ips[0])
@@ -442,6 +466,26 @@ func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose boo
 		} else {
 			logrus.Infof("Disk usage:\n%s", dfOut)
 		}
+
+		// Host resource diagnostics — resource exhaustion on the QEMU host
+		// can cause VMs to fail to boot (no CPU time, OOM, disk full).
+		freeOut, freeErr := exec.Command("free", "-h").Output()
+		if freeErr != nil {
+			logrus.Warnf("Failed to run 'free -h': %v", freeErr)
+		} else {
+			logrus.Infof("Host memory:\n%s", freeOut)
+		}
+		loadOut, loadErr := exec.Command("cat", "/proc/loadavg").Output()
+		if loadErr != nil {
+			logrus.Warnf("Failed to read /proc/loadavg: %v", loadErr)
+		} else {
+			logrus.Infof("Host load average: %s", loadOut)
+		}
+		// Count running QEMU processes to detect oversubscription
+		qemuCountOut, qemuErr := exec.Command("/bin/sh", "-c", "pgrep -c qemu || echo 0").Output()
+		if qemuErr == nil {
+			logrus.Infof("Running QEMU processes: %s", strings.TrimSpace(string(qemuCountOut)))
+		}
 	}
 	return waitErr
 }
@@ -508,3 +552,92 @@ func innerWaitForLogin(vmSerialLog string, verbose bool, iteration int, localSer
 	// host CPU contention from other processes or parallel pipeline tasks.
 	return stormutils.WaitForLoginMessageInSerialLog(vmSerialLog, verbose, iteration, localSerialLog, time.Second*180)
 }
+
+// captureUnreachableVmDiagnostics gathers information about a VM that booted
+// (has a DHCP lease, virsh shows running) but whose SSH port is unreachable.
+// This helps identify whether the issue is networking, sshd, or a late boot
+// failure (e.g. kernel panic after early systemd).
+func (cfg QemuConfig) captureUnreachableVmDiagnostics(vmName string, ip string, iteration int, outputPath string) {
+	padIter := fmt.Sprintf("%03d", iteration)
+	logrus.Infof("=== UNREACHABLE VM DIAGNOSTICS (iteration %d) ===", iteration)
+
+	// 1. virsh domifaddr — check if the VM actually has a network interface with an IP
+	//    (vs relying on the stale DHCP lease from dnsmasq)
+	if domifOut, err := exec.Command("virsh", "domifaddr", vmName, "--source", "agent").CombinedOutput(); err == nil {
+		logrus.Infof("VM interface addresses (guest-agent): %s", domifOut)
+	} else {
+		// Guest agent likely not running; try the lease source
+		if domifOut2, err2 := exec.Command("virsh", "domifaddr", vmName, "--source", "lease").CombinedOutput(); err2 == nil {
+			logrus.Infof("VM interface addresses (lease): %s", domifOut2)
+		} else {
+			logrus.Warnf("virsh domifaddr failed (agent: %v, lease: %v)", err, err2)
+		}
+	}
+
+	// 2. ARP table — check if the host can see the VM's MAC at the expected IP
+	if arpOut, err := exec.Command("ip", "neigh", "show", ip).CombinedOutput(); err == nil {
+		logrus.Infof("ARP entry for %s: %s", ip, strings.TrimSpace(string(arpOut)))
+	}
+
+	// 3. Ping — can we reach the VM at the network layer?
+	if pingOut, err := exec.Command("ping", "-c", "3", "-W", "2", ip).CombinedOutput(); err == nil {
+		logrus.Infof("Ping %s: reachable", ip)
+	} else {
+		logrus.Warnf("Ping %s: unreachable — %s", ip, strings.TrimSpace(string(pingOut)))
+	}
+
+	// 4. Port scan — check if any port is open (not just 22)
+	for _, port := range []int{22, 80, 443, 111} {
+		addr := fmt.Sprintf("%s:%d", ip, port)
+		conn, err := net.DialTimeout("tcp", addr, 3*time.Second)
+		if err == nil {
+			conn.Close()
+			logrus.Infof("TCP port %d: OPEN", port)
+		} else {
+			logrus.Infof("TCP port %d: closed/filtered", port)
+		}
+	}
+
+	// 5. Full serial log — save the ENTIRE serial log, not just the last 100 lines.
+	//    The tail misses everything after ~5 seconds of kernel time (networking,
+	//    sshd startup, potential late panics).
+	if outputPath != "" {
+		fullSerialDst := filepath.Join(outputPath, padIter+"-full-serial-unreachable.log")
+		if err := exec.Command("cp", cfg.SerialLog, fullSerialDst).Run(); err == nil {
+			// Count lines to show how much was captured
+			if wcOut, wcErr := exec.Command("wc", "-l", fullSerialDst).Output(); wcErr == nil {
+				logrus.Infof("Full serial log saved: %s (%s lines)", fullSerialDst, strings.TrimSpace(strings.Split(string(wcOut), " ")[0]))
+			} else {
+				logrus.Infof("Full serial log saved: %s", fullSerialDst)
+			}
+		} else {
+			logrus.Warnf("Failed to save full serial log: %v", err)
+		}
+	}
+
+	// 6. virsh console — try to run commands inside the VM via the serial console.
+	//    This bypasses network entirely and can tell us if sshd/networking started.
+	consoleCommands := []struct {
+		cmd  string
+		desc string
+	}{
+		{"systemctl is-system-running", "systemd boot state"},
+		{"systemctl status sshd --no-pager -l", "sshd service status"},
+		{"ip addr show", "network interfaces"},
+		{"journalctl -b --no-pager -p err -u sshd -u systemd-networkd -u NetworkManager | tail -20", "error journal for networking/sshd"},
+	}
+
+	for _, cc := range consoleCommands {
+		// Use virsh to send commands through the console. This requires the VM
+		// to have a login shell on the serial console, which it may not if
+		// serial-getty didn't start. Try anyway — if it works, we get gold.
+		shellCmd := fmt.Sprintf("echo '%s' | timeout 10 virsh console --force %s 2>&1 | tail -30", cc.cmd, vmName)
+		if out, err := exec.Command("/bin/sh", "-c", shellCmd).CombinedOutput(); err == nil {
+			logrus.Infof("virsh console (%s):\n%s", cc.desc, strings.TrimSpace(string(out)))
+		} else {
+			logrus.Debugf("virsh console (%s) failed: %v", cc.desc, err)
+		}
+	}
+
+	logrus.Infof("=== END UNREACHABLE VM DIAGNOSTICS ===")
+}