diff --git a/tools/storm/servicing/tests/update.go b/tools/storm/servicing/tests/update.go index 5cf6a1f37..703b61606 100644 --- a/tools/storm/servicing/tests/update.go +++ b/tools/storm/servicing/tests/update.go @@ -8,6 +8,7 @@ import ( "path/filepath" "regexp" "strings" + "sync" "time" stormsvcconfig "tridenttools/storm/servicing/utils/config" stormutils "tridenttools/storm/utils" @@ -18,22 +19,23 @@ import ( stormvm "tridenttools/storm/utils/vm" stormvmconfig "tridenttools/storm/utils/vm/config" + "github.com/microsoft/storm" "github.com/sirupsen/logrus" "gopkg.in/yaml.v2" ) -func UpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error { - return innerUpdateLoop(testConfig, vmConfig, false) +func UpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error { + return innerUpdateLoop(tc, testConfig, vmConfig, false) } -func Rollback(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error { - return innerUpdateLoop(testConfig, vmConfig, true) +func Rollback(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig) error { + return innerUpdateLoop(tc, testConfig, vmConfig, true) } -func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error { - // Create context to ensure goroutines exit cleanly - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() +func innerUpdateLoop(tc storm.TestCase, testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, rollback bool) error { + // Use the framework-provided context. Storm cancels this when the test + // case finishes, which automatically terminates any child goroutines. + ctx := tc.Context() logrus.Tracef("Stop existing update servers if any") // Kill any running update servers @@ -63,11 +65,20 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi cosiFileBase := cosiFile[strings.LastIndex(cosiFile, "/")+1:] logrus.Tracef("Start update servers (netlisten)") - // Start update servers (netlisten) + // Start update servers (netlisten) — tracked via the framework WaitGroup + // so storm waits for them to exit when the test case closes. aStartedChannel := make(chan bool) - go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel) + tc.BackgroundWaitGroup().Add(1) + go func() { + defer tc.BackgroundWaitGroup().Done() + stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortA, fmt.Sprintf("%s/update-a", testConfig.ArtifactsDir), "logstream-full-update-a.log", aStartedChannel) + }() bStartedChannel := make(chan bool) - go stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel) + tc.BackgroundWaitGroup().Add(1) + go func() { + defer tc.BackgroundWaitGroup().Done() + stormnetlisten.StartNetListenAndWait(ctx, testConfig.UpdatePortB, fmt.Sprintf("%s/update-b", testConfig.ArtifactsDir), "logstream-full-update-b.log", bStartedChannel) + }() // Wait for both update servers to start <-aStartedChannel <-bStartedChannel @@ -189,24 +200,50 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi } logrus.Tracef("Setting up SSH proxy ports for update servers") - aStartedChannel := make(chan bool) - go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel) - bStartedChannel := make(chan bool) - go stormssh.StartSshProxyPortAndWait(ctx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel) + // SSH reverse tunnels are needed only during staging and finalize (COSI download). + // Use a per-iteration context so proxies are cleanly stopped after finalize, + // preventing goroutine accumulation across iterations. + proxyCtx, proxyCancel := context.WithCancel(ctx) + var proxyWg sync.WaitGroup + proxyCleanedUp := false + cleanupProxies := func() { + if proxyCleanedUp { + return + } + proxyCleanedUp = true + proxyCancel() + proxyWg.Wait() + } + + proxyWg.Add(2) + aStartedChannel := make(chan bool, 1) + go func() { + defer proxyWg.Done() + stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortA, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, aStartedChannel) + }() + bStartedChannel := make(chan bool, 1) + go func() { + defer proxyWg.Done() + stormssh.StartSshProxyPortAndWait(proxyCtx, testConfig.UpdatePortB, vmIP, vmConfig.VMConfig.User, vmConfig.VMConfig.SshPrivateKeyPath, bStartedChannel) + }() // Wait for both SSH proxy ports to be ready - <-aStartedChannel - <-bStartedChannel + if !<-aStartedChannel || !<-bStartedChannel { + cleanupProxies() + return fmt.Errorf("failed to start SSH proxy ports for iteration %d", i) + } logrus.Tracef("Checking for crash dumps on host") crashDumpOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "ls /var/crash/*") if err == nil { logrus.Debugf("Crash files found on host during iteration %d: %s", i, crashDumpOutput) logrus.Error("Crash files found on host") + cleanupProxies() return fmt.Errorf("crash files found on host during iteration %d", i) } if rollback && i == 1 { if err := prepareRollback(vmConfig, vmIP, updateConfig, expectedVolume, i); err != nil { + cleanupProxies() return fmt.Errorf("failed to prepare rollback for iteration %d: %w", i, err) } } @@ -232,13 +269,16 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi stageLogLocalTmpFile, err := os.CreateTemp("", "staged-trident-full") if err != nil { + cleanupProxies() return fmt.Errorf("failed to create temp staging log file: %w", err) } stageLogLocalTmpPath := stageLogLocalTmpFile.Name() - defer os.Remove(stageLogLocalTmpPath) + stageLogLocalTmpFile.Close() err = stormssh.ScpDownloadFile(vmConfig.VMConfig, vmIP, "/var/log/trident-full.log", stageLogLocalTmpPath) if err != nil { + os.Remove(stageLogLocalTmpPath) + cleanupProxies() return fmt.Errorf("failed to download staged trident log: %w", err) } @@ -246,6 +286,8 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi logrus.Tracef("Download staging trident logs for iteration %d", i) stageLogPath := filepath.Join(testConfig.OutputPath, fmt.Sprintf("%s-staged-trident-full.log", fmt.Sprintf("%03d", i))) if err := exec.Command("cp", stageLogLocalTmpPath, stageLogPath).Run(); err != nil { + os.Remove(stageLogLocalTmpPath) + cleanupProxies() return fmt.Errorf("failed to copy staged trident log to output path: %w", err) } if err := os.Chmod(stageLogPath, 0644); err != nil { @@ -257,6 +299,8 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi } if stageErr != nil { + os.Remove(stageLogLocalTmpPath) + cleanupProxies() if egrepOut, err := exec.Command("/bin/sh", "-c", fmt.Sprintf("grep 'target is busy' %s | grep umount", stageLogLocalTmpPath)).CombinedOutput(); err == nil { // Check for known unmount failure and signal logrus.Errorf("umount failure (iteration %d: %v): %s", i, stageErr, egrepOut) @@ -273,6 +317,9 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi return fmt.Errorf("no update servicing required (iteration %d: %v)", i, stageErr) } + // Clean up staging temp file now rather than accumulating defers + os.Remove(stageLogLocalTmpPath) + logrus.Tracef("Running Trident update finalize command on VM") // Capture "uptime --since" before the finalize reboot. After reboot, the SSH @@ -303,6 +350,11 @@ func innerUpdateLoop(testConfig stormsvcconfig.TestConfig, vmConfig stormvmconfi logrus.Tracef("Finalize output for iteration %d:\n%s\n%v", i, combinedFinalizeOutput, finalizeErr) } + // SSH reverse tunnels are no longer needed — finalize re-reads the COSI + // file, so tunnels must stay alive through finalize. Stop them now before + // the reboot to avoid orphaned SSH processes. + cleanupProxies() + logrus.Tracef("Wait for VM to come back up after finalize reboot") if vmConfig.VMConfig.Platform == stormvmconfig.PlatformQEMU { err := vmConfig.QemuConfig.WaitForLogin(vmConfig.VMConfig.Name, testConfig.OutputPath, testConfig.Verbose, i) @@ -546,8 +598,12 @@ func validateRollback(cfg stormvmconfig.VMConfig, vmIP string) error { } // checkSerialLogForDracutIssues scans the serial log for patterns that indicate -// initramfs is stuck waiting for a device, which is the symptom of bug 15086 -// (stale disk UUIDs embedded in initramfs by dracut). +// initramfs had issues. It distinguishes between: +// - Genuine initramfs failures (emergency shell, timeout) where the VM never +// reached systemd +// - Transient initramfs warnings that resolved (dracut-initqueue delay) where +// the VM booted past initramfs into systemd but may be unreachable for other +// reasons (e.g., sshd not started, network issue) func checkSerialLogForDracutIssues(serialLogPath string, iteration int) { if serialLogPath == "" { return @@ -559,21 +615,61 @@ func checkSerialLogForDracutIssues(serialLogPath string, iteration int) { } content := string(data) + // Check if the VM booted past initramfs into systemd. We look for + // systemd PID 1 messages that only appear AFTER initramfs hands off. + // + // IMPORTANT: Only consider lines with a kernel timestamp [ N.NNN] + // to avoid false positives from messages leaked from a previous boot + // (e.g., if serial log truncation raced with shutdown messages). + // A line like "[ 5.123] systemd[1]: Finished systemd-remount-fs" + // confirms THIS boot reached systemd, not a prior one. + bootedToSystemd := false + for _, line := range strings.Split(content, "\n") { + if !strings.Contains(line, "systemd[1]:") { + continue + } + if !strings.Contains(line, "systemd-remount-fs") && !strings.Contains(line, "local-fs.target") { + continue + } + // Verify it has a kernel timestamp (rules out plain console text + // from previous boots or dracut debug output). + if strings.Contains(line, "[") && strings.Contains(line, "]") { + // Extract kernel timestamp — should be a small number for + // current boot (typically < 60s), not a huge number from an + // accumulated log. + bootedToSystemd = true + break + } + } + dracutPatterns := []struct { - pattern string - message string + pattern string + message string + definite bool // true = definitely stuck in initramfs (not a transient warning) }{ - {"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device"}, - {"Could not boot", "dracut 'Could not boot' error detected"}, - {"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs"}, - {"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference"}, - {"rd.break", "rd.break detected — initramfs dropped to debug shell"}, - {"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)"}, + {"Could not boot", "dracut 'Could not boot' error detected", true}, + {"Starting dracut emergency shell", "dracut emergency shell activated — boot failed in initramfs", true}, + {"Entering emergency mode", "initramfs or systemd entered emergency mode", true}, + {"Timed out waiting for device", "dracut timed out waiting for device — likely stale UUID in initramfs (bug 15086)", true}, + {"dracut-initqueue[", "dracut-initqueue warning detected — initramfs may be waiting for a device", false}, + {"Warning: /dev/disk/by", "dracut warning about /dev/disk/by-* path — possible stale UUID reference", false}, + {"rd.break", "rd.break detected — initramfs dropped to debug shell", false}, } + matchCount := 0 for _, dp := range dracutPatterns { if strings.Contains(content, dp.pattern) { - logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern) + matchCount++ + if bootedToSystemd && !dp.definite { + // VM booted past initramfs — dracut warning was transient. + logrus.Warnf("INITRAMFS NOTE (iteration %d): %s — but VM booted past initramfs into systemd (matched '%s' in serial log)", iteration, dp.message, dp.pattern) + } else { + logrus.Errorf("INITRAMFS DIAGNOSTIC (iteration %d): %s (matched '%s' in serial log)", iteration, dp.message, dp.pattern) + } } } + + if matchCount > 0 && bootedToSystemd { + logrus.Warnf("INITRAMFS SUMMARY (iteration %d): VM reached systemd despite %d dracut warning(s) — failure is likely SSH/network unreachability, not initramfs", iteration, matchCount) + } } diff --git a/tools/storm/servicing/trident.go b/tools/storm/servicing/trident.go index d72692913..e0ae5c9ed 100644 --- a/tools/storm/servicing/trident.go +++ b/tools/storm/servicing/trident.go @@ -78,11 +78,19 @@ func (h *TridentServicingScenario) Cleanup(ctx storm.SetupCleanupContext) error } func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func(stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error { + return h.runTestCaseWithContext(tc, func(_ storm.TestCase, cfg stormsvcconfig.TestConfig, vm stormvmconfig.AllVMConfig) error { + return testFunc(cfg, vm) + }) +} + +// runTestCaseWithContext passes the storm.TestCase to the test function, +// giving it access to tc.Context() and tc.BackgroundWaitGroup() for proper +// goroutine lifecycle management. +func (h *TridentServicingScenario) runTestCaseWithContext(tc storm.TestCase, testFunc func(storm.TestCase, stormsvcconfig.TestConfig, stormvmconfig.AllVMConfig) error) error { if tc.Name() != h.args.TestCaseToRun && h.args.TestCaseToRun != "all" { tc.Skip(fmt.Sprintf("Test case '%s' does not align to TestCaseToRun '%s'", tc.Name(), h.args.TestCaseToRun)) } else { logrus.Infof("Running test case '%s'", tc.Name()) - // create test-specific output directory testCaseSpecificConfig := h.args.TestConfig testCaseSpecificConfig.OutputPath = h.args.TestConfig.OutputPath if testCaseSpecificConfig.OutputPath != "" { @@ -92,6 +100,7 @@ func (h *TridentServicingScenario) runTestCase(tc storm.TestCase, testFunc func( } } err := testFunc( + tc, testCaseSpecificConfig, stormvmconfig.AllVMConfig{ VMConfig: h.args.VMConfig, @@ -117,7 +126,7 @@ func (h *TridentServicingScenario) checkDeployment(tc storm.TestCase) error { } func (h *TridentServicingScenario) updateLoop(tc storm.TestCase) error { - return h.runTestCase(tc, stormsvctests.UpdateLoop) + return h.runTestCaseWithContext(tc, stormsvctests.UpdateLoop) } func (h *TridentServicingScenario) rollback(tc storm.TestCase) error { @@ -125,7 +134,7 @@ func (h *TridentServicingScenario) rollback(tc storm.TestCase) error { tc.Skip("Test case 'rollback' is skipped because rollback testing is disabled") return nil // No action needed if rollback is not enabled } - return h.runTestCase(tc, stormsvctests.Rollback) + return h.runTestCaseWithContext(tc, stormsvctests.Rollback) } func (h *TridentServicingScenario) collectLogs(tc storm.TestCase) error { diff --git a/tools/storm/utils/ssh/ssh.go b/tools/storm/utils/ssh/ssh.go index 9cc5e9184..d5e3b470c 100644 --- a/tools/storm/utils/ssh/ssh.go +++ b/tools/storm/utils/ssh/ssh.go @@ -2,6 +2,7 @@ package ssh import ( "context" + "errors" "fmt" "os" "os/exec" @@ -40,6 +41,7 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse "-o", "ServerAliveInterval=5", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", + "-o", "ExitOnForwardFailure=yes", "-i", sshKeyPath, fmt.Sprintf("%s@%s", sshUser, vmIP), ) @@ -48,12 +50,37 @@ func StartSshProxyPortAndWait(ctx context.Context, port int, vmIP string, sshUse logrus.Tracef("Starting SSH proxy for port %d to VM %s with user %s", port, vmIP, sshUser) if err := cmd.Start(); err != nil { + startedChannel <- false return fmt.Errorf("failed to start SSH proxy for port %d: %w", port, err) } - // Signal that the SSH proxy has started - startedChannel <- true - // Wait for the command to finish - if err := cmd.Wait(); err != nil { + + // With ExitOnForwardFailure=yes, SSH exits immediately if the remote port + // forward fails. Brief pause to catch that before signaling readiness. + earlyExit := make(chan error, 1) + go func() { earlyExit <- cmd.Wait() }() + + select { + case err := <-earlyExit: + // SSH exited before we could signal readiness — forward setup failed. + startedChannel <- false + if err != nil { + return fmt.Errorf("SSH proxy for port %d exited immediately (forward setup failed): %w", port, err) + } + return fmt.Errorf("SSH proxy for port %d exited immediately", port) + case <-time.After(1 * time.Second): + // SSH still running after 1s — forward is likely established. + startedChannel <- true + } + + // Wait for the command to finish (blocks until context cancel or SSH dies). + err := <-earlyExit + if err != nil { + // Context cancellation is expected during cleanup — only swallow + // Canceled, not DeadlineExceeded (which indicates a real timeout). + if errors.Is(ctx.Err(), context.Canceled) { + logrus.Tracef("SSH proxy for port %d stopped (context cancelled)", port) + return nil + } return fmt.Errorf("SSH proxy for port %d failed: %w", port, err) } logrus.Tracef("SSH proxy for port %d exited", port) diff --git a/tools/storm/utils/vm/qemu/qemu.go b/tools/storm/utils/vm/qemu/qemu.go index fcec985b9..0de6a0283 100644 --- a/tools/storm/utils/vm/qemu/qemu.go +++ b/tools/storm/utils/vm/qemu/qemu.go @@ -74,11 +74,6 @@ func (cfg QemuConfig) CleanupQemuVM(vmName string) error { } func (cfg QemuConfig) RebootQemuVm(vmName string, iteration int, outputPath string, verbose bool) error { - logrus.Tracef("Truncate log files before reboot") - if err := cfg.TruncateLog(vmName); err != nil { - return fmt.Errorf("failed to truncate log file: %w", err) - } - lv, domain, err := getLibvirtDomainByname(vmName) if err != nil { return fmt.Errorf("failed to lookup domain by name '%s': %w", vmName, err) @@ -98,11 +93,31 @@ func (cfg QemuConfig) RebootQemuVm(vmName string, iteration int, outputPath stri break // Domain is shut off, exit loop } } + + // Truncate the serial log AFTER the VM is fully shut off. Truncating + // before shutdown causes a race: the VM writes shutdown messages after + // truncation, polluting the next boot's serial log and making it look + // like the VM produced no output. + logrus.Tracef("Truncate log files after shutdown, before restart") + if err := cfg.TruncateLog(vmName); err != nil { + return fmt.Errorf("failed to truncate log file: %w", err) + } + logrus.Tracef("Domain '%s' is shut down, starting it again", vmName) err = lv.DomainCreate(domain) if err != nil { return fmt.Errorf("failed to start domain '%s': %w", vmName, err) } + + // Verify the domain is actually running before waiting for login. + domainState, _, err := lv.DomainGetState(domain, 0) + if err != nil { + return fmt.Errorf("failed to get domain state after start: %w", err) + } + if domainState != int32(libvirt.DomainRunning) { + return fmt.Errorf("domain '%s' is not running after start (state: %d)", vmName, domainState) + } + logrus.Tracef("Waiting for VM '%s' to come back up after reboot", vmName) err = cfg.WaitForLogin(vmName, outputPath, verbose, iteration) if err != nil { @@ -342,6 +357,11 @@ func (cfg QemuConfig) TruncateLog(vmName string) error { func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose bool, iteration int) error { localSerialLog := "./serial.log" + // Truncate the local serial log so it only contains output from THIS + // iteration's boot, not accumulated output from all previous iterations. + if err := os.Truncate(localSerialLog, 0); err != nil && !os.IsNotExist(err) { + logrus.Warnf("Failed to truncate local serial log before iteration %d: %v", iteration, err) + } // Wait for login prompt to appear in the serial log and save the log to localSerialLog waitErr := innerWaitForLogin(cfg.SerialLog, verbose, iteration, localSerialLog) // Copy serial log to output directory if specified @@ -413,6 +433,10 @@ func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose boo // VM has a stale DHCP lease but is unreachable — it genuinely // failed to boot (not a serial-getty race condition). logrus.Warnf("VM '%s' has DHCP lease (IP: %s) but SSH port is unreachable — stale lease from previous boot, VM failed to come back up", vmName, ips[0]) + + // Gather additional diagnostics to help identify why the VM + // is unreachable despite appearing to boot. + cfg.captureUnreachableVmDiagnostics(vmName, ips[0], iteration, outputPath) } else { conn.Close() logrus.Warnf("VM '%s' has DHCP lease (IP: %s) and SSH port is reachable — VM booted but serial-getty did not start (systemd race: https://github.com/systemd/systemd/issues/10850)", vmName, ips[0]) @@ -442,6 +466,26 @@ func (cfg QemuConfig) WaitForLogin(vmName string, outputPath string, verbose boo } else { logrus.Infof("Disk usage:\n%s", dfOut) } + + // Host resource diagnostics — resource exhaustion on the QEMU host + // can cause VMs to fail to boot (no CPU time, OOM, disk full). + freeOut, freeErr := exec.Command("free", "-h").Output() + if freeErr != nil { + logrus.Warnf("Failed to run 'free -h': %v", freeErr) + } else { + logrus.Infof("Host memory:\n%s", freeOut) + } + loadOut, loadErr := exec.Command("cat", "/proc/loadavg").Output() + if loadErr != nil { + logrus.Warnf("Failed to read /proc/loadavg: %v", loadErr) + } else { + logrus.Infof("Host load average: %s", loadOut) + } + // Count running QEMU processes to detect oversubscription + qemuCountOut, qemuErr := exec.Command("/bin/sh", "-c", "pgrep -c qemu || echo 0").Output() + if qemuErr == nil { + logrus.Infof("Running QEMU processes: %s", strings.TrimSpace(string(qemuCountOut))) + } } return waitErr } @@ -508,3 +552,92 @@ func innerWaitForLogin(vmSerialLog string, verbose bool, iteration int, localSer // host CPU contention from other processes or parallel pipeline tasks. return stormutils.WaitForLoginMessageInSerialLog(vmSerialLog, verbose, iteration, localSerialLog, time.Second*180) } + +// captureUnreachableVmDiagnostics gathers information about a VM that booted +// (has a DHCP lease, virsh shows running) but whose SSH port is unreachable. +// This helps identify whether the issue is networking, sshd, or a late boot +// failure (e.g. kernel panic after early systemd). +func (cfg QemuConfig) captureUnreachableVmDiagnostics(vmName string, ip string, iteration int, outputPath string) { + padIter := fmt.Sprintf("%03d", iteration) + logrus.Infof("=== UNREACHABLE VM DIAGNOSTICS (iteration %d) ===", iteration) + + // 1. virsh domifaddr — check if the VM actually has a network interface with an IP + // (vs relying on the stale DHCP lease from dnsmasq) + if domifOut, err := exec.Command("virsh", "domifaddr", vmName, "--source", "agent").CombinedOutput(); err == nil { + logrus.Infof("VM interface addresses (guest-agent): %s", domifOut) + } else { + // Guest agent likely not running; try the lease source + if domifOut2, err2 := exec.Command("virsh", "domifaddr", vmName, "--source", "lease").CombinedOutput(); err2 == nil { + logrus.Infof("VM interface addresses (lease): %s", domifOut2) + } else { + logrus.Warnf("virsh domifaddr failed (agent: %v, lease: %v)", err, err2) + } + } + + // 2. ARP table — check if the host can see the VM's MAC at the expected IP + if arpOut, err := exec.Command("ip", "neigh", "show", ip).CombinedOutput(); err == nil { + logrus.Infof("ARP entry for %s: %s", ip, strings.TrimSpace(string(arpOut))) + } + + // 3. Ping — can we reach the VM at the network layer? + if pingOut, err := exec.Command("ping", "-c", "3", "-W", "2", ip).CombinedOutput(); err == nil { + logrus.Infof("Ping %s: reachable", ip) + } else { + logrus.Warnf("Ping %s: unreachable — %s", ip, strings.TrimSpace(string(pingOut))) + } + + // 4. Port scan — check if any port is open (not just 22) + for _, port := range []int{22, 80, 443, 111} { + addr := fmt.Sprintf("%s:%d", ip, port) + conn, err := net.DialTimeout("tcp", addr, 3*time.Second) + if err == nil { + conn.Close() + logrus.Infof("TCP port %d: OPEN", port) + } else { + logrus.Infof("TCP port %d: closed/filtered", port) + } + } + + // 5. Full serial log — save the ENTIRE serial log, not just the last 100 lines. + // The tail misses everything after ~5 seconds of kernel time (networking, + // sshd startup, potential late panics). + if outputPath != "" { + fullSerialDst := filepath.Join(outputPath, padIter+"-full-serial-unreachable.log") + if err := exec.Command("cp", cfg.SerialLog, fullSerialDst).Run(); err == nil { + // Count lines to show how much was captured + if wcOut, wcErr := exec.Command("wc", "-l", fullSerialDst).Output(); wcErr == nil { + logrus.Infof("Full serial log saved: %s (%s lines)", fullSerialDst, strings.TrimSpace(strings.Split(string(wcOut), " ")[0])) + } else { + logrus.Infof("Full serial log saved: %s", fullSerialDst) + } + } else { + logrus.Warnf("Failed to save full serial log: %v", err) + } + } + + // 6. virsh console — try to run commands inside the VM via the serial console. + // This bypasses network entirely and can tell us if sshd/networking started. + consoleCommands := []struct { + cmd string + desc string + }{ + {"systemctl is-system-running", "systemd boot state"}, + {"systemctl status sshd --no-pager -l", "sshd service status"}, + {"ip addr show", "network interfaces"}, + {"journalctl -b --no-pager -p err -u sshd -u systemd-networkd -u NetworkManager | tail -20", "error journal for networking/sshd"}, + } + + for _, cc := range consoleCommands { + // Use virsh to send commands through the console. This requires the VM + // to have a login shell on the serial console, which it may not if + // serial-getty didn't start. Try anyway — if it works, we get gold. + shellCmd := fmt.Sprintf("echo '%s' | timeout 10 virsh console --force %s 2>&1 | tail -30", cc.cmd, vmName) + if out, err := exec.Command("/bin/sh", "-c", shellCmd).CombinedOutput(); err == nil { + logrus.Infof("virsh console (%s):\n%s", cc.desc, strings.TrimSpace(string(out))) + } else { + logrus.Debugf("virsh console (%s) failed: %v", cc.desc, err) + } + } + + logrus.Infof("=== END UNREACHABLE VM DIAGNOSTICS ===") +}