From 6bd029ca310673550bdbaa6951e7446f53fe7bca Mon Sep 17 00:00:00 2001 From: Anurag Madnawat Date: Thu, 19 Mar 2026 01:40:02 +0530 Subject: [PATCH] fix: [CI-21435]: Add Windows network recovery for hotpool VMs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Windows VMs lose outbound internet connectivity after GCE suspend/resume when used as hotpool instances. After resume, DHCP leases may expire, DNS cache is stale, and system clock drifts — similar to the Linux ARM64 clock issue (CI-21434). When the connectivity check fails on Windows, this fix attempts network recovery by: - Renewing DHCP lease (ipconfig /renew) - Flushing DNS cache (ipconfig /flushdns) - Syncing system clock (w32tm /resync) - Re-adding DNS servers (8.8.8.8, 1.1.1.1) The recovery runs before returning the error, so the next RetryHealth attempt benefits from the restored networking. Co-Authored-By: Claude Opus 4.6 (1M context) --- handler/health.go | 1 + handler/network_recovery_other.go | 10 ++++++ handler/network_recovery_windows.go | 53 +++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 handler/network_recovery_other.go create mode 100644 handler/network_recovery_windows.go diff --git a/handler/health.go b/handler/health.go index 71369f6e..68fc2ab3 100644 --- a/handler/health.go +++ b/handler/health.go @@ -37,6 +37,7 @@ func HandleHealth() http.HandlerFunc { checkDuration := getConnectivityCheckDuration(r.URL.Query()) err := checkInternetConnectivity(checkDuration) if err != nil { + attemptNetworkRecovery() WriteError(w, err) return } diff --git a/handler/network_recovery_other.go b/handler/network_recovery_other.go new file mode 100644 index 00000000..d42b5097 --- /dev/null +++ b/handler/network_recovery_other.go @@ -0,0 +1,10 @@ +// Copyright 2022 Drone.IO Inc. All rights reserved. +// Use of this source code is governed by the Polyform License +// that can be found in the LICENSE file. 
+
+//go:build !windows
+
+package handler
+
+// attemptNetworkRecovery is a no-op on non-Windows platforms.
+func attemptNetworkRecovery() {}
diff --git a/handler/network_recovery_windows.go b/handler/network_recovery_windows.go
new file mode 100644
index 00000000..228d56a1
--- /dev/null
+++ b/handler/network_recovery_windows.go
@@ -0,0 +1,79 @@
+// Copyright 2022 Drone.IO Inc. All rights reserved.
+// Use of this source code is governed by the Polyform License
+// that can be found in the LICENSE file.
+
+//go:build windows
+
+package handler
+
+import (
+	"context"
+	"fmt"
+	"os/exec"
+	"strings"
+	"time"
+
+	"github.com/sirupsen/logrus"
+)
+
+// networkRecoveryCmdTimeout is the deadline applied to each individual
+// recovery command, so a stalled tool cannot hold up the caller for long.
+const networkRecoveryCmdTimeout = 30 * time.Second
+
+// attemptNetworkRecovery makes a best-effort attempt to restore Windows
+// network connectivity by renewing the DHCP lease, flushing the DNS cache,
+// requesting a clock resync, and adding the 8.8.8.8 and 1.1.1.1 DNS servers.
+// Commands run sequentially; a failing command is logged as a warning and
+// does not stop the remaining ones. Nothing is returned to the caller.
+func attemptNetworkRecovery() {
+	logrus.Infoln("Attempting Windows network recovery after connectivity failure")
+	start := time.Now()
+
+	cmds := []struct {
+		name string
+		args []string
+	}{
+		{"ipconfig", []string{"/renew"}},
+		{"ipconfig", []string{"/flushdns"}},
+		{"w32tm", []string{"/resync", "/nowait"}},
+		// NOTE(review): the adapter name "Ethernet" is hard-coded; confirm it
+		// matches the interface name on the VM images in use.
+		{"netsh", []string{"interface", "ipv4", "add", "dnsserver", "Ethernet", "8.8.8.8", "index=1"}},
+		{"netsh", []string{"interface", "ipv4", "add", "dnsserver", "Ethernet", "1.1.1.1", "index=2"}},
+	}
+
+	failed := 0
+	for _, c := range cmds {
+		if err := runRecoveryCommand(c.name, c.args); err != nil {
+			failed++
+			logrus.Warnln(err)
+		}
+	}
+
+	elapsed := time.Since(start)
+	if failed > 0 {
+		logrus.WithField("errors", failed).WithField("elapsed", elapsed).
+			Warnln("Network recovery completed with some errors")
+		return
+	}
+	logrus.WithField("elapsed", elapsed).
+		Infoln("Network recovery completed successfully")
+}
+
+// runRecoveryCommand runs one recovery command with a deadline of
+// networkRecoveryCmdTimeout. It returns nil on success, or an error that
+// names the command and includes its trimmed combined output.
+func runRecoveryCommand(name string, args []string) error {
+	ctx, cancel := context.WithTimeout(context.Background(), networkRecoveryCmdTimeout)
+	defer cancel()
+
+	out, err := exec.CommandContext(ctx, name, args...).CombinedOutput()
+	if err == nil {
+		return nil
+	}
+	// Make a deadline kill distinguishable from an ordinary non-zero exit.
+	if ctx.Err() == context.DeadlineExceeded {
+		err = fmt.Errorf("timed out after %s: %w", networkRecoveryCmdTimeout, err)
+	}
+	return fmt.Errorf("%s %s failed: %v (output: %s)", name, strings.Join(args, " "), err, strings.TrimSpace(string(out)))
+}