From aa851c701c0fe5ef7a1caff642aa7e48a553bf2d Mon Sep 17 00:00:00 2001 From: Ben Hillis Date: Fri, 10 Apr 2026 08:45:41 -0700 Subject: [PATCH] Detect unexpected VM exit and terminate wslcsession.exe When the VM is killed externally (e.g. hcsdiag /kill, kernel panic), the wslcsession.exe process now detects it and exits cleanly instead of hanging indefinitely as a zombie. Implementation: - Add GetExitEvent() to IWSLCVirtualMachine IDL. The SYSTEM service duplicates HcsVirtualMachine::m_vmExitEvent via COM system_handle marshaling so the session process can wait on it. - WSLCVirtualMachine calls GetExitEvent() during Initialize() and exposes VmExitedEvent() for consumers to monitor. - WSLCSession monitors the exit event via IORelay. On unexpected VM exit, OnVmExited() spawns a thread to call Terminate() (must be a separate thread to avoid deadlock with IORelay::Stop). - WSLCSessionManager registers a cleanup callback on HcsVirtualMachine to terminate sessions when VM exits. Callback is cleared in ~HcsVirtualMachine to avoid firing during normal shutdown. - Harden Terminate() to skip dockerd signal/wait and unmount when the VM is already dead, avoiding unnecessary 30s+ hangs. - Add tests: VmKillTerminatesSession, VmKillFailsInFlightOperations, CleanShutdownStillWorks. --- src/windows/service/exe/HcsVirtualMachine.cpp | 31 +++++- src/windows/service/exe/HcsVirtualMachine.h | 7 ++ src/windows/service/inc/wslc.idl | 4 + src/windows/wslcsession/WSLCSession.cpp | 104 +++++++++++++++--- src/windows/wslcsession/WSLCSession.h | 2 + test/windows/WSLCTests.cpp | 92 ++++++++++++++++ 6 files changed, 222 insertions(+), 18 deletions(-) diff --git a/src/windows/service/exe/HcsVirtualMachine.cpp b/src/windows/service/exe/HcsVirtualMachine.cpp index b554128f0..561e24991 100644 --- a/src/windows/service/exe/HcsVirtualMachine.cpp +++ b/src/windows/service/exe/HcsVirtualMachine.cpp @@ -49,7 +49,7 @@ HcsVirtualMachine::HcsVirtualMachine(_In_ const WSLCSessionSettings* Settings) // Build HCS settings hcs::ComputeSystem systemSettings{}; - systemSettings.Owner = L"WSL"; + systemSettings.Owner = std::format(L"WSLC-{}", Settings->DisplayName); systemSettings.ShouldTerminateOnLastHandleClosed = true; // Determine which schema version to use based on the Windows version. Windows 10 does not support @@ -281,6 +281,15 @@ HcsVirtualMachine::~HcsVirtualMachine() { std::lock_guard lock(m_lock); + // Clear the session termination callback before destroying the VM. + // During normal shutdown, the session is already terminating — firing the + // callback would cause a redundant (and potentially crashing) COM call. + // Uses its own lock to avoid deadlock with HCS callback thread. + { + std::lock_guard callbackLock(m_sessionTerminationCallbackLock); + m_sessionTerminationCallback = nullptr; + } + // Wait up to 5 seconds for the VM to terminate gracefully. bool forceTerminate = false; if (!m_vmExitEvent.wait(5000)) @@ -581,6 +590,15 @@ try } CATCH_RETURN() +HRESULT HcsVirtualMachine::RegisterTerminationCallback(_In_ ITerminationCallback* Callback) +try +{ + std::lock_guard lock(m_sessionTerminationCallbackLock); + m_sessionTerminationCallback = Callback; + return S_OK; +} +CATCH_RETURN() + void CALLBACK HcsVirtualMachine::OnVmExitCallback(HCS_EVENT* Event, void* Context) try { @@ -630,6 +648,17 @@ void HcsVirtualMachine::OnExit(const HCS_EVENT* Event) { LOG_IF_FAILED(m_terminationCallback->OnTermination(reason, Event->EventData)); } + + wil::com_ptr sessionCallback; + { + std::lock_guard lock(m_sessionTerminationCallbackLock); + sessionCallback = m_sessionTerminationCallback; + } + + if (sessionCallback) + { + LOG_IF_FAILED(sessionCallback->OnTermination(reason, Event->EventData)); + } } void HcsVirtualMachine::OnCrash(const HCS_EVENT* Event) diff --git a/src/windows/service/exe/HcsVirtualMachine.h b/src/windows/service/exe/HcsVirtualMachine.h index 5f6cdf36d..9324eea9b 100644 --- a/src/windows/service/exe/HcsVirtualMachine.h +++ b/src/windows/service/exe/HcsVirtualMachine.h @@ -43,6 +43,7 @@ class HcsVirtualMachine IFACEMETHOD(DetachDisk)(_In_ ULONG Lun) override; IFACEMETHOD(AddShare)(_In_ LPCWSTR WindowsPath, _In_ BOOL ReadOnly, _Out_ GUID* ShareId) override; IFACEMETHOD(RemoveShare)(_In_ REFGUID ShareId) override; + IFACEMETHOD(RegisterTerminationCallback)(_In_ ITerminationCallback* Callback) override; private: struct DiskInfo @@ -97,6 +98,12 @@ class HcsVirtualMachine bool m_crashLogCaptured = false; wil::com_ptr m_terminationCallback; + + // Session-side termination callback, registered via RegisterTerminationCallback(). + // Guarded by m_sessionTerminationCallbackLock (separate from m_lock to avoid + // deadlock between HCS callback thread and destructor which holds m_lock). + std::mutex m_sessionTerminationCallbackLock; + wil::com_ptr m_sessionTerminationCallback; }; } // namespace wsl::windows::service::wslc diff --git a/src/windows/service/inc/wslc.idl b/src/windows/service/inc/wslc.idl index dce02dbae..88078778d 100644 --- a/src/windows/service/inc/wslc.idl +++ b/src/windows/service/inc/wslc.idl @@ -425,6 +425,10 @@ interface IWSLCVirtualMachine : IUnknown // Removes a previously added filesystem share. HRESULT RemoveShare([in] REFGUID ShareId); + + // Registers a callback to be invoked when the VM exits. + // The callback receives the exit reason and optional details. + HRESULT RegisterTerminationCallback([in] ITerminationCallback* Callback); } typedef enum _WSLCSessionStorageFlags diff --git a/src/windows/wslcsession/WSLCSession.cpp b/src/windows/wslcsession/WSLCSession.cpp index 6ee817a7a..c0c031c7f 100644 --- a/src/windows/wslcsession/WSLCSession.cpp +++ b/src/windows/wslcsession/WSLCSession.cpp @@ -26,6 +26,28 @@ using wsl::windows::service::wslc::UserHandle; using wsl::windows::service::wslc::WSLCSession; using wsl::windows::service::wslc::WSLCVirtualMachine; +// COM callback that signals a local event when the VM terminates. +// Registered with IWSLCVirtualMachine::RegisterTerminationCallback() so the +// SYSTEM service can notify us cross-process when HCS reports VM exit. +struct VmTerminationCallback : winrt::implements +{ + VmTerminationCallback(HANDLE event) + { + HANDLE dup = nullptr; + THROW_IF_WIN32_BOOL_FALSE(DuplicateHandle(GetCurrentProcess(), event, GetCurrentProcess(), &dup, 0, FALSE, DUPLICATE_SAME_ACCESS)); + m_event.reset(dup); + } + + HRESULT STDMETHODCALLTYPE OnTermination(WSLCVirtualMachineTerminationReason, LPCWSTR) override + { + m_event.SetEvent(); + return S_OK; + } + +private: + wil::unique_event m_event; +}; + constexpr auto c_containerdStorage = "/var/lib/docker"; namespace { @@ -289,6 +311,11 @@ try m_virtualMachine->Initialize(); + // Register a COM callback with the SYSTEM service to be notified when the VM exits. + // The callback signals m_vmExitedEvent, which IORelay monitors to trigger OnVmExited(). + auto vmTermCallback = winrt::make(m_vmExitedEvent.get()); + THROW_IF_FAILED(Vm->RegisterTerminationCallback(vmTermCallback.as().get())); + // Configure storage. ConfigureStorage(*Settings, tokenInfo->User.Sid); @@ -306,6 +333,10 @@ try // Start the event tracker. m_eventTracker.emplace(m_dockerClient.value(), m_id, m_ioRelay); + // Monitor for unexpected VM exit. + m_ioRelay.AddHandle( + std::make_unique(m_vmExitedEvent.get(), std::bind(&WSLCSession::OnVmExited, this))); + // Recover any existing containers from storage. RecoverExistingVolumes(); RecoverExistingContainers(); @@ -413,6 +444,30 @@ void WSLCSession::OnDockerdExited() } } +void WSLCSession::OnVmExited() +{ + if (m_sessionTerminatingEvent.is_signaled()) + { + return; // Already shutting down (normal termination path). + } + + WSL_LOG( + "UnexpectedVmExit", + TraceLoggingLevel(WINEVENT_LEVEL_WARNING), + TraceLoggingValue(m_id, "SessionId"), + TraceLoggingValue(m_displayName.c_str(), "Name")); + + // N.B. This callback runs on the IORelay thread. Terminate() calls m_ioRelay.Stop() + // which joins the IORelay thread, so we must run termination on a separate thread + // to avoid deadlock. Capture a COM reference to prevent the session from being + // destroyed before the thread runs. + Microsoft::WRL::ComPtr self(this); + std::thread([self]() { + wsl::windows::common::wslutil::SetThreadDescription(L"VmExitTermination"); + LOG_IF_FAILED(self->Terminate()); + }).detach(); +} + void WSLCSession::OnDockerdLog(const gsl::span& buffer) try { @@ -1792,40 +1847,55 @@ try m_eventTracker.reset(); m_dockerClient.reset(); + // Check if the VM has already exited (e.g., killed externally). + // If so, skip operations that require a live VM to avoid unnecessary waits. + const bool vmDead = m_vmExitedEvent.is_signaled(); + // Stop dockerd. // N.B. dockerd wait a couple seconds if there are any outstanding HTTP request sockets opened. if (m_dockerdProcess.has_value()) { - LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM)); - - int exitCode = -1; - try + if (!vmDead) { - exitCode = m_dockerdProcess->Wait(30 * 1000); - } - catch (...) - { - LOG_CAUGHT_EXCEPTION(); + LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM)); + + int exitCode = -1; try { - m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL); - exitCode = m_dockerdProcess->Wait(10 * 1000); + exitCode = m_dockerdProcess->Wait(30 * 1000); } - CATCH_LOG(); + catch (...) + { + LOG_CAUGHT_EXCEPTION(); + try + { + m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL); + exitCode = m_dockerdProcess->Wait(10 * 1000); + } + CATCH_LOG(); + } + + WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code")); + } + else + { + WSL_LOG("SkippingDockerdShutdown_VmDead"); } - WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code")); m_dockerdProcess.reset(); } if (m_virtualMachine) { - // N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running. - try + if (!vmDead) { - m_virtualMachine->Unmount(c_containerdStorage); + // N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running. + try + { + m_virtualMachine->Unmount(c_containerdStorage); + } + CATCH_LOG(); } - CATCH_LOG(); m_virtualMachine.reset(); } diff --git a/src/windows/wslcsession/WSLCSession.h b/src/windows/wslcsession/WSLCSession.h index e56f138b8..11b6ff87a 100644 --- a/src/windows/wslcsession/WSLCSession.h +++ b/src/windows/wslcsession/WSLCSession.h @@ -127,6 +127,7 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession void OnContainerDeleted(const WSLCContainerImpl* Container); void OnDockerdLog(const gsl::span& Data); void OnDockerdExited(); + void OnVmExited(); void StartDockerd(); void ImportImageImpl(DockerHTTPClient::HTTPRequestContext& Request, const WSLCHandle ImageHandle); void RecoverExistingContainers(); @@ -150,6 +151,7 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession std::unordered_map> m_volumes; std::unordered_set m_anonymousVolumes; // TODO: Implement proper anonymous volume support. wil::unique_event m_sessionTerminatingEvent{wil::EventOptions::ManualReset}; + wil::unique_event m_vmExitedEvent{wil::EventOptions::ManualReset}; wil::srwlock m_lock; IORelay m_ioRelay; std::optional m_dockerdProcess; diff --git a/test/windows/WSLCTests.cpp b/test/windows/WSLCTests.cpp index af52f2dd6..ca109b497 100644 --- a/test/windows/WSLCTests.cpp +++ b/test/windows/WSLCTests.cpp @@ -6740,4 +6740,96 @@ class WSLCTests ValidateProcessOutput(initProcess, {{1, "OK\n"}}); } + + // Kills the VM for a session by finding it via the "WSLC-" owner in hcsdiag output. + // hcsdiag detail line format: " VM, , , WSLC-" + static void KillSessionVm(LPCWSTR sessionName) + { + auto ownerTag = std::format(L"WSLC-{}", sessionName); + + wsl::windows::common::SubProcess listProc(nullptr, L"hcsdiag.exe list"); + auto listOutput = listProc.RunAndCaptureOutput(10000); + + auto& output = listOutput.Stdout; + auto ownerPos = output.find(ownerTag); + VERIFY_IS_TRUE(ownerPos != std::wstring::npos); + + // The GUID (36 chars) appears before ", WSLC-" in the detail line. + auto guidEnd = output.rfind(L", ", ownerPos); + VERIFY_IS_TRUE(guidEnd != std::wstring::npos && guidEnd >= 36); + + auto vmId = output.substr(guidEnd - 36, 36); + VERIFY_IS_TRUE(wsl::shared::string::ToGuid(vmId.c_str()).has_value()); + + VERIFY_ARE_EQUAL(wsl::windows::common::SubProcess(nullptr, std::format(L"hcsdiag.exe kill {}", vmId).c_str()).Run(10000), 0u); + } + + // Waits for a session to terminate (GetState returns terminated or RPC error). + static bool WaitForSessionTermination(IWSLCSession* session, DWORD timeoutSeconds = 30) + { + for (DWORD i = 0; i < timeoutSeconds; i++) + { + Sleep(1000); + + WSLCSessionState state{}; + auto hr = session->GetState(&state); + if (FAILED(hr) || state == WSLCSessionStateTerminated) + { + return true; + } + } + + return false; + } + + WSLC_TEST_METHOD(VmKillTerminatesSession) + { + static constexpr auto c_sessionName = L"wslc-vm-kill-test"; + auto settings = GetDefaultSessionSettings(c_sessionName); + auto session = CreateSession(settings); + + KillSessionVm(c_sessionName); + + VERIFY_IS_TRUE(WaitForSessionTermination(session.get())); + } + + WSLC_TEST_METHOD(VmKillFailsInFlightOperations) + { + static constexpr auto c_sessionName = L"wslc-vm-kill-inflight-test"; + auto settings = GetDefaultSessionSettings(c_sessionName); + auto session = CreateSession(settings); + + WSLCProcessLauncher launcher("/bin/sleep", {"/bin/sleep", "99999"}); + auto process = launcher.Launch(*session); + + KillSessionVm(c_sessionName); + + // The process should fail (not hang). + auto exitEvent = process.GetExitEvent(); + bool exited = exitEvent.wait(30000); + if (!exited) + { + WSLCProcessState processState{}; + int exitCode{}; + VERIFY_IS_TRUE(FAILED(process.Get().GetState(&processState, &exitCode))); + } + } + + WSLC_TEST_METHOD(CleanShutdownStillWorks) + { + auto settings = GetDefaultSessionSettings(L"wslc-clean-shutdown-test"); + auto session = CreateSession(settings); + + ExpectCommandResult(session.get(), {"/bin/echo", "hello"}, 0); + + auto hr = session->Terminate(); + VERIFY_IS_TRUE(SUCCEEDED(hr) || hr == HRESULT_FROM_WIN32(RPC_S_CALL_FAILED)); + + if (SUCCEEDED(hr)) + { + WSLCSessionState state{}; + VERIFY_SUCCEEDED(session->GetState(&state)); + VERIFY_ARE_EQUAL(state, WSLCSessionStateTerminated); + } + } };