Skip to content

Commit aa851c7

Browse files
author
Ben Hillis
committed
Detect unexpected VM exit and terminate wslcsession.exe
When the VM is killed externally (e.g. hcsdiag /kill, kernel panic), the wslcsession.exe process now detects it and exits cleanly instead of hanging indefinitely as a zombie. Implementation: - Add GetExitEvent() to IWSLCVirtualMachine IDL. The SYSTEM service duplicates HcsVirtualMachine::m_vmExitEvent via COM system_handle marshaling so the session process can wait on it. - WSLCVirtualMachine calls GetExitEvent() during Initialize() and exposes VmExitedEvent() for consumers to monitor. - WSLCSession monitors the exit event via IORelay. On unexpected VM exit, OnVmExited() spawns a thread to call Terminate() (must be a separate thread to avoid deadlock with IORelay::Stop). - WSLCSessionManager registers a cleanup callback on HcsVirtualMachine to terminate sessions when VM exits. Callback is cleared in ~HcsVirtualMachine to avoid firing during normal shutdown. - Harden Terminate() to skip dockerd signal/wait and unmount when the VM is already dead, avoiding unnecessary 30s+ hangs. - Add tests: VmKillTerminatesSession, VmKillFailsInFlightOperations, CleanShutdownStillWorks.
1 parent 251e6cf commit aa851c7

6 files changed

Lines changed: 222 additions & 18 deletions

File tree

src/windows/service/exe/HcsVirtualMachine.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ HcsVirtualMachine::HcsVirtualMachine(_In_ const WSLCSessionSettings* Settings)
4949

5050
// Build HCS settings
5151
hcs::ComputeSystem systemSettings{};
52-
systemSettings.Owner = L"WSL";
52+
systemSettings.Owner = std::format(L"WSLC-{}", Settings->DisplayName);
5353
systemSettings.ShouldTerminateOnLastHandleClosed = true;
5454

5555
// Determine which schema version to use based on the Windows version. Windows 10 does not support
@@ -281,6 +281,15 @@ HcsVirtualMachine::~HcsVirtualMachine()
281281
{
282282
std::lock_guard lock(m_lock);
283283

284+
// Clear the session termination callback before destroying the VM.
285+
// During normal shutdown, the session is already terminating — firing the
286+
// callback would cause a redundant (and potentially crashing) COM call.
287+
// Uses its own lock to avoid deadlock with HCS callback thread.
288+
{
289+
std::lock_guard callbackLock(m_sessionTerminationCallbackLock);
290+
m_sessionTerminationCallback = nullptr;
291+
}
292+
284293
// Wait up to 5 seconds for the VM to terminate gracefully.
285294
bool forceTerminate = false;
286295
if (!m_vmExitEvent.wait(5000))
@@ -581,6 +590,15 @@ try
581590
}
582591
CATCH_RETURN()
583592

593+
HRESULT HcsVirtualMachine::RegisterTerminationCallback(_In_ ITerminationCallback* Callback)
594+
try
595+
{
596+
std::lock_guard lock(m_sessionTerminationCallbackLock);
597+
m_sessionTerminationCallback = Callback;
598+
return S_OK;
599+
}
600+
CATCH_RETURN()
601+
584602
void CALLBACK HcsVirtualMachine::OnVmExitCallback(HCS_EVENT* Event, void* Context)
585603
try
586604
{
@@ -630,6 +648,17 @@ void HcsVirtualMachine::OnExit(const HCS_EVENT* Event)
630648
{
631649
LOG_IF_FAILED(m_terminationCallback->OnTermination(reason, Event->EventData));
632650
}
651+
652+
wil::com_ptr<ITerminationCallback> sessionCallback;
653+
{
654+
std::lock_guard lock(m_sessionTerminationCallbackLock);
655+
sessionCallback = m_sessionTerminationCallback;
656+
}
657+
658+
if (sessionCallback)
659+
{
660+
LOG_IF_FAILED(sessionCallback->OnTermination(reason, Event->EventData));
661+
}
633662
}
634663

635664
void HcsVirtualMachine::OnCrash(const HCS_EVENT* Event)

src/windows/service/exe/HcsVirtualMachine.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class HcsVirtualMachine
4343
IFACEMETHOD(DetachDisk)(_In_ ULONG Lun) override;
4444
IFACEMETHOD(AddShare)(_In_ LPCWSTR WindowsPath, _In_ BOOL ReadOnly, _Out_ GUID* ShareId) override;
4545
IFACEMETHOD(RemoveShare)(_In_ REFGUID ShareId) override;
46+
IFACEMETHOD(RegisterTerminationCallback)(_In_ ITerminationCallback* Callback) override;
4647

4748
private:
4849
struct DiskInfo
@@ -97,6 +98,12 @@ class HcsVirtualMachine
9798
bool m_crashLogCaptured = false;
9899

99100
wil::com_ptr<ITerminationCallback> m_terminationCallback;
101+
102+
// Session-side termination callback, registered via RegisterTerminationCallback().
103+
// Guarded by m_sessionTerminationCallbackLock (separate from m_lock to avoid
104+
// deadlock between HCS callback thread and destructor which holds m_lock).
105+
std::mutex m_sessionTerminationCallbackLock;
106+
wil::com_ptr<ITerminationCallback> m_sessionTerminationCallback;
100107
};
101108

102109
} // namespace wsl::windows::service::wslc

src/windows/service/inc/wslc.idl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,10 @@ interface IWSLCVirtualMachine : IUnknown
425425

426426
// Removes a previously added filesystem share.
427427
HRESULT RemoveShare([in] REFGUID ShareId);
428+
429+
// Registers a callback to be invoked when the VM exits.
430+
// The callback receives the exit reason and optional details.
431+
HRESULT RegisterTerminationCallback([in] ITerminationCallback* Callback);
428432
}
429433

430434
typedef enum _WSLCSessionStorageFlags

src/windows/wslcsession/WSLCSession.cpp

Lines changed: 87 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,28 @@ using wsl::windows::service::wslc::UserHandle;
2626
using wsl::windows::service::wslc::WSLCSession;
2727
using wsl::windows::service::wslc::WSLCVirtualMachine;
2828

29+
// COM callback that signals a local event when the VM terminates.
30+
// Registered with IWSLCVirtualMachine::RegisterTerminationCallback() so the
31+
// SYSTEM service can notify us cross-process when HCS reports VM exit.
32+
struct VmTerminationCallback : winrt::implements<VmTerminationCallback, ITerminationCallback>
33+
{
34+
VmTerminationCallback(HANDLE event)
35+
{
36+
HANDLE dup = nullptr;
37+
THROW_IF_WIN32_BOOL_FALSE(DuplicateHandle(GetCurrentProcess(), event, GetCurrentProcess(), &dup, 0, FALSE, DUPLICATE_SAME_ACCESS));
38+
m_event.reset(dup);
39+
}
40+
41+
HRESULT STDMETHODCALLTYPE OnTermination(WSLCVirtualMachineTerminationReason, LPCWSTR) override
42+
{
43+
m_event.SetEvent();
44+
return S_OK;
45+
}
46+
47+
private:
48+
wil::unique_event m_event;
49+
};
50+
2951
constexpr auto c_containerdStorage = "/var/lib/docker";
3052

3153
namespace {
@@ -289,6 +311,11 @@ try
289311

290312
m_virtualMachine->Initialize();
291313

314+
// Register a COM callback with the SYSTEM service to be notified when the VM exits.
315+
// The callback signals m_vmExitedEvent, which IORelay monitors to trigger OnVmExited().
316+
auto vmTermCallback = winrt::make<VmTerminationCallback>(m_vmExitedEvent.get());
317+
THROW_IF_FAILED(Vm->RegisterTerminationCallback(vmTermCallback.as<ITerminationCallback>().get()));
318+
292319
// Configure storage.
293320
ConfigureStorage(*Settings, tokenInfo->User.Sid);
294321

@@ -306,6 +333,10 @@ try
306333
// Start the event tracker.
307334
m_eventTracker.emplace(m_dockerClient.value(), m_id, m_ioRelay);
308335

336+
// Monitor for unexpected VM exit.
337+
m_ioRelay.AddHandle(
338+
std::make_unique<windows::common::relay::EventHandle>(m_vmExitedEvent.get(), std::bind(&WSLCSession::OnVmExited, this)));
339+
309340
// Recover any existing containers from storage.
310341
RecoverExistingVolumes();
311342
RecoverExistingContainers();
@@ -413,6 +444,30 @@ void WSLCSession::OnDockerdExited()
413444
}
414445
}
415446

447+
void WSLCSession::OnVmExited()
448+
{
449+
if (m_sessionTerminatingEvent.is_signaled())
450+
{
451+
return; // Already shutting down (normal termination path).
452+
}
453+
454+
WSL_LOG(
455+
"UnexpectedVmExit",
456+
TraceLoggingLevel(WINEVENT_LEVEL_WARNING),
457+
TraceLoggingValue(m_id, "SessionId"),
458+
TraceLoggingValue(m_displayName.c_str(), "Name"));
459+
460+
// N.B. This callback runs on the IORelay thread. Terminate() calls m_ioRelay.Stop()
461+
// which joins the IORelay thread, so we must run termination on a separate thread
462+
// to avoid deadlock. Capture a COM reference to prevent the session from being
463+
// destroyed before the thread runs.
464+
Microsoft::WRL::ComPtr<WSLCSession> self(this);
465+
std::thread([self]() {
466+
wsl::windows::common::wslutil::SetThreadDescription(L"VmExitTermination");
467+
LOG_IF_FAILED(self->Terminate());
468+
}).detach();
469+
}
470+
416471
void WSLCSession::OnDockerdLog(const gsl::span<char>& buffer)
417472
try
418473
{
@@ -1792,40 +1847,55 @@ try
17921847
m_eventTracker.reset();
17931848
m_dockerClient.reset();
17941849

1850+
// Check if the VM has already exited (e.g., killed externally).
1851+
// If so, skip operations that require a live VM to avoid unnecessary waits.
1852+
const bool vmDead = m_vmExitedEvent.is_signaled();
1853+
17951854
// Stop dockerd.
17961855
// N.B. dockerd wait a couple seconds if there are any outstanding HTTP request sockets opened.
17971856
if (m_dockerdProcess.has_value())
17981857
{
1799-
LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM));
1800-
1801-
int exitCode = -1;
1802-
try
1858+
if (!vmDead)
18031859
{
1804-
exitCode = m_dockerdProcess->Wait(30 * 1000);
1805-
}
1806-
catch (...)
1807-
{
1808-
LOG_CAUGHT_EXCEPTION();
1860+
LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM));
1861+
1862+
int exitCode = -1;
18091863
try
18101864
{
1811-
m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL);
1812-
exitCode = m_dockerdProcess->Wait(10 * 1000);
1865+
exitCode = m_dockerdProcess->Wait(30 * 1000);
18131866
}
1814-
CATCH_LOG();
1867+
catch (...)
1868+
{
1869+
LOG_CAUGHT_EXCEPTION();
1870+
try
1871+
{
1872+
m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL);
1873+
exitCode = m_dockerdProcess->Wait(10 * 1000);
1874+
}
1875+
CATCH_LOG();
1876+
}
1877+
1878+
WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code"));
1879+
}
1880+
else
1881+
{
1882+
WSL_LOG("SkippingDockerdShutdown_VmDead");
18151883
}
18161884

1817-
WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code"));
18181885
m_dockerdProcess.reset();
18191886
}
18201887

18211888
if (m_virtualMachine)
18221889
{
1823-
// N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running.
1824-
try
1890+
if (!vmDead)
18251891
{
1826-
m_virtualMachine->Unmount(c_containerdStorage);
1892+
// N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running.
1893+
try
1894+
{
1895+
m_virtualMachine->Unmount(c_containerdStorage);
1896+
}
1897+
CATCH_LOG();
18271898
}
1828-
CATCH_LOG();
18291899

18301900
m_virtualMachine.reset();
18311901
}

src/windows/wslcsession/WSLCSession.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession
127127
void OnContainerDeleted(const WSLCContainerImpl* Container);
128128
void OnDockerdLog(const gsl::span<char>& Data);
129129
void OnDockerdExited();
130+
void OnVmExited();
130131
void StartDockerd();
131132
void ImportImageImpl(DockerHTTPClient::HTTPRequestContext& Request, const WSLCHandle ImageHandle);
132133
void RecoverExistingContainers();
@@ -150,6 +151,7 @@ class DECLSPEC_UUID("4877FEFC-4977-4929-A958-9F36AA1892A4") WSLCSession
150151
std::unordered_map<std::string, std::unique_ptr<WSLCVhdVolumeImpl>> m_volumes;
151152
std::unordered_set<std::string> m_anonymousVolumes; // TODO: Implement proper anonymous volume support.
152153
wil::unique_event m_sessionTerminatingEvent{wil::EventOptions::ManualReset};
154+
wil::unique_event m_vmExitedEvent{wil::EventOptions::ManualReset};
153155
wil::srwlock m_lock;
154156
IORelay m_ioRelay;
155157
std::optional<ServiceRunningProcess> m_dockerdProcess;

test/windows/WSLCTests.cpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6740,4 +6740,96 @@ class WSLCTests
67406740

67416741
ValidateProcessOutput(initProcess, {{1, "OK\n"}});
67426742
}
6743+
6744+
// Kills the VM for a session by finding it via the "WSLC-<name>" owner in hcsdiag output.
6745+
// hcsdiag detail line format: " VM, <State>, <GUID>, WSLC-<name>"
6746+
static void KillSessionVm(LPCWSTR sessionName)
6747+
{
6748+
auto ownerTag = std::format(L"WSLC-{}", sessionName);
6749+
6750+
wsl::windows::common::SubProcess listProc(nullptr, L"hcsdiag.exe list");
6751+
auto listOutput = listProc.RunAndCaptureOutput(10000);
6752+
6753+
auto& output = listOutput.Stdout;
6754+
auto ownerPos = output.find(ownerTag);
6755+
VERIFY_IS_TRUE(ownerPos != std::wstring::npos);
6756+
6757+
// The GUID (36 chars) appears before ", WSLC-<name>" in the detail line.
6758+
auto guidEnd = output.rfind(L", ", ownerPos);
6759+
VERIFY_IS_TRUE(guidEnd != std::wstring::npos && guidEnd >= 36);
6760+
6761+
auto vmId = output.substr(guidEnd - 36, 36);
6762+
VERIFY_IS_TRUE(wsl::shared::string::ToGuid(vmId.c_str()).has_value());
6763+
6764+
VERIFY_ARE_EQUAL(wsl::windows::common::SubProcess(nullptr, std::format(L"hcsdiag.exe kill {}", vmId).c_str()).Run(10000), 0u);
6765+
}
6766+
6767+
// Waits for a session to terminate (GetState returns terminated or RPC error).
6768+
static bool WaitForSessionTermination(IWSLCSession* session, DWORD timeoutSeconds = 30)
6769+
{
6770+
for (DWORD i = 0; i < timeoutSeconds; i++)
6771+
{
6772+
Sleep(1000);
6773+
6774+
WSLCSessionState state{};
6775+
auto hr = session->GetState(&state);
6776+
if (FAILED(hr) || state == WSLCSessionStateTerminated)
6777+
{
6778+
return true;
6779+
}
6780+
}
6781+
6782+
return false;
6783+
}
6784+
6785+
WSLC_TEST_METHOD(VmKillTerminatesSession)
6786+
{
6787+
static constexpr auto c_sessionName = L"wslc-vm-kill-test";
6788+
auto settings = GetDefaultSessionSettings(c_sessionName);
6789+
auto session = CreateSession(settings);
6790+
6791+
KillSessionVm(c_sessionName);
6792+
6793+
VERIFY_IS_TRUE(WaitForSessionTermination(session.get()));
6794+
}
6795+
6796+
WSLC_TEST_METHOD(VmKillFailsInFlightOperations)
6797+
{
6798+
static constexpr auto c_sessionName = L"wslc-vm-kill-inflight-test";
6799+
auto settings = GetDefaultSessionSettings(c_sessionName);
6800+
auto session = CreateSession(settings);
6801+
6802+
WSLCProcessLauncher launcher("/bin/sleep", {"/bin/sleep", "99999"});
6803+
auto process = launcher.Launch(*session);
6804+
6805+
KillSessionVm(c_sessionName);
6806+
6807+
// The process should fail (not hang).
6808+
auto exitEvent = process.GetExitEvent();
6809+
bool exited = exitEvent.wait(30000);
6810+
if (!exited)
6811+
{
6812+
WSLCProcessState processState{};
6813+
int exitCode{};
6814+
VERIFY_IS_TRUE(FAILED(process.Get().GetState(&processState, &exitCode)));
6815+
}
6816+
}
6817+
6818+
WSLC_TEST_METHOD(CleanShutdownStillWorks)
6819+
{
6820+
auto settings = GetDefaultSessionSettings(L"wslc-clean-shutdown-test");
6821+
auto session = CreateSession(settings);
6822+
6823+
ExpectCommandResult(session.get(), {"/bin/echo", "hello"}, 0);
6824+
6825+
auto hr = session->Terminate();
6826+
VERIFY_IS_TRUE(SUCCEEDED(hr) || hr == HRESULT_FROM_WIN32(RPC_S_CALL_FAILED));
6827+
6828+
if (SUCCEEDED(hr))
6829+
{
6830+
WSLCSessionState state{};
6831+
VERIFY_SUCCEEDED(session->GetState(&state));
6832+
VERIFY_ARE_EQUAL(state, WSLCSessionStateTerminated);
6833+
}
6834+
}
67436835
};

0 commit comments

Comments
 (0)