From e3529abc0d3db5cfa6c651737722f094f2254927 Mon Sep 17 00:00:00 2001 From: dorianzheng Date: Fri, 3 Apr 2026 23:44:28 +0800 Subject: [PATCH 01/33] Disable TSI fallback for offline guests --- include/libkrun.h | 19 ++++++++++++++ src/devices/src/virtio/vsock/device.rs | 4 +++ src/devices/src/virtio/vsock/muxer.rs | 9 +++++-- src/libkrun/src/lib.rs | 36 +++++++++++++++++++------- src/vmm/src/builder.rs | 10 ++++--- 5 files changed, 63 insertions(+), 15 deletions(-) diff --git a/include/libkrun.h b/include/libkrun.h index b8f8008a5..af0733e4c 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -462,6 +462,25 @@ int32_t krun_add_net_unixgram(uint32_t ctx_id, uint32_t features, uint32_t flags); +/** + * Disables automatic TSI networking for this context. + * + * When no virtio-net devices are added, libkrun normally enables the TSI backend. + * Call this function to keep the guest fully offline instead: no virtio-net + * device will be attached and TSI will remain disabled. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * + * Notes: + * This function only affects the automatic fallback path used when no network + * devices are added. It should be called before krun_start_enter. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_disable_tsi(uint32_t ctx_id); + /** * Adds an independent virtio-net device with the tap backend. * Call to this function disables TSI backend. diff --git a/src/devices/src/virtio/vsock/device.rs b/src/devices/src/virtio/vsock/device.rs index a61043b4c..f57208798 100644 --- a/src/devices/src/virtio/vsock/device.rs +++ b/src/devices/src/virtio/vsock/device.rs @@ -78,6 +78,10 @@ impl Vsock { self.cid } + pub fn enable_tsi(&self) -> bool { + self.muxer.enable_tsi() + } + /// Walk the driver-provided RX queue buffers and attempt to fill them up with any data that we /// have pending. Return `true` if descriptors have been added to the used ring, and `false` /// otherwise. diff --git a/src/devices/src/virtio/vsock/muxer.rs b/src/devices/src/virtio/vsock/muxer.rs index f4c10247e..ea790b099 100644 --- a/src/devices/src/virtio/vsock/muxer.rs +++ b/src/devices/src/virtio/vsock/muxer.rs @@ -4,9 +4,10 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex, RwLock}; use super::super::Queue as VirtQueue; +use super::VsockError; use super::defs; use super::defs::uapi; -use super::muxer_rxq::{rx_to_pkt, MuxerRxQ}; +use super::muxer_rxq::{MuxerRxQ, rx_to_pkt}; use super::muxer_thread::MuxerThread; use super::packet::{TsiConnectReq, TsiGetnameRsp, VsockPacket}; use super::proxy::{Proxy, ProxyRemoval, ProxyUpdate}; @@ -18,7 +19,7 @@ use super::tsi_stream::TsiStreamProxy; use super::unix::UnixProxy; use super::TsiFlags; use super::VsockError; -use crossbeam_channel::{unbounded, Sender}; +use crossbeam_channel::{Sender, unbounded}; use utils::epoll::{ControlOperation, Epoll, EpollEvent, EventSet}; use vm_memory::GuestMemoryMmap; @@ -132,6 +133,10 @@ impl VsockMuxer { } } + pub fn enable_tsi(&self) -> bool { + self.enable_tsi + } + pub(crate) fn activate( &mut self, mem: GuestMemoryMmap, diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 8acf6d205..9ac95e483 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -3,13 +3,13 @@ extern crate log; use crossbeam_channel::unbounded; #[cfg(feature = "blk")] +use devices::virtio::CacheType; +#[cfg(feature = "blk")] use devices::virtio::block::{ImageType, SyncMode}; #[cfg(feature = "gpu")] use devices::virtio::gpu::display::DisplayInfo; #[cfg(feature = "net")] use devices::virtio::net::device::VirtioNetBackend; -#[cfg(feature = "blk")] -use devices::virtio::CacheType; use env_logger::{Env, Target}; #[cfg(feature = "gpu")] use krun_display::DisplayBackend; @@ -19,13 +19,13 @@ use once_cell::sync::Lazy; use polly::event_manager::EventManager; #[cfg(all(feature = "blk", not(feature = "tee")))] use rand::distr::{Alphanumeric, SampleString}; -use std::collections::hash_map::Entry; use std::collections::HashMap; +use std::collections::hash_map::Entry; use std::convert::TryInto; use std::env; #[cfg(target_os = "linux")] use std::ffi::CString; -use std::ffi::{c_void, CStr}; +use std::ffi::{CStr, c_void}; use std::fs::File; use std::io::IsTerminal; #[cfg(target_os = "linux")] @@ -33,9 +33,9 @@ use std::os::fd::AsRawFd; use std::os::fd::{BorrowedFd, FromRawFd, RawFd}; use std::path::PathBuf; use std::slice; -use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::LazyLock; use std::sync::Mutex; +use std::sync::atomic::{AtomicI32, Ordering}; use utils::eventfd::EventFd; use vmm::resources::{ DefaultVirtioConsoleConfig, PortConfig, SerialConsoleConfig, TsiFlags, VirtioConsoleConfigMode, @@ -52,7 +52,7 @@ use vmm::vmm_config::fs::FsDeviceConfig; use vmm::vmm_config::kernel_bundle::KernelBundle; #[cfg(feature = "tee")] use vmm::vmm_config::kernel_bundle::{InitrdBundle, QbootBundle}; -use vmm::vmm_config::kernel_cmdline::{KernelCmdlineConfig, DEFAULT_KERNEL_CMDLINE}; +use vmm::vmm_config::kernel_cmdline::{DEFAULT_KERNEL_CMDLINE, KernelCmdlineConfig}; use vmm::vmm_config::machine_config::VmConfig; #[cfg(feature = "net")] use vmm::vmm_config::net::NetworkInterfaceConfig; @@ -62,7 +62,7 @@ use vmm::vmm_config::vsock::VsockDeviceConfig; use aws_nitro::enclave::NitroEnclave; #[cfg(feature = "gpu")] -use devices::virtio::display::{DisplayInfoEdid, PhysicalSize, MAX_DISPLAYS}; +use devices::virtio::display::{DisplayInfoEdid, MAX_DISPLAYS, PhysicalSize}; #[cfg(feature = "input")] use krun_input::{InputConfigBackend, InputEventProviderBackend}; @@ -143,6 +143,8 @@ struct ContextConfig { legacy_net_cfg: Option, #[cfg(feature = "net")] legacy_mac: Option<[u8; 6]>, + #[cfg(feature = "net")] + disable_tsi: bool, net_index: u8, tsi_port_map: Option>, vsock_config: VsockConfig, @@ -1063,6 +1065,20 @@ pub unsafe extern "C" fn krun_add_net_unixgram( KRUN_SUCCESS } +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(feature = "net")] +pub unsafe extern "C" fn krun_disable_tsi(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.disable_tsi = true; + } + Entry::Vacant(_) => return -libc::ENOENT, + } + KRUN_SUCCESS +} + #[allow(clippy::missing_safety_doc)] #[no_mangle] #[cfg(all(target_os = "linux", feature = "net"))] @@ -1093,7 +1109,9 @@ pub unsafe extern "C" fn krun_add_net_tap( if features & (NET_FEATURE_GUEST_TSO4 | NET_FEATURE_GUEST_TSO6 | NET_FEATURE_GUEST_UFO) != 0 && features & NET_FEATURE_GUEST_CSUM == 0 { - debug!("Network tap backend requires GUEST_CSUM to be requested if any of GUEST_TSO4, GUEST_TSO6 and/or GUEST_UFO are required"); + debug!( + "Network tap backend requires GUEST_CSUM to be requested if any of GUEST_TSO4, GUEST_TSO6 and/or GUEST_UFO are required" + ); return -libc::EINVAL; } @@ -1922,7 +1940,7 @@ pub extern "C" fn krun_has_feature(feature: u64) -> c_int { pub extern "C" fn krun_get_max_vcpus() -> i32 { #[cfg(target_os = "macos")] { - use hvf::bindings::{hv_vm_get_max_vcpu_count, HV_SUCCESS}; + use hvf::bindings::{HV_SUCCESS, hv_vm_get_max_vcpu_count}; let mut max_vcpu_count: u32 = 0; let ret = unsafe { hv_vm_get_max_vcpu_count(&mut max_vcpu_count as *mut u32) }; if ret == HV_SUCCESS { diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 80609eb6b..898bb56ad 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -3,9 +3,9 @@ //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM. +use crossbeam_channel::Sender; #[cfg(target_os = "macos")] use crossbeam_channel::unbounded; -use crossbeam_channel::Sender; use kernel::cmdline::Cmdline; #[cfg(target_os = "macos")] use std::collections::HashMap; @@ -45,7 +45,7 @@ use devices::legacy::{IoApic, IrqChipT}; use devices::legacy::{IrqChip, IrqChipDevice}; #[cfg(all(target_os = "linux", target_arch = "aarch64"))] use devices::legacy::{KvmGicV2, KvmGicV3}; -use devices::virtio::{port_io, MmioTransport, PortDescription, VirtioDevice, Vsock}; +use devices::virtio::{MmioTransport, PortDescription, VirtioDevice, Vsock, port_io}; #[cfg(feature = "tee")] use kbs_types::Tee; @@ -72,7 +72,7 @@ use device_manager::shm::ShmManager; use devices::virtio::display::DisplayInfo; #[cfg(feature = "gpu")] use devices::virtio::display::NoopDisplayBackend; -#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +#[cfg(not(any(feature = "tee", feature = "nitro")))] use devices::virtio::{fs::ExportTable, VirtioShmRegion}; use flate2::read::GzDecoder; #[cfg(feature = "gpu")] @@ -90,13 +90,15 @@ use utils::eventfd::EventFd; use utils::worker_message::WorkerMessage; #[cfg(all(target_arch = "x86_64", not(feature = "tee")))] use vm_memory::mmap::MmapRegion; -#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +#[cfg(not(any(feature = "tee", feature = "nitro")))] use vm_memory::Address; use vm_memory::Bytes; #[cfg(not(feature = "aws-nitro"))] use vm_memory::GuestMemory; #[cfg(all(target_arch = "x86_64", not(feature = "tee")))] use vm_memory::GuestRegionMmap; +#[cfg(all(target_arch = "x86_64", not(feature = "efi"), not(feature = "tee")))] +use vm_memory::mmap::MmapRegion; use vm_memory::{GuestAddress, GuestMemoryMmap}; /// Errors associated with starting the instance. From ef33551cf26f61fadf67aceb82cdea2b41639fd9 Mon Sep 17 00:00:00 2001 From: lile Date: Thu, 16 Apr 2026 17:33:05 +0800 Subject: [PATCH 02/33] feat: add Windows WHPX hypervisor backend Add WHPX (Windows Hypervisor Platform) VMM backend enabling libkrun to run on Windows alongside existing KVM and Hypervisor.framework backends. - 33 new VMM files in src/vmm/src/windows/ (WHPX bindings, vCPU loop, memory management, interrupt controller, device emulation) - C API entry points in src/libkrun/src/windows_api.rs - Unix stub functions in lib.rs for krun_start/wait/stop/console/net Co-Authored-By: Claude Opus 4.6 --- src/libkrun/src/lib.rs | 62 +- src/libkrun/src/windows_api.rs | 810 ++++++++++ src/vm-memory/Cargo.lock | 689 +++++++++ src/vmm/src/lib.rs | 2 + src/vmm/src/windows/boot/loader.rs | 451 ++++++ src/vmm/src/windows/boot/mod.rs | 5 + src/vmm/src/windows/boot/params.rs | 245 +++ src/vmm/src/windows/boot/setup.rs | 315 ++++ src/vmm/src/windows/cmdline.rs | 181 +++ src/vmm/src/windows/context.rs | 306 ++++ src/vmm/src/windows/devices/manager.rs | 705 +++++++++ src/vmm/src/windows/devices/mod.rs | 7 + src/vmm/src/windows/devices/pic.rs | 683 +++++++++ src/vmm/src/windows/devices/pit.rs | 648 ++++++++ src/vmm/src/windows/devices/serial.rs | 381 +++++ src/vmm/src/windows/devices/virtio/block.rs | 635 ++++++++ src/vmm/src/windows/devices/virtio/disk.rs | 1078 ++++++++++++++ src/vmm/src/windows/devices/virtio/mmio.rs | 663 +++++++++ src/vmm/src/windows/devices/virtio/mod.rs | 16 + src/vmm/src/windows/devices/virtio/net.rs | 893 +++++++++++ .../windows/devices/virtio/p9/filesystem.rs | 1089 ++++++++++++++ src/vmm/src/windows/devices/virtio/p9/mod.rs | 820 ++++++++++ .../src/windows/devices/virtio/p9/protocol.rs | 1316 +++++++++++++++++ src/vmm/src/windows/devices/virtio/queue.rs | 699 +++++++++ .../devices/virtio/vsock/connection.rs | 681 +++++++++ .../src/windows/devices/virtio/vsock/mod.rs | 1238 ++++++++++++++++ .../windows/devices/virtio/vsock/packet.rs | 449 ++++++ src/vmm/src/windows/error.rs | 116 ++ src/vmm/src/windows/insn.rs | 662 +++++++++ src/vmm/src/windows/memory.rs | 325 ++++ src/vmm/src/windows/mod.rs | 16 + src/vmm/src/windows/runner.rs | 531 +++++++ src/vmm/src/windows/types.rs | 112 ++ src/vmm/src/windows/vcpu.rs | 112 ++ src/vmm/src/windows/whpx.rs | 872 +++++++++++ 35 files changed, 17812 insertions(+), 1 deletion(-) create mode 100644 src/libkrun/src/windows_api.rs create mode 100644 src/vm-memory/Cargo.lock create mode 100644 src/vmm/src/windows/boot/loader.rs create mode 100644 src/vmm/src/windows/boot/mod.rs create mode 100644 src/vmm/src/windows/boot/params.rs create mode 100644 src/vmm/src/windows/boot/setup.rs create mode 100644 src/vmm/src/windows/cmdline.rs create mode 100644 src/vmm/src/windows/context.rs create mode 100644 src/vmm/src/windows/devices/manager.rs create mode 100644 src/vmm/src/windows/devices/mod.rs create mode 100644 src/vmm/src/windows/devices/pic.rs create mode 100644 src/vmm/src/windows/devices/pit.rs create mode 100644 src/vmm/src/windows/devices/serial.rs create mode 100644 src/vmm/src/windows/devices/virtio/block.rs create mode 100644 src/vmm/src/windows/devices/virtio/disk.rs create mode 100644 src/vmm/src/windows/devices/virtio/mmio.rs create mode 100644 src/vmm/src/windows/devices/virtio/mod.rs create mode 100644 src/vmm/src/windows/devices/virtio/net.rs create mode 100644 src/vmm/src/windows/devices/virtio/p9/filesystem.rs create mode 100644 src/vmm/src/windows/devices/virtio/p9/mod.rs create mode 100644 src/vmm/src/windows/devices/virtio/p9/protocol.rs create mode 100644 src/vmm/src/windows/devices/virtio/queue.rs create mode 100644 src/vmm/src/windows/devices/virtio/vsock/connection.rs create mode 100644 src/vmm/src/windows/devices/virtio/vsock/mod.rs create mode 100644 src/vmm/src/windows/devices/virtio/vsock/packet.rs create mode 100644 src/vmm/src/windows/error.rs create mode 100644 src/vmm/src/windows/insn.rs create mode 100644 src/vmm/src/windows/memory.rs create mode 100644 src/vmm/src/windows/mod.rs create mode 100644 src/vmm/src/windows/runner.rs create mode 100644 src/vmm/src/windows/types.rs create mode 100644 src/vmm/src/windows/vcpu.rs create mode 100644 src/vmm/src/windows/whpx.rs diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 9ac95e483..ce13b10e2 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -1,6 +1,12 @@ #[macro_use] extern crate log; +// On Windows, the entire C API is implemented in windows_api.rs, +// delegating to vmm::windows::* instead of the Unix VMM infrastructure. +#[cfg(target_os = "windows")] +mod windows_api; + +#[cfg(not(target_os = "windows"))] use crossbeam_channel::unbounded; #[cfg(feature = "blk")] use devices::virtio::CacheType; @@ -2783,7 +2789,61 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { } } -#[cfg(feature = "aws-nitro")] +// ============================================================================ +// New functions — Unix stubs (full implementations in windows_api.rs) +// ============================================================================ + +/// Start VM on a background thread (non-blocking). +/// Not yet implemented on Unix — use krun_start_enter() instead. +#[cfg(not(target_os = "windows"))] +#[no_mangle] +pub extern "C" fn krun_start(_ctx_id: u32) -> i32 { + -libc::ENOSYS +} + +/// Block until a running VM exits. Returns exit code. +/// Not yet implemented on Unix. +#[cfg(not(target_os = "windows"))] +#[no_mangle] +pub extern "C" fn krun_wait(_ctx_id: u32) -> i32 { + -libc::ENOSYS +} + +/// Request a running VM to stop (non-blocking). +/// Not yet implemented on Unix. +#[cfg(not(target_os = "windows"))] +#[no_mangle] +pub extern "C" fn krun_stop(_ctx_id: u32) -> i32 { + -libc::ENOSYS +} + +/// Get captured console output for a VM. +/// Not yet implemented on Unix. +#[cfg(not(target_os = "windows"))] +#[no_mangle] +pub unsafe extern "C" fn krun_get_console_output( + _ctx_id: u32, + _buf: *mut u8, + _buf_size: u32, +) -> i32 { + -libc::ENOSYS +} + +/// Add a network device backed by a TCP endpoint. +/// Not yet implemented on Unix — use krun_add_net_unixstream/unixgram instead. +#[cfg(not(target_os = "windows"))] +#[no_mangle] +pub unsafe extern "C" fn krun_add_net( + _ctx_id: u32, + _c_endpoint: *const c_char, + _c_mac: *const u8, +) -> i32 { + -libc::ENOSYS +} + +// ============================================================================ + +#[cfg(feature = "nitro")] #[no_mangle] fn krun_start_enter_nitro(ctx_id: u32) -> i32 { let ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { diff --git a/src/libkrun/src/windows_api.rs b/src/libkrun/src/windows_api.rs new file mode 100644 index 000000000..5828fb295 --- /dev/null +++ b/src/libkrun/src/windows_api.rs @@ -0,0 +1,810 @@ +//! Windows C API implementation for libkrun. +//! +//! All functions follow the libkrun convention: +//! - Return 0 on success, negative on error +//! - Context IDs are u32 +//! - Strings are null-terminated C strings +//! +//! On Windows, functions delegate to `vmm::windows::*` instead of the +//! Unix-specific VMM infrastructure. + +use std::ffi::CStr; +use std::os::raw::c_char; +use std::path::PathBuf; + +use vmm::windows::context::{ + self, DiskConfig, FsMount, NetConfig, VsockPort, DISK_FORMAT_QCOW2, DISK_FORMAT_RAW, +}; +use vmm::windows::devices::manager as devices; +use vmm::windows::error::{Result, WkrunError}; +use vmm::windows::types::VmState; + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Convert a Result to a C API return code (0 = success, negative = error). +fn to_c_result(result: Result<()>) -> i32 { + match result { + Ok(()) => 0, + Err(ref e) => { + log::error!("{}", e); + i32::from(e) + } + } +} + +/// Convert a C string to a Rust PathBuf. Returns None for null pointers. +/// +/// # Safety +/// +/// The pointer must be null or point to a valid null-terminated C string. +unsafe fn c_str_to_path(ptr: *const c_char) -> Option { + if ptr.is_null() { + None + } else { + Some(PathBuf::from( + CStr::from_ptr(ptr).to_string_lossy().into_owned(), + )) + } +} + +/// Convert a C string to a Rust String. Returns None for null pointers. +/// +/// # Safety +/// +/// The pointer must be null or point to a valid null-terminated C string. +unsafe fn c_str_to_string(ptr: *const c_char) -> Option { + if ptr.is_null() { + None + } else { + Some(CStr::from_ptr(ptr).to_string_lossy().into_owned()) + } +} + +/// Convert a null-terminated array of C strings to a Vec. +/// +/// # Safety +/// +/// `arr` must be null or point to a null-terminated array of null-terminated C strings. +unsafe fn c_str_array_to_vec(arr: *const *const c_char) -> Vec { + if arr.is_null() { + return Vec::new(); + } + let mut result = Vec::new(); + let mut ptr = arr; + loop { + let s = *ptr; + if s.is_null() { + break; + } + result.push(CStr::from_ptr(s).to_string_lossy().into_owned()); + ptr = ptr.add(1); + } + result +} + +// Maximum number of arguments/environment variables we allow. +const MAX_ARGS: usize = 4096; + +// ============================================================================ +// Logging +// ============================================================================ + +#[no_mangle] +pub extern "C" fn krun_set_log_level(level: u32) -> i32 { + let filter = match level { + 0 => log::LevelFilter::Off, + 1 => log::LevelFilter::Error, + 2 => log::LevelFilter::Warn, + 3 => log::LevelFilter::Info, + 4 => log::LevelFilter::Debug, + 5 => log::LevelFilter::Trace, + _ => return -libc::EINVAL, + }; + log::set_max_level(filter); + 0 +} + +#[no_mangle] +pub unsafe extern "C" fn krun_init_log( + _target: i32, + level: u32, + _style: u32, + _options: u32, +) -> i32 { + let env_filter = match level { + 0 => "off", + 1 => "error", + 2 => "warn", + 3 => "info", + 4 => "debug", + 5 => "trace", + _ => "warn", + }; + let _ = env_logger::Builder::new() + .parse_filters(env_filter) + .try_init(); + 0 +} + +// ============================================================================ +// Context management +// ============================================================================ + +#[no_mangle] +pub extern "C" fn krun_create_ctx() -> i32 { + match context::create_ctx() { + Ok(id) => id as i32, + Err(e) => { + log::error!("krun_create_ctx: {}", e); + -1 + } + } +} + +#[no_mangle] +pub extern "C" fn krun_free_ctx(ctx_id: u32) -> i32 { + to_c_result(context::free_ctx(ctx_id)) +} + +// ============================================================================ +// VM configuration +// ============================================================================ + +#[no_mangle] +pub extern "C" fn krun_set_vm_config(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + if ctx.state != VmState::Created { + return Err(WkrunError::InvalidState { + expected: "Created", + actual: ctx.state.to_string(), + }); + } + if num_vcpus == 0 { + return Err(WkrunError::Config("num_vcpus must be > 0".into())); + } + if ram_mib == 0 { + return Err(WkrunError::Config("ram_mib must be > 0".into())); + } + ctx.num_vcpus = num_vcpus; + ctx.ram_mib = ram_mib; + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + ctx.root_path = c_str_to_path(c_root_path); + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_virtiofs( + ctx_id: u32, + c_tag: *const c_char, + c_path: *const c_char, +) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + let tag = c_str_to_string(c_tag) + .ok_or_else(|| WkrunError::Config("virtiofs tag cannot be null".into()))?; + let path = c_str_to_path(c_path) + .ok_or_else(|| WkrunError::Config("virtiofs path cannot be null".into()))?; + ctx.fs_mounts.push(FsMount { + tag, + host_path: path, + }); + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_virtiofs2( + ctx_id: u32, + c_tag: *const c_char, + c_path: *const c_char, + _port: u32, +) -> i32 { + // On Windows, virtiofs2 is treated the same as virtiofs (no port parameter needed). + krun_add_virtiofs(ctx_id, c_tag, c_path) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_disk2( + ctx_id: u32, + c_block_id: *const c_char, + c_disk_path: *const c_char, + disk_format: u32, + read_only: bool, +) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + let id = c_str_to_string(c_block_id) + .ok_or_else(|| WkrunError::Config("block_id cannot be null".into()))?; + let path = c_str_to_path(c_disk_path) + .ok_or_else(|| WkrunError::Config("disk_path cannot be null".into()))?; + if disk_format != DISK_FORMAT_RAW && disk_format != DISK_FORMAT_QCOW2 { + return Err(WkrunError::Config(format!( + "unsupported disk format: {}", + disk_format + ))); + } + ctx.disks.push(DiskConfig { + block_id: id, + path, + format: disk_format, + read_only, + }); + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_disk( + ctx_id: u32, + c_block_id: *const c_char, + c_disk_path: *const c_char, + read_only: bool, +) -> i32 { + krun_add_disk2(ctx_id, c_block_id, c_disk_path, DISK_FORMAT_RAW, read_only) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_vsock_port2( + ctx_id: u32, + port: u32, + c_filepath: *const c_char, + listen: bool, +) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + let path = c_str_to_path(c_filepath) + .ok_or_else(|| WkrunError::Config("vsock filepath cannot be null".into()))?; + ctx.vsock_ports.push(VsockPort { + port, + host_path: path, + listen, + host_tcp_port: None, + }); + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_exec( + ctx_id: u32, + c_exec_path: *const c_char, + argv: *const *const c_char, + envp: *const *const c_char, +) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + ctx.exec_path = c_str_to_string(c_exec_path); + let args = c_str_array_to_vec(argv); + if args.len() > MAX_ARGS { + return Err(WkrunError::Config(format!( + "too many arguments: {} > {}", + args.len(), + MAX_ARGS + ))); + } + ctx.argv = args; + let env = c_str_array_to_vec(envp); + if env.len() > MAX_ARGS { + return Err(WkrunError::Config(format!( + "too many env vars: {} > {}", + env.len(), + MAX_ARGS + ))); + } + ctx.envp = env; + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_env(ctx_id: u32, c_envp: *const *const c_char) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + let env = c_str_array_to_vec(c_envp); + if env.len() > MAX_ARGS { + return Err(WkrunError::Config(format!( + "too many env vars: {} > {}", + env.len(), + MAX_ARGS + ))); + } + ctx.envp = env; + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_workdir(ctx_id: u32, c_workdir_path: *const c_char) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + ctx.workdir = c_str_to_string(c_workdir_path); + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_rlimits(ctx_id: u32, c_rlimits: *const *const c_char) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + ctx.rlimits = c_str_array_to_vec(c_rlimits); + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_console_output(ctx_id: u32, c_filepath: *const c_char) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + ctx.console_output = c_str_to_path(c_filepath); + Ok(()) + })) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_kernel( + ctx_id: u32, + c_kernel_path: *const c_char, + _format: u32, + c_initramfs: *const c_char, + c_cmdline: *const c_char, +) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + ctx.kernel_path = c_str_to_path(c_kernel_path); + ctx.initramfs_path = c_str_to_path(c_initramfs); + ctx.kernel_cmdline = c_str_to_string(c_cmdline); + Ok(()) + })) +} + +// ============================================================================ +// Networking +// ============================================================================ + +/// Add a network device backed by a TCP endpoint. +/// +/// On Windows, networking uses TCP sockets to a userspace network proxy +/// (e.g., gvproxy). This replaces the Unix-specific `krun_add_net_unixstream` +/// and `krun_add_net_unixgram`. +#[no_mangle] +pub unsafe extern "C" fn krun_add_net( + ctx_id: u32, + c_endpoint: *const c_char, + c_mac: *const u8, +) -> i32 { + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + let path = c_str_to_path(c_endpoint) + .ok_or_else(|| WkrunError::Config("net endpoint cannot be null".into()))?; + let mac = if c_mac.is_null() { + vmm::windows::devices::virtio::net::generate_mac(ctx_id) + } else { + let mut buf = [0u8; 6]; + std::ptr::copy_nonoverlapping(c_mac, buf.as_mut_ptr(), 6); + buf + }; + ctx.net_config = Some(NetConfig { + mac, + socket_path: path, + }); + Ok(()) + })) +} + +/// Unix stream networking — not available on Windows. +#[no_mangle] +pub unsafe extern "C" fn krun_add_net_unixstream( + _ctx_id: u32, + _c_path: *const c_char, + _fd: i32, + _c_mac: *const u8, + _features: u32, + _flags: u32, +) -> i32 { + log::warn!("krun_add_net_unixstream: not available on Windows, use krun_add_net"); + -libc::ENOSYS +} + +/// Unix dgram networking — not available on Windows. +#[no_mangle] +pub unsafe extern "C" fn krun_add_net_unixgram( + _ctx_id: u32, + _c_path: *const c_char, + _fd: i32, + _c_mac: *const u8, + _features: u32, + _flags: u32, +) -> i32 { + log::warn!("krun_add_net_unixgram: not available on Windows, use krun_add_net"); + -libc::ENOSYS +} + +// ============================================================================ +// No-ops on Windows +// ============================================================================ + +#[no_mangle] +pub extern "C" fn krun_setuid(_ctx_id: u32, _uid: u32) -> i32 { + 0 // No-op on Windows +} + +#[no_mangle] +pub extern "C" fn krun_setgid(_ctx_id: u32, _gid: u32) -> i32 { + 0 // No-op on Windows +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_gpu_options(_ctx_id: u32, _virgl_flags: u32) -> i32 { + 0 // No-op +} + +#[no_mangle] +pub extern "C" fn krun_split_irqchip(_ctx_id: u32, _enable: bool) -> i32 { + 0 // No-op on Windows +} + +#[no_mangle] +pub unsafe extern "C" fn krun_disable_tsi(_ctx_id: u32) -> i32 { + 0 // No-op on Windows (no TSI) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_nested_virt(_ctx_id: u32, _enabled: bool) -> i32 { + 0 // No-op on Windows +} + +#[no_mangle] +pub unsafe extern "C" fn krun_check_nested_virt() -> i32 { + 0 // Not supported on Windows +} + +#[no_mangle] +pub extern "C" fn krun_get_max_vcpus() -> i32 { + // WHPX supports up to 64 vCPUs, but we cap at a reasonable default. + 64 +} + +#[no_mangle] +pub extern "C" fn krun_get_shutdown_eventfd(_ctx_id: u32) -> i32 { + -libc::ENOSYS // eventfd not available on Windows +} + +#[no_mangle] +pub extern "C" fn krun_disable_implicit_console(_ctx_id: u32) -> i32 { + 0 // No-op +} + +// Stubs for functions that reference Unix-only features. +#[no_mangle] +pub unsafe extern "C" fn krun_set_root_disk(_ctx_id: u32, _c_disk_path: *const c_char) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_data_disk(_ctx_id: u32, _c_disk_path: *const c_char) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_root_disk_remount( + _ctx_id: u32, + _device: *const c_char, + _fstype: *const c_char, + _options: *const c_char, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_mapped_volumes( + _ctx_id: u32, + _c_mapped_volumes: *const *const c_char, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_port_map( + _ctx_id: u32, + _c_port_map: *const *const c_char, +) -> i32 { + 0 // No-op +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_passt_fd(_ctx_id: u32, _fd: i32) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_gvproxy_path(_ctx_id: u32, _c_path: *const c_char) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_net_mac(_ctx_id: u32, _c_mac: *const u8) -> i32 { + 0 // No-op, use krun_add_net +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_snd_device(_ctx_id: u32, _enable: bool) -> i32 { + 0 // No-op +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_firmware(_ctx_id: u32, _c_path: *const c_char) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_smbios_oem_strings( + _ctx_id: u32, + _strings: *const *const c_char, +) -> i32 { + 0 // No-op +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_vsock_port( + ctx_id: u32, + port: u32, + c_filepath: *const c_char, +) -> i32 { + krun_add_vsock_port2(ctx_id, port, c_filepath, false) +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_tee_config_file( + _ctx_id: u32, + _c_filepath: *const c_char, +) -> i32 { + -libc::ENOSYS +} + +// ============================================================================ +// VM lifecycle +// ============================================================================ + +/// Start and enter the VM (blocking). Returns exit code. +#[no_mangle] +pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { + let ctx = match context::take_ctx(ctx_id) { + Ok(ctx) => ctx, + Err(e) => { + log::error!("krun_start_enter: {}", e); + return i32::from(&e); + } + }; + + match vmm::windows::runner::run(ctx) { + Ok(exit_code) => exit_code, + Err(ref e) => { + log::error!("krun_start_enter: {}", e); + i32::from(e) + } + } +} + +/// Start VM on a background thread (non-blocking). Returns 0 on success. +#[no_mangle] +pub extern "C" fn krun_start(ctx_id: u32) -> i32 { + let ctx = match context::take_ctx(ctx_id) { + Ok(ctx) => ctx, + Err(ref e) => { + log::error!("krun_start: {}", e); + return i32::from(e); + } + }; + to_c_result(vmm::windows::runner::start(ctx_id, ctx)) +} + +/// Block until a running VM exits. Returns exit code. +#[no_mangle] +pub extern "C" fn krun_wait(ctx_id: u32) -> i32 { + match vmm::windows::runner::wait(ctx_id) { + Ok(exit_code) => exit_code, + Err(ref e) => { + log::error!("krun_wait: {}", e); + i32::from(e) + } + } +} + +/// Request a running VM to stop (non-blocking). Returns 0 on success. +#[no_mangle] +pub extern "C" fn krun_stop(ctx_id: u32) -> i32 { + to_c_result(vmm::windows::runner::stop(ctx_id)) +} + +/// Get captured console output for a VM. +/// +/// If `buf` is null or `buf_size` is 0, returns the total number of bytes available. +/// Otherwise, copies up to `buf_size` bytes into `buf` and returns the number copied. +/// Returns -1 if the ctx_id has no console buffer. +#[no_mangle] +pub unsafe extern "C" fn krun_get_console_output( + ctx_id: u32, + buf: *mut u8, + buf_size: u32, +) -> i32 { + let output = match devices::get_console_output(ctx_id) { + Some(data) => data, + None => return -1, + }; + + if buf.is_null() || buf_size == 0 { + return output.len() as i32; + } + + let copy_len = std::cmp::min(output.len(), buf_size as usize); + if copy_len > 0 { + std::ptr::copy_nonoverlapping(output.as_ptr(), buf, copy_len); + } + copy_len as i32 +} + +// ============================================================================ +// Display / Input / Console stubs (not supported on Windows) +// ============================================================================ + +#[no_mangle] +pub extern "C" fn krun_set_display_backend(_ctx_id: u32, _backend: u32) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_display(_ctx_id: u32, _width: u32, _height: u32) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub extern "C" fn krun_display_set_refresh_rate( + _ctx_id: u32, + _display_id: u32, + _rate: u32, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub extern "C" fn krun_display_set_physical_size( + _ctx_id: u32, + _display_id: u32, + _mm_width: u32, + _mm_height: u32, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub extern "C" fn krun_display_set_dpi(_ctx_id: u32, _display_id: u32, _dpi: u32) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_display_set_edid( + _ctx_id: u32, + _display_id: u32, + _edid: *const u8, + _edid_size: u32, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_input_device( + _ctx_id: u32, + _c_path: *const c_char, + _input_type: u32, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_input_device_fd(_ctx_id: u32, _input_fd: i32) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_virtio_console_default( + _ctx_id: u32, + _port_name: *const c_char, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_virtio_console_multiport(_ctx_id: u32) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_console_port_tty( + _ctx_id: u32, + _name: *const c_char, + _port_name: *const c_char, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_console_port_inout( + _ctx_id: u32, + _name: *const c_char, + _port_name: *const c_char, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_add_serial_console_default(_ctx_id: u32) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_set_kernel_console( + _ctx_id: u32, + _console_id: *const c_char, +) -> i32 { + -libc::ENOSYS +} + +// ============================================================================ +// Disk format 3 stub +// ============================================================================ + +#[no_mangle] +pub unsafe extern "C" fn krun_add_disk3( + ctx_id: u32, + c_block_id: *const c_char, + c_disk_path: *const c_char, + disk_format: u32, + read_only: bool, + _cache_type: u32, + _sync_mode: u32, +) -> i32 { + // Ignore cache_type and sync_mode on Windows, delegate to disk2. + krun_add_disk2(ctx_id, c_block_id, c_disk_path, disk_format, read_only) +} + +// ============================================================================ +// GPU options 2 stub +// ============================================================================ + +#[no_mangle] +pub unsafe extern "C" fn krun_set_gpu_options2( + _ctx_id: u32, + _virgl_flags: u32, + _shm_size: u64, +) -> i32 { + 0 // No-op +} + +// ============================================================================ +// Nitro / TEE stubs +// ============================================================================ + +#[no_mangle] +pub unsafe extern "C" fn krun_nitro_set_image( + _ctx_id: u32, + _c_image_filepath: *const c_char, +) -> i32 { + -libc::ENOSYS +} + +#[no_mangle] +pub unsafe extern "C" fn krun_nitro_set_start_flags(_ctx_id: u32, _start_flags: u64) -> i32 { + -libc::ENOSYS +} + +// ============================================================================ +// Net tap stubs +// ============================================================================ + +#[no_mangle] +pub unsafe extern "C" fn krun_add_net_tap( + _ctx_id: u32, + _tap_name: *const c_char, + _c_mac: *const u8, +) -> i32 { + -libc::ENOSYS +} diff --git a/src/vm-memory/Cargo.lock b/src/vm-memory/Cargo.lock new file mode 100644 index 000000000..0916609fe --- /dev/null +++ b/src/vm-memory/Cargo.lock @@ -0,0 +1,689 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd60e63e9be68e5fb56422e397cf9baddded06dae1d2e523401542383bc72a9f" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89cc6392a1f72bbeb820d71f32108f61fdaf18bc526e1d23954168a67759ef51" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "half" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08" + +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "matches" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "vm-memory" +version = "0.16.2" +dependencies = [ + "arc-swap", + "bitflags 2.9.1", + "criterion", + "libc", + "matches", + "thiserror", + "vmm-sys-util", + "winapi", +] + +[[package]] +name = "vmm-sys-util" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 0f0f8c258..598da766a 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -30,6 +30,8 @@ mod linux; use crate::linux::vstate; #[cfg(target_os = "macos")] mod macos; +#[cfg(target_os = "windows")] +pub mod windows; mod terminal; pub mod worker; diff --git a/src/vmm/src/windows/boot/loader.rs b/src/vmm/src/windows/boot/loader.rs new file mode 100644 index 000000000..8181fbf8a --- /dev/null +++ b/src/vmm/src/windows/boot/loader.rs @@ -0,0 +1,451 @@ +//! Linux bzImage kernel loader. +//! +//! Parses a bzImage file, loads the protected-mode kernel into guest memory, +//! sets up page tables, GDT, boot parameters, and kernel command line. + +use super::params::HDRS_MAGIC; +use super::super::error::{Result, WkrunError}; + +#[cfg(any(target_os = "windows", test))] +use super::params::{E820Entry, E820_RAM, E820_RESERVED}; + +// These imports are only used by the Windows-only load_kernel() function. +#[cfg(target_os = "windows")] +use super::params::BootParams; +#[cfg(target_os = "windows")] +use super::setup::{build_gdt, build_page_tables, configure_boot_registers, gdt_bytes}; +#[cfg(target_os = "windows")] +use super::super::memory::{ + CMDLINE_MAX_SIZE, CMDLINE_START, KERNEL_64BIT_ENTRY_OFFSET, KERNEL_START, PDPT_START, PD_START, + PML4_START, ZERO_PAGE_START, +}; +#[cfg(target_os = "windows")] +use super::super::types::{SpecialRegisters, StandardRegisters}; + +/// Loadflags bit: kernel was loaded high (at 0x100000). +#[cfg(any(target_os = "windows", test))] +const LOADED_HIGH: u8 = 0x01; + +/// Loadflags bit: can use heap (setup heap). +#[cfg(target_os = "windows")] +const CAN_USE_HEAP: u8 = 0x80; + +/// Parsed bzImage header information. +#[derive(Debug)] +pub struct KernelHeader { + /// Boot protocol version (e.g., 0x020F for 2.15). + pub protocol_version: u16, + /// Number of setup sectors (real-mode kernel). + pub setup_sects: u8, + /// Byte offset of the protected-mode kernel within the bzImage. + pub kernel_offset: usize, + /// Size of the protected-mode kernel in bytes. + pub kernel_size: usize, + /// Load flags from the setup header. + pub loadflags: u8, +} + +/// Parse a bzImage and extract header information. +/// +/// Validates the setup header magic ("HdrS") and protocol version, +/// then computes the offset and size of the protected-mode kernel. +pub fn parse_bzimage(kernel_image: &[u8]) -> Result { + // Minimum size: at least the setup header through version field (0x208). + if kernel_image.len() < 0x208 { + return Err(WkrunError::Boot(format!( + "kernel image too small: {} bytes (need at least {})", + kernel_image.len(), + 0x208 + ))); + } + + // Check "HdrS" magic at offset 0x202. + let header_magic = u32::from_le_bytes( + kernel_image[0x202..0x206] + .try_into() + .map_err(|_| WkrunError::Boot("failed to read header magic".into()))?, + ); + if header_magic != HDRS_MAGIC { + return Err(WkrunError::Boot(format!( + "invalid bzImage header magic: expected 0x{:08X} (HdrS), got 0x{:08X}", + HDRS_MAGIC, header_magic + ))); + } + + // Read boot protocol version at offset 0x206. + let protocol_version = u16::from_le_bytes( + kernel_image[0x206..0x208] + .try_into() + .map_err(|_| WkrunError::Boot("failed to read protocol version".into()))?, + ); + + // We require protocol version >= 2.06 for 64-bit boot. + if protocol_version < 0x0206 { + return Err(WkrunError::Boot(format!( + "boot protocol version 0x{:04X} too old (need >= 0x0206)", + protocol_version + ))); + } + + // Read setup_sects at offset 0x1F1. If 0, default to 4. + let mut setup_sects = kernel_image[0x1F1]; + if setup_sects == 0 { + setup_sects = 4; + } + + // Read loadflags at offset 0x211. + let loadflags = kernel_image[0x211]; + + // Protected-mode kernel starts after (setup_sects + 1) * 512 bytes. + // The "+1" accounts for the boot sector (first 512 bytes). + let kernel_offset = (setup_sects as usize + 1) * 512; + if kernel_offset >= kernel_image.len() { + return Err(WkrunError::Boot(format!( + "setup_sects {} puts kernel offset {} beyond image size {}", + setup_sects, + kernel_offset, + kernel_image.len() + ))); + } + + let kernel_size = kernel_image.len() - kernel_offset; + + Ok(KernelHeader { + protocol_version, + setup_sects, + kernel_offset, + kernel_size, + loadflags, + }) +} + +/// Build the E820 memory map for the guest. +/// +/// Creates a standard memory map with: +/// - Low memory (0 .. 0x9FC00) — 640KB conventional +/// - Reserved (0x9FC00 .. 0x100000) — BIOS area +/// - High memory (0x100000 .. ram_end) — main RAM +#[cfg(any(target_os = "windows", test))] +fn build_e820_map(ram_mib: u32) -> Vec { + let ram_bytes = (ram_mib as u64) * 1024 * 1024; + + let mut entries = Vec::new(); + + // Low memory: 0 to 640KB (conventional memory). + entries.push(E820Entry { + addr: 0, + size: 0x9FC00, + entry_type: E820_RAM, + _pad: 0, + }); + + // Reserved: 640KB to 1MB (BIOS, VGA, etc). + entries.push(E820Entry { + addr: 0x9FC00, + size: 0x100000 - 0x9FC00, + entry_type: E820_RESERVED, + _pad: 0, + }); + + // High memory: 1MB to end of RAM. + if ram_bytes > 0x100000 { + entries.push(E820Entry { + addr: 0x100000, + size: ram_bytes - 0x100000, + entry_type: E820_RAM, + _pad: 0, + }); + } + + entries +} + +/// Load a Linux bzImage kernel into guest memory and configure for boot. +/// +/// This performs the complete boot setup: +/// 1. Parse the bzImage header +/// 2. Copy the protected-mode kernel to KERNEL_START (0x100000) +/// 3. Write page tables (PML4, PDPT, PD) to guest memory +/// 4. Write GDT to guest memory +/// 5. Write boot parameters (zero page) with E820 map +/// 6. Write kernel command line +/// 7. Optionally load initrd into high guest memory +/// 8. Configure vCPU registers for 64-bit long mode entry +/// +/// Returns the initial vCPU register state. +#[cfg(target_os = "windows")] +pub fn load_kernel( + guest_mem: &super::super::memory::GuestMemory, + kernel_image: &[u8], + cmdline: &str, + ram_mib: u32, +) -> Result<(StandardRegisters, SpecialRegisters)> { + load_kernel_with_initrd(guest_mem, kernel_image, cmdline, ram_mib, None) +} + +/// Load a Linux bzImage kernel with an optional initrd. +#[cfg(target_os = "windows")] +pub fn load_kernel_with_initrd( + guest_mem: &super::super::memory::GuestMemory, + kernel_image: &[u8], + cmdline: &str, + ram_mib: u32, + initrd: Option<&[u8]>, +) -> Result<(StandardRegisters, SpecialRegisters)> { + let header = parse_bzimage(kernel_image)?; + + // Validate kernel fits in guest memory. + let kernel_end = KERNEL_START + header.kernel_size as u64; + let ram_bytes = (ram_mib as u64) * 1024 * 1024; + if kernel_end > ram_bytes { + return Err(WkrunError::Boot(format!( + "kernel ({} bytes) doesn't fit in {} MiB RAM (needs at least 0x{:X} bytes)", + header.kernel_size, ram_mib, kernel_end + ))); + } + + // Validate command line fits. + let cmdline_bytes = cmdline.as_bytes(); + if cmdline_bytes.len() as u64 + 1 > CMDLINE_MAX_SIZE { + return Err(WkrunError::Boot(format!( + "kernel command line too long: {} bytes (max {})", + cmdline_bytes.len(), + CMDLINE_MAX_SIZE - 1 + ))); + } + + // 1. Copy protected-mode kernel to KERNEL_START. + let kernel_data = &kernel_image[header.kernel_offset..]; + guest_mem.write_at_addr(KERNEL_START, kernel_data)?; + + // 2. Write page tables. + let page_tables = build_page_tables(); + guest_mem.write_at_addr(PML4_START, page_tables.pml4_bytes())?; + guest_mem.write_at_addr(PDPT_START, page_tables.pdpt_bytes())?; + for i in 0..4 { + guest_mem.write_at_addr(PD_START + i as u64 * 0x1000, page_tables.pd_bytes(i))?; + } + + // 3. Write GDT. + let gdt = build_gdt(); + let gdt_data = gdt_bytes(&gdt); + // GDT_ADDR is 0x500, defined in setup.rs. Use the constant from memory layout. + guest_mem.write_at_addr(0x500, &gdt_data)?; + + // 4. Build and write boot parameters (zero page). + let mut boot_params = BootParams::new(); + boot_params.set_boot_flag(); + boot_params.set_header_magic(); + boot_params.set_version(header.protocol_version); + boot_params.set_loader_type(0xFF); // Undefined bootloader + boot_params.set_loadflags(LOADED_HIGH | CAN_USE_HEAP); + + // Copy relevant fields from the kernel's own setup header into boot_params. + // The kernel reads some fields back from the zero page that it originally set. + copy_setup_header(&mut boot_params, kernel_image, &header); + + // Set kernel command line. + boot_params.set_cmdline_ptr(CMDLINE_START as u32); + boot_params.set_cmdline_size(cmdline_bytes.len() as u32); + + // Set E820 memory map. + let e820_map = build_e820_map(ram_mib); + boot_params.set_e820_map(&e820_map); + + // Load initrd if provided. Place at the end of RAM (page-aligned). + if let Some(initrd_data) = initrd { + if !initrd_data.is_empty() { + let initrd_size = initrd_data.len() as u64; + // Align initrd to end of RAM, start at page boundary. + let initrd_end = ram_bytes; + let initrd_start = (initrd_end - initrd_size) & !0xFFF; // Page-align down + + if initrd_start < kernel_end { + return Err(WkrunError::Boot(format!( + "initrd ({} bytes) overlaps with kernel at 0x{:X} (initrd at 0x{:X})", + initrd_size, kernel_end, initrd_start + ))); + } + + guest_mem.write_at_addr(initrd_start, initrd_data)?; + boot_params.set_ramdisk(initrd_start as u32, initrd_data.len() as u32); + } + } + + guest_mem.write_at_addr(ZERO_PAGE_START, &boot_params.data)?; + + // 5. Write kernel command line (null-terminated). + let mut cmdline_buf = cmdline_bytes.to_vec(); + cmdline_buf.push(0); // null terminator + guest_mem.write_at_addr(CMDLINE_START, &cmdline_buf)?; + + // 6. Configure vCPU registers for 64-bit long mode. + // The 64-bit entry point (startup_64) is at KERNEL_START + 0x200. + Ok(configure_boot_registers( + KERNEL_START + KERNEL_64BIT_ENTRY_OFFSET, + )) +} + +/// Copy select fields from the kernel's setup header into boot_params. +/// +/// The kernel expects certain fields in the zero page to match what it +/// originally placed in its own setup header. We copy the fields that +/// the kernel reads back during early boot. +#[cfg(target_os = "windows")] +fn copy_setup_header(boot_params: &mut BootParams, kernel_image: &[u8], header: &KernelHeader) { + // setup_sects at offset 0x1F1. + boot_params.data[0x1F1] = header.setup_sects; + + // Copy the setup header region (0x1F1..0x268) from the kernel image. + // This includes fields like code32_start, kernel_alignment, init_size, etc. + // that the kernel reads back during boot. + let header_end = 0x268.min(kernel_image.len()); + if header_end > 0x1F1 { + let src = &kernel_image[0x1F1..header_end]; + boot_params.data[0x1F1..header_end].copy_from_slice(src); + } + + // Override the fields we explicitly set (they take precedence over what + // was in the original kernel header). + boot_params.set_boot_flag(); + boot_params.set_header_magic(); + boot_params.set_version(header.protocol_version); + boot_params.set_loader_type(0xFF); + boot_params.set_loadflags(LOADED_HIGH | CAN_USE_HEAP); +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Build a minimal valid bzImage header for testing. + fn make_test_bzimage(setup_sects: u8, protocol_version: u16, kernel_payload: &[u8]) -> Vec { + // Total real-mode size: (setup_sects + 1) * 512 + let real_mode_size = (setup_sects as usize + 1) * 512; + let mut image = vec![0u8; real_mode_size + kernel_payload.len()]; + + // setup_sects at 0x1F1 + image[0x1F1] = setup_sects; + + // "HdrS" magic at 0x202 + image[0x202..0x206].copy_from_slice(&HDRS_MAGIC.to_le_bytes()); + + // Protocol version at 0x206 + image[0x206..0x208].copy_from_slice(&protocol_version.to_le_bytes()); + + // Loadflags at 0x211 (LOADED_HIGH) + image[0x211] = LOADED_HIGH; + + // Copy kernel payload after real-mode code + image[real_mode_size..].copy_from_slice(kernel_payload); + + image + } + + #[test] + fn test_parse_bzimage_valid() { + let kernel_payload = vec![0xCC; 1024]; // 1KB of int3 + let image = make_test_bzimage(4, 0x020F, &kernel_payload); + + let header = parse_bzimage(&image).expect("should parse valid bzImage"); + assert_eq!(header.protocol_version, 0x020F); + assert_eq!(header.setup_sects, 4); + assert_eq!(header.kernel_offset, (4 + 1) * 512); + assert_eq!(header.kernel_size, 1024); + assert_eq!(header.loadflags & LOADED_HIGH, LOADED_HIGH); + } + + #[test] + fn test_parse_bzimage_setup_sects_zero_defaults_to_4() { + // setup_sects=0 defaults to 4, so kernel_offset = (4+1)*512 = 2560. + // Build image large enough to accommodate this. + let mut image = vec![0u8; (4 + 1) * 512 + 512]; // real-mode + kernel + image[0x1F1] = 0; // setup_sects = 0 + image[0x202..0x206].copy_from_slice(&HDRS_MAGIC.to_le_bytes()); + image[0x206..0x208].copy_from_slice(&0x0206u16.to_le_bytes()); + image[0x211] = LOADED_HIGH; + + let header = parse_bzimage(&image).expect("should parse with setup_sects=0"); + assert_eq!(header.setup_sects, 4); // defaulted from 0 + assert_eq!(header.kernel_offset, (4 + 1) * 512); + } + + #[test] + fn test_parse_bzimage_too_small() { + let image = vec![0u8; 100]; // Way too small + let err = parse_bzimage(&image).unwrap_err(); + assert!( + err.to_string().contains("too small"), + "unexpected error: {}", + err + ); + } + + #[test] + fn test_parse_bzimage_bad_magic() { + let mut image = vec![0u8; 0x300]; + image[0x1F1] = 1; + // Don't set "HdrS" magic + let err = parse_bzimage(&image).unwrap_err(); + assert!( + err.to_string().contains("header magic"), + "unexpected error: {}", + err + ); + } + + #[test] + fn test_parse_bzimage_old_protocol() { + let mut image = vec![0u8; 0x300]; + image[0x1F1] = 1; + image[0x202..0x206].copy_from_slice(&HDRS_MAGIC.to_le_bytes()); + image[0x206..0x208].copy_from_slice(&0x0200u16.to_le_bytes()); // too old + let err = parse_bzimage(&image).unwrap_err(); + assert!( + err.to_string().contains("too old"), + "unexpected error: {}", + err + ); + } + + #[test] + fn test_parse_bzimage_kernel_offset_beyond_image() { + let mut image = vec![0u8; 0x300]; // only ~768 bytes + image[0x1F1] = 10; // setup_sects=10 → offset = 11*512 = 5632 > 768 + image[0x202..0x206].copy_from_slice(&HDRS_MAGIC.to_le_bytes()); + image[0x206..0x208].copy_from_slice(&0x0206u16.to_le_bytes()); + let err = parse_bzimage(&image).unwrap_err(); + assert!( + err.to_string().contains("beyond image size"), + "unexpected error: {}", + err + ); + } + + #[test] + fn test_build_e820_map_256mb() { + let map = build_e820_map(256); + assert_eq!(map.len(), 3); + + // Low memory: 0 .. 640KB + assert_eq!(map[0].addr, 0); + assert_eq!(map[0].size, 0x9FC00); + assert_eq!(map[0].entry_type, E820_RAM); + + // Reserved: 640KB .. 1MB + assert_eq!(map[1].addr, 0x9FC00); + assert_eq!(map[1].entry_type, E820_RESERVED); + + // High memory: 1MB .. 256MB + assert_eq!(map[2].addr, 0x100000); + assert_eq!(map[2].size, 256 * 1024 * 1024 - 0x100000); + assert_eq!(map[2].entry_type, E820_RAM); + } + + #[test] + fn test_build_e820_map_1mb_no_high_memory() { + // With only 1MB of RAM, high memory region should be empty (1MB - 1MB = 0). + let map = build_e820_map(1); + assert_eq!(map.len(), 2, "1MB RAM should only have low + reserved"); + } +} diff --git a/src/vmm/src/windows/boot/mod.rs b/src/vmm/src/windows/boot/mod.rs new file mode 100644 index 000000000..80f348f48 --- /dev/null +++ b/src/vmm/src/windows/boot/mod.rs @@ -0,0 +1,5 @@ +//! Linux kernel boot support for x86_64 (Windows WHPX backend). + +pub mod loader; +pub mod params; +pub mod setup; diff --git a/src/vmm/src/windows/boot/params.rs b/src/vmm/src/windows/boot/params.rs new file mode 100644 index 000000000..b1c410e6b --- /dev/null +++ b/src/vmm/src/windows/boot/params.rs @@ -0,0 +1,245 @@ +//! Linux boot_params (zero page) structure. +//! +//! Subset of the Linux boot protocol's boot_params structure +//! needed for direct bzImage boot. + +use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; + +/// E820 memory map entry type constants. +pub const E820_RAM: u32 = 1; +pub const E820_RESERVED: u32 = 2; +pub const E820_ACPI: u32 = 3; + +/// Linux boot protocol magic number. +pub const BOOT_MAGIC: u16 = 0xAA55; + +/// Header magic "HdrS". +pub const HDRS_MAGIC: u32 = 0x5372_6448; + +/// Minimum boot protocol version we support (2.06+). +pub const MIN_BOOT_PROTOCOL: u16 = 0x0206; + +/// E820 memory map entry. +#[repr(C)] +#[derive(Debug, Default, Clone, Copy, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct E820Entry { + pub addr: u64, + pub size: u64, + pub entry_type: u32, + pub _pad: u32, +} + +/// Minimal subset of Linux setup_header structure. +/// Located at offset 0x1F1 in the zero page. +#[repr(C, packed)] +#[derive(Debug, Default, Clone, Copy)] +pub struct SetupHeader { + pub setup_sects: u8, + pub root_flags: u16, + pub syssize: u32, + pub ram_size: u16, + pub vid_mode: u16, + pub root_dev: u16, + pub boot_flag: u16, + pub jump: u16, + pub header: u32, + pub version: u16, + pub realmode_swtch: u32, + pub start_sys_seg: u16, + pub kernel_version: u16, + pub type_of_loader: u8, + pub loadflags: u8, + pub setup_move_size: u16, + pub code32_start: u32, + pub ramdisk_image: u32, + pub ramdisk_size: u32, + pub bootsect_kludge: u32, + pub heap_end_ptr: u16, + pub ext_loader_ver: u8, + pub ext_loader_type: u8, + pub cmd_line_ptr: u32, + pub initrd_addr_max: u32, + pub kernel_alignment: u32, + pub relocatable_kernel: u8, + pub min_alignment: u8, + pub xloadflags: u16, + pub cmdline_size: u32, + pub hardware_subarch: u32, + pub hardware_subarch_data: u64, + pub payload_offset: u32, + pub payload_length: u32, + pub setup_data: u64, + pub pref_address: u64, + pub init_size: u32, + pub handover_offset: u32, +} + +/// Boot parameters (zero page) — the key structure passed to the Linux kernel. +pub struct BootParams { + /// The raw 4096-byte zero page buffer. + pub data: [u8; 4096], +} + +impl Default for BootParams { + fn default() -> Self { + BootParams { data: [0u8; 4096] } + } +} + +impl BootParams { + /// Create a new BootParams with default values. + pub fn new() -> Self { + Self::default() + } + + /// Set the E820 memory map. + pub fn set_e820_map(&mut self, entries: &[E820Entry]) { + let count = entries.len().min(128) as u8; + self.data[0x1E8] = count; + + let base_offset = 0x2D0; + for (i, entry) in entries.iter().take(128).enumerate() { + let offset = base_offset + i * 20; + self.data[offset..offset + 8].copy_from_slice(&entry.addr.to_le_bytes()); + self.data[offset + 8..offset + 16].copy_from_slice(&entry.size.to_le_bytes()); + self.data[offset + 16..offset + 20].copy_from_slice(&entry.entry_type.to_le_bytes()); + } + } + + /// Set the command line pointer. + pub fn set_cmdline_ptr(&mut self, addr: u32) { + self.data[0x228..0x22C].copy_from_slice(&addr.to_le_bytes()); + } + + /// Set the command line size. + pub fn set_cmdline_size(&mut self, size: u32) { + self.data[0x238..0x23C].copy_from_slice(&size.to_le_bytes()); + } + + /// Set the boot flag (must be 0xAA55). + pub fn set_boot_flag(&mut self) { + self.data[0x1FE..0x200].copy_from_slice(&BOOT_MAGIC.to_le_bytes()); + } + + /// Set the setup header magic ("HdrS"). + pub fn set_header_magic(&mut self) { + self.data[0x202..0x206].copy_from_slice(&HDRS_MAGIC.to_le_bytes()); + } + + /// Set the boot protocol version. + pub fn set_version(&mut self, version: u16) { + self.data[0x206..0x208].copy_from_slice(&version.to_le_bytes()); + } + + /// Set the type_of_loader field (0xFF = undefined bootloader). + pub fn set_loader_type(&mut self, loader_type: u8) { + self.data[0x210] = loader_type; + } + + /// Set load flags. + pub fn set_loadflags(&mut self, flags: u8) { + self.data[0x211] = flags; + } + + /// Set the ramdisk image address. + pub fn set_ramdisk(&mut self, addr: u32, size: u32) { + self.data[0x218..0x21C].copy_from_slice(&addr.to_le_bytes()); + self.data[0x21C..0x220].copy_from_slice(&size.to_le_bytes()); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_boot_params_default_is_zeroed() { + let params = BootParams::new(); + assert!(params.data.iter().all(|&b| b == 0)); + } + + #[test] + fn test_boot_params_set_boot_flag() { + let mut params = BootParams::new(); + params.set_boot_flag(); + let flag = u16::from_le_bytes([params.data[0x1FE], params.data[0x1FF]]); + assert_eq!(flag, BOOT_MAGIC); + } + + #[test] + fn test_boot_params_set_header_magic() { + let mut params = BootParams::new(); + params.set_header_magic(); + let magic = u32::from_le_bytes([ + params.data[0x202], + params.data[0x203], + params.data[0x204], + params.data[0x205], + ]); + assert_eq!(magic, HDRS_MAGIC); + } + + #[test] + fn test_boot_params_set_cmdline() { + let mut params = BootParams::new(); + params.set_cmdline_ptr(0x20000); + params.set_cmdline_size(256); + + let ptr = u32::from_le_bytes(params.data[0x228..0x22C].try_into().unwrap()); + let size = u32::from_le_bytes(params.data[0x238..0x23C].try_into().unwrap()); + assert_eq!(ptr, 0x20000); + assert_eq!(size, 256); + } + + #[test] + fn test_boot_params_e820_map() { + let mut params = BootParams::new(); + let entries = vec![ + E820Entry { + addr: 0, + size: 0x9FC00, + entry_type: E820_RAM, + _pad: 0, + }, + E820Entry { + addr: 0x100000, + size: 255 * 1024 * 1024, + entry_type: E820_RAM, + _pad: 0, + }, + ]; + + params.set_e820_map(&entries); + assert_eq!(params.data[0x1E8], 2); + + let addr = u64::from_le_bytes(params.data[0x2D0..0x2D8].try_into().unwrap()); + let size = u64::from_le_bytes(params.data[0x2D8..0x2E0].try_into().unwrap()); + let etype = u32::from_le_bytes(params.data[0x2E0..0x2E4].try_into().unwrap()); + assert_eq!(addr, 0); + assert_eq!(size, 0x9FC00); + assert_eq!(etype, E820_RAM); + } + + #[test] + fn test_e820_entry_size() { + assert_eq!(std::mem::size_of::(), 24); + } + + #[test] + fn test_boot_params_loader_type() { + let mut params = BootParams::new(); + params.set_loader_type(0xFF); + assert_eq!(params.data[0x210], 0xFF); + } + + #[test] + fn test_boot_params_ramdisk() { + let mut params = BootParams::new(); + params.set_ramdisk(0x1000000, 0x500000); + + let addr = u32::from_le_bytes(params.data[0x218..0x21C].try_into().unwrap()); + let size = u32::from_le_bytes(params.data[0x21C..0x220].try_into().unwrap()); + assert_eq!(addr, 0x1000000); + assert_eq!(size, 0x500000); + } +} diff --git a/src/vmm/src/windows/boot/setup.rs b/src/vmm/src/windows/boot/setup.rs new file mode 100644 index 000000000..444e943c9 --- /dev/null +++ b/src/vmm/src/windows/boot/setup.rs @@ -0,0 +1,315 @@ +//! x86_64 boot setup — page tables, GDT, and vCPU register configuration +//! for the Windows WHPX backend. + +use super::super::types::{DescriptorTable, SegmentRegister, SpecialRegisters, StandardRegisters}; + +// Page table constants +const PAGE_PRESENT: u64 = 1 << 0; +const PAGE_WRITE: u64 = 1 << 1; +const PAGE_SIZE_2MB: u64 = 1 << 7; + +// Control register bits +const CR0_PE: u64 = 1 << 0; +const CR0_ET: u64 = 1 << 4; +const CR0_NE: u64 = 1 << 5; +const CR0_WP: u64 = 1 << 16; +const CR0_AM: u64 = 1 << 18; +const CR0_PG: u64 = 1 << 31; + +const CR4_PAE: u64 = 1 << 5; +const CR4_OSFXSR: u64 = 1 << 9; +const CR4_OSXMMEXCPT: u64 = 1 << 10; + +const EFER_LME: u64 = 1 << 8; +const EFER_LMA: u64 = 1 << 10; +const EFER_SCE: u64 = 1 << 0; + +// GDT entry access byte and flags +const GDT_CODE_ACCESS: u16 = 0xA09B; +const GDT_DATA_ACCESS: u16 = 0xC093; +const GDT_TSS_ACCESS: u16 = 0x808B; + +/// Memory addresses for page table structures. +const PML4_ADDR: u64 = 0x9000; +const PDPT_ADDR: u64 = 0xA000; +const PD_ADDR: u64 = 0xB000; +const GDT_ADDR: u64 = 0x500; +const BOOT_STACK: u64 = 0x8FF0; + +/// GDT entry indices +const GDT_NULL: usize = 0; +const GDT_CODE: usize = 1; +const GDT_DATA: usize = 2; +const GDT_TSS: usize = 3; + +/// Number of GDT entries (null + code + data + TSS = 4) +const GDT_ENTRY_COUNT: usize = 4; + +/// Build identity-mapped page tables for 4GB. +pub fn build_page_tables() -> PageTables { + let mut pml4 = [0u64; 512]; + let mut pdpt = [0u64; 512]; + let mut pd = [[0u64; 512]; 4]; + + pml4[0] = PDPT_ADDR | PAGE_PRESENT | PAGE_WRITE; + + for (i, entry) in pdpt.iter_mut().enumerate().take(4) { + *entry = (PD_ADDR + i as u64 * 0x1000) | PAGE_PRESENT | PAGE_WRITE; + } + + for (i, pd_table) in pd.iter_mut().enumerate() { + for (j, entry) in pd_table.iter_mut().enumerate() { + let phys_addr = (i as u64 * 512 + j as u64) * (2 * 1024 * 1024); + *entry = phys_addr | PAGE_PRESENT | PAGE_WRITE | PAGE_SIZE_2MB; + } + } + + PageTables { pml4, pdpt, pd } +} + +/// Page table data ready to be written to guest memory. +pub struct PageTables { + pub pml4: [u64; 512], + pub pdpt: [u64; 512], + pub pd: [[u64; 512]; 4], +} + +impl PageTables { + pub fn pml4_bytes(&self) -> &[u8] { + unsafe { std::slice::from_raw_parts(self.pml4.as_ptr() as *const u8, 512 * 8) } + } + + pub fn pdpt_bytes(&self) -> &[u8] { + unsafe { std::slice::from_raw_parts(self.pdpt.as_ptr() as *const u8, 512 * 8) } + } + + pub fn pd_bytes(&self, index: usize) -> &[u8] { + unsafe { std::slice::from_raw_parts(self.pd[index].as_ptr() as *const u8, 512 * 8) } + } +} + +/// Build the GDT entries. +pub fn build_gdt() -> Vec { + let mut gdt = vec![0u64; GDT_ENTRY_COUNT + 1]; + + gdt[GDT_NULL] = 0; + gdt[GDT_CODE] = gdt_entry(0, 0xFFFFF, GDT_CODE_ACCESS); + gdt[GDT_DATA] = gdt_entry(0, 0xFFFFF, GDT_DATA_ACCESS); + gdt[GDT_TSS] = gdt_entry(0, 0xFFFF, GDT_TSS_ACCESS); + gdt[GDT_TSS + 1] = 0; + + gdt +} + +fn gdt_entry(base: u32, limit: u32, access_rights: u16) -> u64 { + let access = (access_rights & 0xFF) as u64; + let flags = ((access_rights >> 8) & 0xF0) as u64; + let limit_low = (limit & 0xFFFF) as u64; + let limit_high = ((limit >> 16) & 0xF) as u64; + let base_low = (base & 0xFFFF) as u64; + let base_mid = ((base >> 16) & 0xFF) as u64; + let base_high = ((base >> 24) & 0xFF) as u64; + + limit_low + | (base_low << 16) + | (base_mid << 32) + | (access << 40) + | (limit_high << 48) + | (flags << 48) + | (base_high << 56) +} + +/// GDT data as bytes. +pub fn gdt_bytes(gdt: &[u64]) -> Vec { + let mut bytes = Vec::with_capacity(gdt.len() * 8); + for entry in gdt { + bytes.extend_from_slice(&entry.to_le_bytes()); + } + bytes +} + +/// Configure the initial vCPU registers for 64-bit long mode boot. +pub fn configure_boot_registers(kernel_entry: u64) -> (StandardRegisters, SpecialRegisters) { + let regs = StandardRegisters { + rip: kernel_entry, + rsp: BOOT_STACK, + rsi: super::super::memory::ZERO_PAGE_START, + rflags: 0x2, + ..Default::default() + }; + + let gdt_size = (GDT_ENTRY_COUNT + 1) * 8; + + let sregs = SpecialRegisters { + cs: SegmentRegister { + base: 0, + limit: 0xFFFF_FFFF, + selector: 0x08, + access_rights: GDT_CODE_ACCESS, + }, + ds: SegmentRegister { + base: 0, + limit: 0xFFFF_FFFF, + selector: 0x10, + access_rights: GDT_DATA_ACCESS, + }, + es: SegmentRegister { + base: 0, + limit: 0xFFFF_FFFF, + selector: 0x10, + access_rights: GDT_DATA_ACCESS, + }, + fs: SegmentRegister { + base: 0, + limit: 0xFFFF_FFFF, + selector: 0x10, + access_rights: GDT_DATA_ACCESS, + }, + gs: SegmentRegister { + base: 0, + limit: 0xFFFF_FFFF, + selector: 0x10, + access_rights: GDT_DATA_ACCESS, + }, + ss: SegmentRegister { + base: 0, + limit: 0xFFFF_FFFF, + selector: 0x10, + access_rights: GDT_DATA_ACCESS, + }, + tr: SegmentRegister { + base: 0, + limit: 0xFFFF, + selector: 0x18, + access_rights: GDT_TSS_ACCESS, + }, + ldt: SegmentRegister::default(), + gdt: DescriptorTable { + base: GDT_ADDR, + limit: (gdt_size - 1) as u16, + }, + idt: DescriptorTable { + base: 0, + limit: 0xFFFF, + }, + cr0: CR0_PE | CR0_ET | CR0_NE | CR0_WP | CR0_AM | CR0_PG, + cr2: 0, + cr3: PML4_ADDR, + cr4: CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT, + efer: EFER_LME | EFER_LMA | EFER_SCE, + }; + + (regs, sregs) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_page_tables_pml4_points_to_pdpt() { + let pt = build_page_tables(); + let entry = pt.pml4[0]; + assert_eq!(entry & !0xFFF, PDPT_ADDR); + assert_ne!(entry & PAGE_PRESENT, 0); + assert_ne!(entry & PAGE_WRITE, 0); + } + + #[test] + fn test_page_tables_pdpt_entries() { + let pt = build_page_tables(); + for i in 0..4 { + let entry = pt.pdpt[i]; + let expected_addr = PD_ADDR + i as u64 * 0x1000; + assert_eq!(entry & !0xFFF, expected_addr); + assert_ne!(entry & PAGE_PRESENT, 0); + } + for i in 4..512 { + assert_eq!(pt.pdpt[i], 0, "PDPT[{}] should be empty", i); + } + } + + #[test] + fn test_page_tables_identity_map() { + let pt = build_page_tables(); + for i in 0..4 { + for j in 0..512 { + let entry = pt.pd[i][j]; + let expected_phys = (i as u64 * 512 + j as u64) * 2 * 1024 * 1024; + assert_eq!(entry & !0xFFF, expected_phys); + assert_ne!(entry & PAGE_PRESENT, 0); + assert_ne!(entry & PAGE_SIZE_2MB, 0); + } + } + } + + #[test] + fn test_page_tables_cover_4gb() { + let pt = build_page_tables(); + let last_entry = pt.pd[3][511]; + let last_addr = last_entry & !0xFFF; + let expected = (4u64 * 1024 * 1024 * 1024) - (2 * 1024 * 1024); + assert_eq!(last_addr, expected); + } + + #[test] + fn test_gdt_has_null_entry() { + let gdt = build_gdt(); + assert_eq!(gdt[GDT_NULL], 0); + } + + #[test] + fn test_gdt_code_segment() { + let gdt = build_gdt(); + assert_ne!(gdt[GDT_CODE], 0); + } + + #[test] + fn test_gdt_data_segment() { + let gdt = build_gdt(); + assert_ne!(gdt[GDT_DATA], 0); + } + + #[test] + fn test_gdt_bytes_length() { + let gdt = build_gdt(); + let bytes = gdt_bytes(&gdt); + assert_eq!(bytes.len(), gdt.len() * 8); + } + + #[test] + fn test_boot_registers_long_mode() { + let (regs, sregs) = configure_boot_registers(0x100000); + + assert_eq!(regs.rip, 0x100000); + assert_eq!(regs.rsp, BOOT_STACK); + assert_eq!(regs.rsi, super::super::memory::ZERO_PAGE_START); + assert_ne!(regs.rflags & 0x2, 0); + assert_ne!(sregs.cr0 & CR0_PE, 0); + assert_ne!(sregs.cr0 & CR0_PG, 0); + assert_eq!(sregs.cr3, PML4_ADDR); + assert_ne!(sregs.cr4 & CR4_PAE, 0); + assert_ne!(sregs.efer & EFER_LME, 0); + assert_ne!(sregs.efer & EFER_LMA, 0); + } + + #[test] + fn test_boot_registers_segment_selectors() { + let (_, sregs) = configure_boot_registers(0x100000); + + assert_eq!(sregs.cs.selector, 0x08); + assert_eq!(sregs.ds.selector, 0x10); + assert_eq!(sregs.es.selector, 0x10); + assert_eq!(sregs.ss.selector, 0x10); + assert_eq!(sregs.tr.selector, 0x18); + } + + #[test] + fn test_gdt_entry_encoding() { + let entry = gdt_entry(0, 0xFFFFF, GDT_CODE_ACCESS); + assert_ne!(entry, 0); + + let null = gdt_entry(0, 0, 0); + assert_eq!(null, 0); + } +} diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs new file mode 100644 index 000000000..5e8989742 --- /dev/null +++ b/src/vmm/src/windows/cmdline.rs @@ -0,0 +1,181 @@ +//! Kernel command line builder for the Windows WHPX backend. + +use super::memory::VIRTIO_MMIO_BASE; + +/// Size of each virtio-MMIO device slot in bytes. +pub const MMIO_SLOT_SIZE: u64 = 0x200; + +/// IRQ number for the first MMIO device slot. +pub const FIRST_MMIO_IRQ: u8 = 5; + +/// Base kernel command line parameters. +const BASE_CMDLINE: &str = + "console=ttyS0 earlyprintk=serial,ttyS0,115200 noapic nolapic noacpi nosmp"; + +/// Description of a virtio-MMIO device slot for command line generation. +#[derive(Debug, Clone)] +pub struct MmioSlot { + /// Slot index (0-based). Determines MMIO base address and IRQ. + pub index: u8, + /// Whether the slot is active (has a device). + pub active: bool, +} + +/// Build the full kernel command line. +pub fn build_kernel_cmdline( + user_cmdline: Option<&str>, + has_root_disk: bool, + mmio_slots: &[MmioSlot], +) -> String { + let mut cmdline = BASE_CMDLINE.to_string(); + + if has_root_disk { + cmdline.push_str(" root=/dev/vda rw"); + } + + for slot in mmio_slots { + if !slot.active { + continue; + } + let base = VIRTIO_MMIO_BASE + (slot.index as u64) * MMIO_SLOT_SIZE; + let irq = FIRST_MMIO_IRQ + slot.index; + cmdline.push_str(&format!( + " virtio_mmio.device={}@0x{:x}:{}", + MMIO_SLOT_SIZE, base, irq + )); + } + + if let Some(extra) = user_cmdline { + if !extra.is_empty() { + cmdline.push(' '); + cmdline.push_str(extra); + } + } + + cmdline +} + +/// Calculate the MMIO base address for a given slot index. +pub fn mmio_base_for_slot(index: u8) -> u64 { + VIRTIO_MMIO_BASE + (index as u64) * MMIO_SLOT_SIZE +} + +/// Calculate the IRQ number for a given slot index. +pub fn irq_for_slot(index: u8) -> u8 { + FIRST_MMIO_IRQ + index +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_base_cmdline_only() { + let cmdline = build_kernel_cmdline(None, false, &[]); + assert_eq!(cmdline, BASE_CMDLINE); + } + + #[test] + fn test_with_root_disk() { + let cmdline = build_kernel_cmdline(None, true, &[]); + assert!(cmdline.contains("root=/dev/vda rw")); + assert!(cmdline.starts_with(BASE_CMDLINE)); + } + + #[test] + fn test_with_mmio_slots() { + let slots = vec![ + MmioSlot { + index: 0, + active: true, + }, + MmioSlot { + index: 1, + active: true, + }, + ]; + let cmdline = build_kernel_cmdline(None, true, &slots); + assert!(cmdline.contains("virtio_mmio.device=512@0xd0000000:5")); + assert!(cmdline.contains("virtio_mmio.device=512@0xd0000200:6")); + } + + #[test] + fn test_inactive_slots_skipped() { + let slots = vec![ + MmioSlot { + index: 0, + active: true, + }, + MmioSlot { + index: 1, + active: false, + }, + MmioSlot { + index: 2, + active: true, + }, + ]; + let cmdline = build_kernel_cmdline(None, false, &slots); + assert!(cmdline.contains("virtio_mmio.device=512@0xd0000000:5")); + assert!(!cmdline.contains("0xd0000200")); + assert!(cmdline.contains("virtio_mmio.device=512@0xd0000400:7")); + } + + #[test] + fn test_user_cmdline_appended() { + let cmdline = build_kernel_cmdline(Some("init=/bin/sh"), false, &[]); + assert!(cmdline.ends_with("init=/bin/sh")); + } + + #[test] + fn test_empty_user_cmdline_no_trailing_space() { + let cmdline = build_kernel_cmdline(Some(""), false, &[]); + assert_eq!(cmdline, BASE_CMDLINE); + } + + #[test] + fn test_mmio_base_for_slot() { + assert_eq!(mmio_base_for_slot(0), 0xD000_0000); + assert_eq!(mmio_base_for_slot(1), 0xD000_0200); + assert_eq!(mmio_base_for_slot(2), 0xD000_0400); + } + + #[test] + fn test_irq_for_slot() { + assert_eq!(irq_for_slot(0), 5); + assert_eq!(irq_for_slot(1), 6); + assert_eq!(irq_for_slot(2), 7); + } + + #[test] + fn test_full_cmdline_with_all_options() { + let slots = vec![ + MmioSlot { + index: 0, + active: true, + }, + MmioSlot { + index: 1, + active: true, + }, + MmioSlot { + index: 2, + active: true, + }, + ]; + let cmdline = build_kernel_cmdline(Some("quiet"), true, &slots); + + let base_pos = cmdline.find(BASE_CMDLINE).unwrap(); + let root_pos = cmdline.find("root=/dev/vda").unwrap(); + let mmio0_pos = cmdline.find("0xd0000000:5").unwrap(); + let mmio1_pos = cmdline.find("0xd0000200:6").unwrap(); + let mmio2_pos = cmdline.find("0xd0000400:7").unwrap(); + let user_pos = cmdline.find("quiet").unwrap(); + + assert!(base_pos < root_pos); + assert!(root_pos < mmio0_pos); + assert!(mmio0_pos < mmio1_pos); + assert!(mmio1_pos < mmio2_pos); + assert!(mmio2_pos < user_pos); + } +} diff --git a/src/vmm/src/windows/context.rs b/src/vmm/src/windows/context.rs new file mode 100644 index 000000000..1a46811ac --- /dev/null +++ b/src/vmm/src/windows/context.rs @@ -0,0 +1,306 @@ +//! VM context — configuration state machine for building a VM. +//! +//! Mirrors libkrun's KrunContext pattern: create → configure → start. + +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Mutex; + +use super::error::{Result, WkrunError}; +use super::types::VmState; + +/// Global context ID counter. +static NEXT_CTX_ID: AtomicU32 = AtomicU32::new(0); + +/// Global context map — maps context IDs to VM configurations. +/// Uses a Mutex for thread-safe access from the C API. +static CTX_MAP: std::sync::LazyLock>> = + std::sync::LazyLock::new(|| Mutex::new(HashMap::new())); + +/// Disk format constants (matching libkrun). +pub const DISK_FORMAT_RAW: u32 = 0; +pub const DISK_FORMAT_QCOW2: u32 = 1; + +/// Configuration for a virtual machine. +pub struct VmContext { + /// Context ID. + pub id: u32, + /// Current state. + pub state: VmState, + /// Number of vCPUs. + pub num_vcpus: u8, + /// RAM size in MiB. + pub ram_mib: u32, + /// Root filesystem path. + pub root_path: Option, + /// Kernel image path (for direct boot). + pub kernel_path: Option, + /// Kernel command line. + pub kernel_cmdline: Option, + /// Initramfs path. + pub initramfs_path: Option, + /// Executable path for the guest init. + pub exec_path: Option, + /// Arguments for the guest executable. + pub argv: Vec, + /// Environment variables for the guest. + pub envp: Vec, + /// Working directory in the guest. + pub workdir: Option, + /// Attached block devices. + pub disks: Vec, + /// Virtiofs/9p mounts. + pub fs_mounts: Vec, + /// Vsock port bridges. + pub vsock_ports: Vec, + /// Console output file path. + pub console_output: Option, + /// Resource limits to apply in the guest (format: "RESOURCE=CUR:MAX"). + pub rlimits: Vec, + /// Whether APIC emulation is enabled. + pub apic_emulation: bool, + /// Network device configuration. + pub net_config: Option, +} + +/// Network device configuration. +pub struct NetConfig { + /// MAC address (6 bytes). If unset, auto-generated. + pub mac: [u8; 6], + /// Path to the userspace networking proxy socket. + pub socket_path: PathBuf, +} + +/// Block device configuration. +pub struct DiskConfig { + pub block_id: String, + pub path: PathBuf, + pub format: u32, + pub read_only: bool, +} + +/// Filesystem mount configuration (virtiofs or 9p). +pub struct FsMount { + pub tag: String, + pub host_path: PathBuf, +} + +/// Vsock port bridge configuration. +pub struct VsockPort { + pub port: u32, + pub host_path: PathBuf, + pub listen: bool, + /// Optional host TCP port override. When set, the vsock bridge listens on + /// this TCP port instead of the vsock port number. Enables multiple VMs + /// to use distinct host ports for the same guest vsock port. + pub host_tcp_port: Option, +} + +impl VmContext { + fn new(id: u32) -> Self { + VmContext { + id, + state: VmState::Created, + num_vcpus: 1, + ram_mib: 256, + root_path: None, + kernel_path: None, + kernel_cmdline: None, + initramfs_path: None, + exec_path: None, + argv: Vec::new(), + envp: Vec::new(), + workdir: None, + disks: Vec::new(), + fs_mounts: Vec::new(), + vsock_ports: Vec::new(), + console_output: None, + rlimits: Vec::new(), + apic_emulation: true, + net_config: None, + } + } + + /// Create a VmContext with default values for testing. + #[cfg(test)] + pub fn default_for_test() -> Self { + Self::new(0) + } +} + +/// Create a new VM context. Returns the context ID (>= 0) on success. +pub fn create_ctx() -> Result { + let id = NEXT_CTX_ID.fetch_add(1, Ordering::Relaxed); + let ctx = VmContext::new(id); + + let mut map = CTX_MAP + .lock() + .map_err(|_| WkrunError::Config("context map lock poisoned".into()))?; + + if map.contains_key(&id) { + return Err(WkrunError::ContextExists(id)); + } + + map.insert(id, ctx); + Ok(id) +} + +/// Free (destroy) a VM context. Returns Ok(()) on success. +pub fn free_ctx(ctx_id: u32) -> Result<()> { + let mut map = CTX_MAP + .lock() + .map_err(|_| WkrunError::Config("context map lock poisoned".into()))?; + + map.remove(&ctx_id) + .ok_or(WkrunError::InvalidContext(ctx_id))?; + + Ok(()) +} + +/// Execute a closure with mutable access to a VM context. +pub fn with_ctx_mut(ctx_id: u32, f: F) -> Result +where + F: FnOnce(&mut VmContext) -> Result, +{ + let mut map = CTX_MAP + .lock() + .map_err(|_| WkrunError::Config("context map lock poisoned".into()))?; + + let ctx = map + .get_mut(&ctx_id) + .ok_or(WkrunError::InvalidContext(ctx_id))?; + + f(ctx) +} + +/// Execute a closure with read access to a VM context. +pub fn with_ctx(ctx_id: u32, f: F) -> Result +where + F: FnOnce(&VmContext) -> Result, +{ + let map = CTX_MAP + .lock() + .map_err(|_| WkrunError::Config("context map lock poisoned".into()))?; + + let ctx = map.get(&ctx_id).ok_or(WkrunError::InvalidContext(ctx_id))?; + + f(ctx) +} + +/// Take (remove) a VM context from the global map. +/// Used when starting the VM — the context is consumed. +pub fn take_ctx(ctx_id: u32) -> Result { + let mut map = CTX_MAP + .lock() + .map_err(|_| WkrunError::Config("context map lock poisoned".into()))?; + + map.remove(&ctx_id) + .ok_or(WkrunError::InvalidContext(ctx_id)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_create_and_free_ctx() { + let id = create_ctx().unwrap(); + assert!(free_ctx(id).is_ok()); + } + + #[test] + fn test_double_free_returns_error() { + let id = create_ctx().unwrap(); + assert!(free_ctx(id).is_ok()); + assert!(free_ctx(id).is_err()); + } + + #[test] + fn test_invalid_ctx_returns_error() { + assert!(free_ctx(u32::MAX).is_err()); + } + + #[test] + fn test_with_ctx_mut() { + let id = create_ctx().unwrap(); + + with_ctx_mut(id, |ctx| { + ctx.num_vcpus = 4; + ctx.ram_mib = 1024; + Ok(()) + }) + .unwrap(); + + with_ctx(id, |ctx| { + assert_eq!(ctx.num_vcpus, 4); + assert_eq!(ctx.ram_mib, 1024); + Ok(()) + }) + .unwrap(); + + free_ctx(id).unwrap(); + } + + #[test] + fn test_take_ctx() { + let id = create_ctx().unwrap(); + + with_ctx_mut(id, |ctx| { + ctx.ram_mib = 512; + Ok(()) + }) + .unwrap(); + + let ctx = take_ctx(id).unwrap(); + assert_eq!(ctx.ram_mib, 512); + + // After taking, the context should no longer exist + assert!(free_ctx(id).is_err()); + } + + #[test] + fn test_set_rlimits() { + let id = create_ctx().unwrap(); + + with_ctx_mut(id, |ctx| { + ctx.rlimits = vec![ + "RLIMIT_NOFILE=1024:4096".to_string(), + "RLIMIT_NPROC=512:1024".to_string(), + ]; + Ok(()) + }) + .unwrap(); + + with_ctx(id, |ctx| { + assert_eq!(ctx.rlimits.len(), 2); + assert_eq!(ctx.rlimits[0], "RLIMIT_NOFILE=1024:4096"); + assert_eq!(ctx.rlimits[1], "RLIMIT_NPROC=512:1024"); + Ok(()) + }) + .unwrap(); + + free_ctx(id).unwrap(); + } + + #[test] + fn test_context_defaults() { + let id = create_ctx().unwrap(); + + with_ctx(id, |ctx| { + assert_eq!(ctx.num_vcpus, 1); + assert_eq!(ctx.ram_mib, 256); + assert_eq!(ctx.state, VmState::Created); + assert!(ctx.root_path.is_none()); + assert!(ctx.kernel_path.is_none()); + assert!(ctx.disks.is_empty()); + assert!(ctx.fs_mounts.is_empty()); + assert!(ctx.vsock_ports.is_empty()); + assert!(ctx.rlimits.is_empty()); + Ok(()) + }) + .unwrap(); + + free_ctx(id).unwrap(); + } +} diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs new file mode 100644 index 000000000..77885eddf --- /dev/null +++ b/src/vmm/src/windows/devices/manager.rs @@ -0,0 +1,705 @@ +//! DeviceManager — centralized I/O port and MMIO device dispatch. +//! +//! Owns all emulated devices (Serial, PIC, PIT, CMOS/RTC, virtio-*) +//! and routes vCPU exit events to the appropriate device handlers. + +use std::collections::HashMap; +use std::fs::File; +use std::io::Write; +use std::path::Path; +use std::sync::{Arc, LazyLock, Mutex}; +use std::time::Instant; + +use super::pic::Pic; +use super::pit::Pit; +use super::serial::{Serial, COM1_BASE}; +use super::virtio::block::VirtioBlock; +use super::virtio::disk::open_disk_backend; +use super::virtio::mmio::VirtioMmioDevice; +use super::virtio::net::VirtioNet; +use super::virtio::p9::Virtio9p; +use super::virtio::queue::GuestMemoryAccessor; +use super::virtio::vsock::VirtioVsock; +use super::super::error::{Result, WkrunError}; +use super::super::cmdline::{irq_for_slot, mmio_base_for_slot, MmioSlot, MMIO_SLOT_SIZE}; +use super::super::context::VmContext; +use super::super::vcpu::IoHandler; + +/// Shared console output buffer. +pub type ConsoleBuffer = Arc>>; + +/// Writer that copies output to both an inner writer and a shared buffer. +struct TeeWriter { + inner: Box, + buffer: ConsoleBuffer, +} + +impl Write for TeeWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.buffer.lock().unwrap().extend_from_slice(buf); + self.inner.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} + +/// Global console output buffers, keyed by ctx_id. +static CONSOLE_BUFFERS: LazyLock>> = + LazyLock::new(|| Mutex::new(HashMap::new())); + +/// Store a console buffer for a VM. +pub fn store_console_buffer(ctx_id: u32, buffer: ConsoleBuffer) { + CONSOLE_BUFFERS.lock().unwrap().insert(ctx_id, buffer); +} + +/// Get a snapshot of console output for a VM. +/// +/// Returns None if no buffer exists for the given ctx_id. +pub fn get_console_output(ctx_id: u32) -> Option> { + CONSOLE_BUFFERS + .lock() + .unwrap() + .get(&ctx_id) + .map(|buf| buf.lock().unwrap().clone()) +} + +/// Remove and drop the console buffer for a VM. +pub fn remove_console_buffer(ctx_id: u32) { + CONSOLE_BUFFERS.lock().unwrap().remove(&ctx_id); +} + +/// Default guest CID for vsock (standard value for single-VM hosts). +const GUEST_CID: u64 = 3; + +/// Default vsock listen ports (BoxLite: 2695=gRPC, 2696=ready signal). +const DEFAULT_VSOCK_PORTS: &[u32] = &[2695, 2696]; + +/// CMOS register read values (static, read-only clock). +fn cmos_read(addr: u8) -> u8 { + match addr { + 0x00 => 0, // Seconds + 0x02 => 0, // Minutes + 0x04 => 12, // Hours (12 noon) + 0x06 => 3, // Day of week (Wednesday) + 0x07 => 1, // Day of month + 0x08 => 1, // Month (January) + 0x09 => 25, // Year (2025) + 0x0A => 0x26, // Status A: no update in progress, 32.768 kHz + 0x0B => 0x02, // Status B: 24-hour, BCD mode + 0x0C => 0x00, // Status C: no interrupt source + 0x0D => 0x80, // Status D: battery OK + 0x0E => 0x00, // Diagnostic status + 0x0F => 0x00, // Shutdown status + 0x10 => 0x00, // Floppy drive type + 0x12 => 0x00, // Hard drive type + 0x15 => 0x80, // Base memory low byte (640KB = 0x0280) + 0x16 => 0x02, // Base memory high byte + 0x17 => 0x00, // Extended memory low (kernel uses E820) + 0x18 => 0x00, // Extended memory high + 0x32 => 0x20, // Century (20xx) + _ => 0x00, + } +} + +/// Result of creating devices from a VmContext. +pub struct DeviceSetup { + /// The device manager. + pub devices: DeviceManager, + /// MMIO slots to include in the kernel command line. + pub mmio_slots: Vec, + /// Whether a root disk is present. + pub has_root_disk: bool, + /// Shared console output buffer (captures all serial output). + pub console_buffer: ConsoleBuffer, +} + +/// Centralized device manager for all emulated devices. +pub struct DeviceManager { + serial: Serial, + pub pic: Pic, + pit: Pit, + cmos_addr: u8, + + /// Virtio-blk device (slot 0) — optional. + virtio_blk: Option>, + /// Virtio-vsock device (slot 1). + virtio_vsock: VirtioMmioDevice, + /// Virtio-9p device (slot 2) — optional. + virtio_9p: Option>, + /// Virtio-net device (slot 3) — optional. + virtio_net: Option>, + + /// Track whether we've requested an interrupt window. + window_requested: bool, + /// Last PIT tick timestamp. + last_tick: Instant, +} + +impl DeviceManager { + /// Create all devices from a VmContext configuration. + /// + /// Returns the device manager plus MMIO slot info for the kernel cmdline. + pub fn from_context(ctx: &VmContext) -> Result { + // Serial console with capture buffer. + let console_buffer: ConsoleBuffer = Arc::new(Mutex::new(Vec::new())); + let serial = if let Some(ref path) = ctx.console_output { + let file = File::create(path).map_err(|e| { + WkrunError::Device(format!( + "failed to create console output '{}': {}", + path.display(), + e + )) + })?; + let tee = TeeWriter { + inner: Box::new(file), + buffer: console_buffer.clone(), + }; + Serial::new(COM1_BASE, Box::new(tee)) + } else { + let tee = TeeWriter { + inner: Box::new(std::io::stdout()), + buffer: console_buffer.clone(), + }; + Serial::new(COM1_BASE, Box::new(tee)) + }; + + // Virtio-blk (slot 0). + let has_root_disk = !ctx.disks.is_empty(); + let virtio_blk = if let Some(disk) = ctx.disks.first() { + let backend = open_disk_backend(&disk.path, disk.format, disk.read_only)?; + let blk = VirtioBlock::new(backend, disk.read_only); + Some(VirtioMmioDevice::new(blk)) + } else { + None + }; + + // Virtio-vsock (slot 1) — always present. + let mut vsock_backend = VirtioVsock::new(GUEST_CID); + // Listen on configured ports, or defaults. + if ctx.vsock_ports.is_empty() { + for &port in DEFAULT_VSOCK_PORTS { + let _ = vsock_backend.listen(port); + } + } else { + for vp in &ctx.vsock_ports { + let host_port = vp.host_tcp_port.unwrap_or(vp.port as u16); + let _ = vsock_backend.listen_on(vp.port, host_port); + } + } + let virtio_vsock = VirtioMmioDevice::new(vsock_backend); + + // Virtio-9p (slot 2) — optional, from fs_mounts. + let virtio_9p = ctx.fs_mounts.first().map(|mount| { + let p9 = Virtio9p::new(&mount.tag, mount.host_path.clone(), false); + VirtioMmioDevice::new(p9) + }); + + // Virtio-net (slot 3) — optional, from net_config. + let virtio_net = if let Some(ref net_cfg) = ctx.net_config { + let transport = Self::connect_net_transport(&net_cfg.socket_path)?; + let net = VirtioNet::new(net_cfg.mac, transport); + Some(VirtioMmioDevice::new(net)) + } else { + None + }; + + // Build MMIO slots for kernel cmdline. + let mmio_slots = vec![ + MmioSlot { + index: 0, + active: virtio_blk.is_some(), + }, + MmioSlot { + index: 1, + active: true, + }, // vsock always active + MmioSlot { + index: 2, + active: virtio_9p.is_some(), + }, + MmioSlot { + index: 3, + active: virtio_net.is_some(), + }, + ]; + + let devices = DeviceManager { + serial, + pic: Pic::new(), + pit: Pit::new(), + cmos_addr: 0, + virtio_blk, + virtio_vsock, + virtio_9p, + virtio_net, + window_requested: false, + last_tick: Instant::now(), + }; + + Ok(DeviceSetup { + devices, + mmio_slots, + has_root_disk, + console_buffer, + }) + } + + /// Handle an I/O port output (write) from the guest. + /// + /// Returns `true` if skip_instruction should be called after. + pub fn handle_io_out(&mut self, port: u16, size: u8, data: u32) { + if self.serial.handles_port(port) { + self.serial.io_write(port, size, data); + if self.serial.has_interrupt() { + self.pic.raise_irq(4); + } + } else if self.pic.handles_port(port) { + self.pic.write_port(port, data as u8); + } else if self.pit.handles_port(port) { + self.pit.write_port(port, data as u8); + } else if port == 0x70 { + self.cmos_addr = (data as u8) & 0x7F; + } + // Ignore writes to other ports (PS/2, etc.). + } + + /// Handle an I/O port input (read) from the guest. + /// + /// Returns the data to inject into the guest register. + pub fn handle_io_in(&mut self, port: u16, size: u8) -> u32 { + if self.serial.handles_port(port) { + let val = self.serial.io_read(port, size); + if self.serial.has_interrupt() { + self.pic.raise_irq(4); + } + val + } else if self.pic.handles_port(port) { + self.pic.read_port(port) as u32 + } else if self.pit.handles_port(port) { + self.pit.read_port(port) as u32 + } else if port == 0x71 { + cmos_read(self.cmos_addr) as u32 + } else if (0xCF8..=0xCFF).contains(&port) { + 0xFFFF_FFFF // PCI config: no devices. + } else if port == 0x61 { + 0x20 // System control port B: timer 2 output high. + } else if port == 0x92 { + 0x02 // System control port A: A20 enabled. + } else { + 0xFF // Default: no device. + } + } + + /// Handle an MMIO read from the guest. + /// + /// Returns the data to inject into the destination register. + pub fn handle_mmio_read(&self, address: u64, size: u8) -> u64 { + let blk_offset = address.wrapping_sub(mmio_base_for_slot(0)); + let vsock_offset = address.wrapping_sub(mmio_base_for_slot(1)); + let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); + let net_offset = address.wrapping_sub(mmio_base_for_slot(3)); + + if blk_offset < MMIO_SLOT_SIZE { + if let Some(ref dev) = self.virtio_blk { + dev.read(blk_offset, size) as u64 + } else { + 0 + } + } else if vsock_offset < MMIO_SLOT_SIZE { + self.virtio_vsock.read(vsock_offset, size) as u64 + } else if p9_offset < MMIO_SLOT_SIZE { + if let Some(ref dev) = self.virtio_9p { + dev.read(p9_offset, size) as u64 + } else { + 0 + } + } else if net_offset < MMIO_SLOT_SIZE { + if let Some(ref dev) = self.virtio_net { + dev.read(net_offset, size) as u64 + } else { + 0 + } + } else { + 0 + } + } + + /// Handle an MMIO write from the guest. + /// + /// Returns `true` if an interrupt should be raised. + pub fn handle_mmio_write( + &mut self, + address: u64, + size: u8, + data: u64, + mem: &dyn GuestMemoryAccessor, + ) { + let blk_offset = address.wrapping_sub(mmio_base_for_slot(0)); + let vsock_offset = address.wrapping_sub(mmio_base_for_slot(1)); + let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); + + let net_offset = address.wrapping_sub(mmio_base_for_slot(3)); + + if blk_offset < MMIO_SLOT_SIZE { + if let Some(ref mut dev) = self.virtio_blk { + if dev.write(blk_offset, data as u32, size, mem) { + self.pic.raise_irq(irq_for_slot(0)); + } + } + } else if vsock_offset < MMIO_SLOT_SIZE { + if self + .virtio_vsock + .write(vsock_offset, data as u32, size, mem) + { + self.pic.raise_irq(irq_for_slot(1)); + } + } else if p9_offset < MMIO_SLOT_SIZE { + if let Some(ref mut dev) = self.virtio_9p { + if dev.write(p9_offset, data as u32, size, mem) { + self.pic.raise_irq(irq_for_slot(2)); + } + } + } else if net_offset < MMIO_SLOT_SIZE { + if let Some(ref mut dev) = self.virtio_net { + if dev.write(net_offset, data as u32, size, mem) { + self.pic.raise_irq(irq_for_slot(3)); + } + } + } + } + + /// Tick the PIT timer based on wall clock time and poll devices. + /// + /// Call this at the top of each vCPU run loop iteration. + pub fn tick_and_poll(&mut self, mem: &dyn GuestMemoryAccessor) { + // Tick PIT. + let now = Instant::now(); + let elapsed_ns = now.duration_since(self.last_tick).as_nanos() as u64; + self.last_tick = now; + + if elapsed_ns > 0 { + let fires = self.pit.tick(elapsed_ns); + for _ in 0..fires { + self.pic.raise_irq(0); + } + } + + // Poll vsock for host-initiated data. + if self.virtio_vsock.poll(mem) { + self.pic.raise_irq(irq_for_slot(1)); + } + + // Poll net for incoming frames. + if let Some(ref mut dev) = self.virtio_net { + if dev.poll(mem) { + self.pic.raise_irq(irq_for_slot(3)); + } + } + } + + /// Connect to the userspace networking proxy and return a transport. + /// + /// On Unix: connects via Unix stream socket. + /// On Windows: parses "host:port" and connects via TCP. + fn connect_net_transport( + socket_path: &Path, + ) -> Result>> { + #[cfg(unix)] + { + let stream = std::os::unix::net::UnixStream::connect(socket_path).map_err(|e| { + WkrunError::Device(format!( + "failed to connect to net socket '{}': {}", + socket_path.display(), + e + )) + })?; + let transport = + super::virtio::net::UnixStreamTransport::new(stream).map_err(|e| { + WkrunError::Device(format!("failed to configure net socket: {}", e)) + })?; + Ok(Some(Box::new(transport))) + } + #[cfg(not(unix))] + { + let addr = socket_path.to_string_lossy(); + let stream = std::net::TcpStream::connect(addr.as_ref()).map_err(|e| { + WkrunError::Device(format!("failed to connect to net proxy '{}': {}", addr, e)) + })?; + let transport = + super::virtio::net::TcpTransport::new(stream).map_err(|e| { + WkrunError::Device(format!("failed to configure net socket: {}", e)) + })?; + Ok(Some(Box::new(transport))) + } + } + + /// Whether an interrupt window has been requested. + pub fn window_requested(&self) -> bool { + self.window_requested + } + + /// Set the interrupt window requested flag. + pub fn set_window_requested(&mut self, requested: bool) { + self.window_requested = requested; + } +} + +/// Create a `DeviceManager` from explicit components (for testing). +pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { + let vsock_backend = VirtioVsock::new(GUEST_CID); + DeviceManager { + serial, + pic: Pic::new(), + pit: Pit::new(), + cmos_addr: 0, + virtio_blk: None, + virtio_vsock: VirtioMmioDevice::new(vsock_backend), + virtio_9p: None, + virtio_net: None, + window_requested: false, + last_tick: Instant::now(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use std::path::PathBuf; + use std::sync::{Arc, Mutex}; + + /// Capture buffer for serial output in tests. + #[derive(Clone)] + struct CaptureSink { + buf: Arc>>, + } + + impl CaptureSink { + fn new() -> Self { + CaptureSink { + buf: Arc::new(Mutex::new(Vec::new())), + } + } + + fn contents(&self) -> Vec { + self.buf.lock().unwrap().clone() + } + } + + impl Write for CaptureSink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.buf.lock().unwrap().extend_from_slice(buf); + Ok(buf.len()) + } + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + fn make_test_devices() -> DeviceManager { + let serial = Serial::new(COM1_BASE, Box::new(std::io::sink())); + device_manager_with_serial(serial) + } + + #[test] + fn test_io_out_serial_write() { + let sink = CaptureSink::new(); + let serial = Serial::new(COM1_BASE, Box::new(sink.clone())); + let mut dm = device_manager_with_serial(serial); + + // Write 'A' to THR (port 0x3F8). + dm.handle_io_out(0x3F8, 1, b'A' as u32); + assert_eq!(sink.contents(), b"A"); + } + + #[test] + fn test_io_in_serial_lsr() { + let mut dm = make_test_devices(); + // Read LSR (port 0x3FD) — should report transmitter empty. + let lsr = dm.handle_io_in(0x3FD, 1); + // LSR bit 5 (THRE) and bit 6 (TEMT) should be set. + assert_ne!(lsr & 0x60, 0); + } + + #[test] + fn test_io_in_pci_config_no_devices() { + let mut dm = make_test_devices(); + let data = dm.handle_io_in(0xCF8, 4); + assert_eq!(data, 0xFFFF_FFFF); + } + + #[test] + fn test_io_in_system_control_port_b() { + let mut dm = make_test_devices(); + assert_eq!(dm.handle_io_in(0x61, 1), 0x20); + } + + #[test] + fn test_io_in_system_control_port_a() { + let mut dm = make_test_devices(); + assert_eq!(dm.handle_io_in(0x92, 1), 0x02); + } + + #[test] + fn test_io_in_unknown_port() { + let mut dm = make_test_devices(); + assert_eq!(dm.handle_io_in(0x999, 1), 0xFF); + } + + #[test] + fn test_cmos_read_via_io() { + let mut dm = make_test_devices(); + // Select CMOS register 0x09 (year). + dm.handle_io_out(0x70, 1, 0x09); + // Read CMOS data. + let year = dm.handle_io_in(0x71, 1); + assert_eq!(year, 25); // 2025. + } + + #[test] + fn test_cmos_read_battery_ok() { + let mut dm = make_test_devices(); + dm.handle_io_out(0x70, 1, 0x0D); + let status_d = dm.handle_io_in(0x71, 1); + assert_eq!(status_d, 0x80); + } + + #[test] + fn test_mmio_read_no_blk_device() { + let dm = make_test_devices(); + // Read from virtio-blk slot when no device present. + let data = dm.handle_mmio_read(mmio_base_for_slot(0), 4); + assert_eq!(data, 0); + } + + #[test] + fn test_mmio_read_vsock_magic() { + let dm = make_test_devices(); + // Read virtio magic from vsock MMIO slot. + let magic = dm.handle_mmio_read(mmio_base_for_slot(1), 4); + assert_eq!(magic, 0x7472_6976); // "virt" in LE. + } + + #[test] + fn test_mmio_read_vsock_device_id() { + let dm = make_test_devices(); + // Device ID is at offset 0x008. + let device_id = dm.handle_mmio_read(mmio_base_for_slot(1) + 0x008, 4); + assert_eq!(device_id, 19); // vsock device ID. + } + + #[test] + fn test_mmio_read_out_of_range() { + let dm = make_test_devices(); + // Read from an address that doesn't belong to any device. + let data = dm.handle_mmio_read(0xE000_0000, 4); + assert_eq!(data, 0); + } + + #[test] + fn test_window_requested_default() { + let dm = make_test_devices(); + assert!(!dm.window_requested()); + } + + #[test] + fn test_window_requested_toggle() { + let mut dm = make_test_devices(); + dm.set_window_requested(true); + assert!(dm.window_requested()); + dm.set_window_requested(false); + assert!(!dm.window_requested()); + } + + #[test] + fn test_tee_writer() { + let inner_buf = Arc::new(Mutex::new(Vec::new())); + let capture_buf: ConsoleBuffer = Arc::new(Mutex::new(Vec::new())); + + let inner = CaptureSink { + buf: inner_buf.clone(), + }; + let mut tee = super::TeeWriter { + inner: Box::new(inner), + buffer: capture_buf.clone(), + }; + + tee.write_all(b"Hello").unwrap(); + tee.write_all(b", VM!").unwrap(); + tee.flush().unwrap(); + + // Both sinks should have the same content. + assert_eq!(inner_buf.lock().unwrap().as_slice(), b"Hello, VM!"); + assert_eq!(capture_buf.lock().unwrap().as_slice(), b"Hello, VM!"); + } + + #[test] + fn test_console_buffer_store_and_get() { + let buf: ConsoleBuffer = Arc::new(Mutex::new(Vec::new())); + buf.lock().unwrap().extend_from_slice(b"test output"); + + let ctx_id = 90000; // Unique ID to avoid conflicts. + super::store_console_buffer(ctx_id, buf); + + let output = super::get_console_output(ctx_id).unwrap(); + assert_eq!(output, b"test output"); + + // Cleanup. + super::remove_console_buffer(ctx_id); + assert!(super::get_console_output(ctx_id).is_none()); + } + + #[test] + fn test_console_buffer_not_found() { + assert!(super::get_console_output(89999).is_none()); + } + + #[test] + fn test_from_context_has_console_buffer() { + let ctx = VmContext::default_for_test(); + let setup = DeviceManager::from_context(&ctx).unwrap(); + // Buffer should be empty initially. + assert!(setup.console_buffer.lock().unwrap().is_empty()); + } + + #[test] + fn test_from_context_minimal() { + let ctx = VmContext::default_for_test(); + let setup = DeviceManager::from_context(&ctx).unwrap(); + assert!(!setup.has_root_disk); + // Slot 0 (blk) inactive, slot 1 (vsock) active, slot 2 (9p) inactive. + assert!(!setup.mmio_slots[0].active); + assert!(setup.mmio_slots[1].active); + assert!(!setup.mmio_slots[2].active); + } + + #[test] + fn test_from_context_with_fs_mount() { + let mut ctx = VmContext::default_for_test(); + ctx.fs_mounts.push(super::super::super::context::FsMount { + tag: "test".to_string(), + host_path: PathBuf::from("/tmp"), + }); + let setup = DeviceManager::from_context(&ctx).unwrap(); + // Slot 2 (9p) should now be active. + assert!(setup.mmio_slots[2].active); + } + + #[test] + fn test_from_context_net_slot_inactive_by_default() { + let ctx = VmContext::default_for_test(); + let setup = DeviceManager::from_context(&ctx).unwrap(); + // Slot 3 (net) should be inactive when no net_config. + assert!(!setup.mmio_slots[3].active); + } + + #[test] + fn test_mmio_read_no_net_device() { + let dm = make_test_devices(); + // Read from virtio-net slot when no device present. + let data = dm.handle_mmio_read(mmio_base_for_slot(3), 4); + assert_eq!(data, 0); + } +} diff --git a/src/vmm/src/windows/devices/mod.rs b/src/vmm/src/windows/devices/mod.rs new file mode 100644 index 000000000..d5add51ba --- /dev/null +++ b/src/vmm/src/windows/devices/mod.rs @@ -0,0 +1,7 @@ +//! Device emulation for the guest VM. + +pub mod manager; +pub mod pic; +pub mod pit; +pub mod serial; +pub mod virtio; diff --git a/src/vmm/src/windows/devices/pic.rs b/src/vmm/src/windows/devices/pic.rs new file mode 100644 index 000000000..68f57db53 --- /dev/null +++ b/src/vmm/src/windows/devices/pic.rs @@ -0,0 +1,683 @@ +//! 8259 PIC (Programmable Interrupt Controller) emulation. +//! +//! Emulates a dual 8259 PIC (master + slave) for legacy interrupt routing. +//! +//! Master PIC: I/O ports 0x20-0x21, handles IRQs 0-7 +//! Slave PIC: I/O ports 0xA0-0xA1, handles IRQs 8-15 +//! Slave is connected to master IRQ 2 (cascade). +//! +//! The Linux kernel in PIC mode (noapic nolapic) programs the PICs to: +//! - Master: vector base 0x20 (IRQs 0-7 → vectors 0x20-0x27) +//! - Slave: vector base 0x28 (IRQs 8-15 → vectors 0x28-0x2F) + +use super::super::vcpu::IoHandler; + +/// Master PIC command port. +pub const PIC_MASTER_CMD: u16 = 0x20; +/// Master PIC data port. +pub const PIC_MASTER_DATA: u16 = 0x21; +/// Slave PIC command port. +pub const PIC_SLAVE_CMD: u16 = 0xA0; +/// Slave PIC data port. +pub const PIC_SLAVE_DATA: u16 = 0xA1; + +/// Cascade IRQ (slave connected to master IRQ 2). +const CASCADE_IRQ: u8 = 2; + +/// State for a single 8259 PIC chip. +#[derive(Debug)] +struct PicChip { + /// Interrupt Request Register — pending interrupt requests. + irr: u8, + /// In-Service Register — interrupts currently being serviced. + isr: u8, + /// Interrupt Mask Register — masked (disabled) interrupts. + imr: u8, + /// Vector base (aligned to 8, set by ICW2). + vector_base: u8, + /// ICW initialization state machine. + init_state: InitState, + /// Whether ICW4 is needed (from ICW1 bit 0). + icw4_needed: bool, + /// Whether to read ISR (true) or IRR (false) on command port read. + read_isr: bool, + /// Auto-EOI mode (from ICW4 bit 1). + auto_eoi: bool, +} + +/// ICW initialization state machine. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum InitState { + /// Normal operation (not in initialization). + Ready, + /// Waiting for ICW2 (vector base). + WaitIcw2, + /// Waiting for ICW3 (cascade configuration). + WaitIcw3, + /// Waiting for ICW4 (mode). + WaitIcw4, +} + +impl PicChip { + fn new() -> Self { + PicChip { + irr: 0, + isr: 0, + imr: 0xFF, // All interrupts masked initially + vector_base: 0, + init_state: InitState::Ready, + icw4_needed: false, + read_isr: false, + auto_eoi: false, + } + } + + /// Write to the command port (port 0x20 or 0xA0). + fn write_command(&mut self, data: u8) { + if data & 0x10 != 0 { + // ICW1: bit 4 = 1 → start initialization sequence. + self.icw4_needed = data & 0x01 != 0; + self.init_state = InitState::WaitIcw2; + // Reset during init. + self.isr = 0; + self.irr = 0; + self.imr = 0; + self.auto_eoi = false; + self.read_isr = false; + } else if data & 0x08 != 0 { + // OCW3: bit 3 = 1. + if data & 0x02 != 0 { + // Read register command. + self.read_isr = data & 0x01 != 0; + } + } else { + // OCW2: End of Interrupt. + let is_eoi = data & 0x20 != 0; + let is_specific = data & 0x40 != 0; + if is_eoi { + if is_specific { + // Specific EOI: clear specific ISR bit. + let irq = data & 0x07; + self.isr &= !(1 << irq); + } else { + // Non-specific EOI: clear highest-priority ISR bit. + for i in 0..8u8 { + if self.isr & (1 << i) != 0 { + self.isr &= !(1 << i); + break; + } + } + } + } + } + } + + /// Write to the data port (port 0x21 or 0xA1). + fn write_data(&mut self, data: u8) { + match self.init_state { + InitState::WaitIcw2 => { + // ICW2: vector base (upper 5 bits used, lower 3 are IRQ number). + self.vector_base = data & 0xF8; + self.init_state = InitState::WaitIcw3; + } + InitState::WaitIcw3 => { + // ICW3: cascade configuration (we accept but don't use the value). + if self.icw4_needed { + self.init_state = InitState::WaitIcw4; + } else { + self.init_state = InitState::Ready; + } + } + InitState::WaitIcw4 => { + // ICW4: mode configuration. + self.auto_eoi = data & 0x02 != 0; + self.init_state = InitState::Ready; + } + InitState::Ready => { + // OCW1: set interrupt mask register. + self.imr = data; + } + } + } + + /// Read from the command port. + fn read_command(&self) -> u8 { + if self.read_isr { + self.isr + } else { + self.irr + } + } + + /// Read from the data port. + fn read_data(&self) -> u8 { + self.imr + } + + /// Raise an interrupt request on this chip (local IRQ 0-7). + fn raise_irq(&mut self, irq: u8) { + self.irr |= 1 << (irq & 7); + } + + /// Clear an interrupt request (edge-triggered reset). + fn clear_irq(&mut self, irq: u8) { + self.irr &= !(1 << (irq & 7)); + } + + /// Get the highest-priority pending (unmasked, not in-service) IRQ, if any. + fn pending_irq(&self) -> Option { + let pending = self.irr & !self.imr & !self.isr; + if pending == 0 { + return None; + } + // Lowest bit number = highest priority. + (0..8u8).find(|&i| pending & (1 << i) != 0) + } + + /// Acknowledge the highest-priority pending interrupt. + /// Moves the IRQ from IRR to ISR and returns the vector. + fn acknowledge(&mut self) -> Option { + if let Some(irq) = self.pending_irq() { + self.irr &= !(1 << irq); + if self.auto_eoi { + // Auto-EOI: don't set ISR. + } else { + self.isr |= 1 << irq; + } + Some(self.vector_base + irq) + } else { + None + } + } +} + +/// Dual 8259 PIC (master + slave). +pub struct Pic { + master: PicChip, + slave: PicChip, +} + +impl Default for Pic { + fn default() -> Self { + Self::new() + } +} + +impl Pic { + /// Create a new dual PIC with default state (all masked). + pub fn new() -> Self { + Pic { + master: PicChip::new(), + slave: PicChip::new(), + } + } + + /// Raise an interrupt request (IRQ 0-15). + /// + /// IRQs 0-7 go to the master PIC, IRQs 8-15 go to the slave PIC. + /// When a slave IRQ is raised, the cascade line (master IRQ 2) is also raised. + pub fn raise_irq(&mut self, irq: u8) { + if irq < 8 { + self.master.raise_irq(irq); + } else { + self.slave.raise_irq(irq - 8); + // Slave cascades through master IRQ 2. + self.master.raise_irq(CASCADE_IRQ); + } + } + + /// Clear an interrupt request (for edge-triggered mode). + pub fn clear_irq(&mut self, irq: u8) { + if irq < 8 { + self.master.clear_irq(irq); + } else { + self.slave.clear_irq(irq - 8); + // If no more slave IRQs pending, clear cascade on master. + if self.slave.pending_irq().is_none() { + self.master.clear_irq(CASCADE_IRQ); + } + } + } + + /// Check if there are any pending (unmasked, deliverable) interrupts. + pub fn has_pending(&self) -> bool { + self.master.pending_irq().is_some() + } + + /// Acknowledge the highest-priority pending interrupt. + /// + /// Returns the interrupt vector to deliver to the CPU, or None if + /// no interrupts are pending. + pub fn acknowledge(&mut self) -> Option { + if let Some(master_irq) = self.master.pending_irq() { + if master_irq == CASCADE_IRQ { + // Cascade: acknowledge slave first. + let vector = self.slave.acknowledge(); + // Acknowledge cascade on master. + self.master.acknowledge(); + // If no more slave IRQs, clear cascade. + if self.slave.pending_irq().is_none() { + self.master.clear_irq(CASCADE_IRQ); + } + vector + } else { + self.master.acknowledge() + } + } else { + None + } + } + + /// Check if the given I/O port belongs to either PIC. + pub fn handles_port(&self, port: u16) -> bool { + matches!( + port, + PIC_MASTER_CMD | PIC_MASTER_DATA | PIC_SLAVE_CMD | PIC_SLAVE_DATA + ) + } +} + +impl IoHandler for Pic { + fn io_read(&self, port: u16, _size: u8) -> u32 { + let val = match port { + PIC_MASTER_CMD => self.master.read_command(), + PIC_MASTER_DATA => self.master.read_data(), + PIC_SLAVE_CMD => self.slave.read_command(), + PIC_SLAVE_DATA => self.slave.read_data(), + _ => 0xFF, + }; + val as u32 + } + + fn io_write(&self, port: u16, _size: u8, data: u32) { + // IoHandler takes &self, but we need &mut self for PIC state. + // This is a design limitation — for now, the boot_kernel example + // uses Pic directly with &mut self methods. This trait impl is + // provided for interface compatibility but should not be used + // when mutation is needed. + // + // In practice, the vCPU loop will call write_port() directly. + let _ = (port, data); + } +} + +impl Pic { + /// Write to a PIC I/O port (mutable version for the vCPU loop). + pub fn write_port(&mut self, port: u16, data: u8) { + match port { + PIC_MASTER_CMD => self.master.write_command(data), + PIC_MASTER_DATA => self.master.write_data(data), + PIC_SLAVE_CMD => self.slave.write_command(data), + PIC_SLAVE_DATA => self.slave.write_data(data), + _ => {} + } + } + + /// Read from a PIC I/O port. + pub fn read_port(&self, port: u16) -> u8 { + match port { + PIC_MASTER_CMD => self.master.read_command(), + PIC_MASTER_DATA => self.master.read_data(), + PIC_SLAVE_CMD => self.slave.read_command(), + PIC_SLAVE_DATA => self.slave.read_data(), + _ => 0xFF, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ---- PicChip unit tests ---- + + #[test] + fn test_pic_chip_initial_state() { + let chip = PicChip::new(); + assert_eq!(chip.irr, 0); + assert_eq!(chip.isr, 0); + assert_eq!(chip.imr, 0xFF, "all IRQs masked initially"); + assert_eq!(chip.vector_base, 0); + assert_eq!(chip.init_state, InitState::Ready); + } + + #[test] + fn test_pic_chip_raise_irq_while_masked() { + let mut chip = PicChip::new(); + chip.raise_irq(0); + assert_eq!(chip.irr, 0x01); + // All masked, so no pending. + assert_eq!(chip.pending_irq(), None); + } + + #[test] + fn test_pic_chip_raise_and_unmask() { + let mut chip = PicChip::new(); + chip.imr = 0; // Unmask all. + chip.vector_base = 0x20; + chip.raise_irq(0); + assert_eq!(chip.pending_irq(), Some(0)); + + let vector = chip.acknowledge(); + assert_eq!(vector, Some(0x20)); + assert_eq!(chip.irr, 0, "IRR cleared after acknowledge"); + assert_eq!(chip.isr, 0x01, "ISR set after acknowledge"); + } + + #[test] + fn test_pic_chip_priority_order() { + let mut chip = PicChip::new(); + chip.imr = 0; + chip.vector_base = 0x20; + + // Raise IRQ 3 and IRQ 1 — IRQ 1 has higher priority. + chip.raise_irq(3); + chip.raise_irq(1); + assert_eq!(chip.pending_irq(), Some(1)); + + let vector = chip.acknowledge(); + assert_eq!(vector, Some(0x21)); // 0x20 + 1 + // IRQ 3 still pending. + assert_eq!(chip.pending_irq(), Some(3)); + } + + #[test] + fn test_pic_chip_isr_blocks_lower_priority() { + let mut chip = PicChip::new(); + chip.imr = 0; + chip.vector_base = 0x20; + + chip.raise_irq(0); + chip.acknowledge(); // IRQ 0 now in ISR. + + // Raise IRQ 1 — it's pending but IRQ 0 is in-service. + // Since ISR bit 0 is set, IRQ 0 blocks IRQ 0 but not IRQ 1. + // Actually, pending = irr & !imr & !isr. IRQ 1 is not in ISR, + // so it should be deliverable. + chip.raise_irq(1); + assert_eq!(chip.pending_irq(), Some(1)); + } + + #[test] + fn test_pic_chip_nonspecific_eoi() { + let mut chip = PicChip::new(); + chip.imr = 0; + chip.vector_base = 0x20; + + chip.raise_irq(0); + chip.acknowledge(); // IRQ 0 in ISR. + assert_eq!(chip.isr, 0x01); + + // Non-specific EOI (OCW2 with bit 5 set). + chip.write_command(0x20); + assert_eq!(chip.isr, 0, "ISR cleared by EOI"); + } + + #[test] + fn test_pic_chip_specific_eoi() { + let mut chip = PicChip::new(); + chip.imr = 0; + chip.vector_base = 0x20; + + chip.raise_irq(0); + chip.raise_irq(2); + chip.acknowledge(); // IRQ 0 acknowledged. + chip.acknowledge(); // IRQ 2 acknowledged. + assert_eq!(chip.isr, 0x05); // bits 0 and 2. + + // Specific EOI for IRQ 2 (OCW2: 0x60 | 2 = 0x62). + chip.write_command(0x62); + assert_eq!(chip.isr, 0x01, "only IRQ 0 should remain in ISR"); + } + + #[test] + fn test_pic_chip_icw_sequence() { + let mut chip = PicChip::new(); + + // ICW1: start init, ICW4 needed. + chip.write_command(0x11); + assert_eq!(chip.init_state, InitState::WaitIcw2); + assert!(chip.icw4_needed); + + // ICW2: vector base = 0x20. + chip.write_data(0x20); + assert_eq!(chip.vector_base, 0x20); + assert_eq!(chip.init_state, InitState::WaitIcw3); + + // ICW3: cascade config. + chip.write_data(0x04); // Master: slave on IRQ 2. + assert_eq!(chip.init_state, InitState::WaitIcw4); + + // ICW4: 8086 mode. + chip.write_data(0x01); + assert_eq!(chip.init_state, InitState::Ready); + } + + #[test] + fn test_pic_chip_icw_without_icw4() { + let mut chip = PicChip::new(); + + // ICW1 without ICW4. + chip.write_command(0x10); + assert!(!chip.icw4_needed); + + // ICW2. + chip.write_data(0x28); + assert_eq!(chip.vector_base, 0x28); + + // ICW3 → goes straight to Ready. + chip.write_data(0x02); + assert_eq!(chip.init_state, InitState::Ready); + } + + #[test] + fn test_pic_chip_imr_read_write() { + let mut chip = PicChip::new(); + + // After init, writing data port sets IMR. + chip.write_data(0xFB); // Mask all except IRQ 2. + assert_eq!(chip.imr, 0xFB); + assert_eq!(chip.read_data(), 0xFB); + } + + #[test] + fn test_pic_chip_read_irr_isr() { + let mut chip = PicChip::new(); + chip.imr = 0; + chip.vector_base = 0x20; + + chip.raise_irq(3); + + // Default read = IRR. + assert_eq!(chip.read_command(), 0x08); // bit 3. + + // OCW3: read ISR. + chip.write_command(0x0B); + assert_eq!(chip.read_command(), 0); // No ISR yet. + + chip.acknowledge(); // IRQ 3 → ISR. + assert_eq!(chip.read_command(), 0x08); // ISR bit 3. + + // OCW3: read IRR. + chip.write_command(0x0A); + assert_eq!(chip.read_command(), 0); // IRR cleared. + } + + #[test] + fn test_pic_chip_auto_eoi() { + let mut chip = PicChip::new(); + + // Init with auto-EOI. + chip.write_command(0x11); // ICW1. + chip.write_data(0x20); // ICW2. + chip.write_data(0x00); // ICW3. + chip.write_data(0x03); // ICW4: 8086 mode + auto-EOI. + assert!(chip.auto_eoi); + + chip.imr = 0; + chip.raise_irq(0); + let vector = chip.acknowledge(); + assert_eq!(vector, Some(0x20)); + assert_eq!(chip.isr, 0, "ISR should not be set in auto-EOI mode"); + } + + #[test] + fn test_pic_chip_clear_irq() { + let mut chip = PicChip::new(); + chip.raise_irq(5); + assert_eq!(chip.irr, 0x20); + chip.clear_irq(5); + assert_eq!(chip.irr, 0); + } + + // ---- Dual Pic tests ---- + + #[test] + fn test_pic_new_no_pending() { + let pic = Pic::new(); + assert!(!pic.has_pending()); + } + + #[test] + fn test_pic_master_irq_lifecycle() { + let mut pic = Pic::new(); + + // Program master PIC: vector base 0x20, unmask IRQ 0. + pic.write_port(PIC_MASTER_CMD, 0x11); + pic.write_port(PIC_MASTER_DATA, 0x20); + pic.write_port(PIC_MASTER_DATA, 0x04); + pic.write_port(PIC_MASTER_DATA, 0x01); + pic.write_port(PIC_MASTER_DATA, 0xFE); // Unmask only IRQ 0. + + pic.raise_irq(0); + assert!(pic.has_pending()); + + let vector = pic.acknowledge(); + assert_eq!(vector, Some(0x20)); + assert!(!pic.has_pending()); + + // EOI. + pic.write_port(PIC_MASTER_CMD, 0x20); + } + + #[test] + fn test_pic_slave_irq_lifecycle() { + let mut pic = Pic::new(); + + // Program master: vector 0x20, unmask IRQ 2 (cascade). + pic.write_port(PIC_MASTER_CMD, 0x11); + pic.write_port(PIC_MASTER_DATA, 0x20); + pic.write_port(PIC_MASTER_DATA, 0x04); + pic.write_port(PIC_MASTER_DATA, 0x01); + pic.write_port(PIC_MASTER_DATA, 0xFB); // Unmask only IRQ 2. + + // Program slave: vector 0x28, unmask IRQ 0 (= global IRQ 8). + pic.write_port(PIC_SLAVE_CMD, 0x11); + pic.write_port(PIC_SLAVE_DATA, 0x28); + pic.write_port(PIC_SLAVE_DATA, 0x02); + pic.write_port(PIC_SLAVE_DATA, 0x01); + pic.write_port(PIC_SLAVE_DATA, 0xFE); // Unmask only slave IRQ 0. + + // Raise IRQ 8 (slave IRQ 0). + pic.raise_irq(8); + assert!(pic.has_pending()); + + let vector = pic.acknowledge(); + assert_eq!(vector, Some(0x28)); // Slave vector base + 0. + assert!(!pic.has_pending()); + + // EOI to both slave and master. + pic.write_port(PIC_SLAVE_CMD, 0x20); + pic.write_port(PIC_MASTER_CMD, 0x20); + } + + #[test] + fn test_pic_handles_port() { + let pic = Pic::new(); + assert!(pic.handles_port(PIC_MASTER_CMD)); + assert!(pic.handles_port(PIC_MASTER_DATA)); + assert!(pic.handles_port(PIC_SLAVE_CMD)); + assert!(pic.handles_port(PIC_SLAVE_DATA)); + assert!(!pic.handles_port(0x22)); + assert!(!pic.handles_port(0x3F8)); + } + + #[test] + fn test_pic_read_port() { + let mut pic = Pic::new(); + + // Init master. + pic.write_port(PIC_MASTER_CMD, 0x11); + pic.write_port(PIC_MASTER_DATA, 0x20); + pic.write_port(PIC_MASTER_DATA, 0x04); + pic.write_port(PIC_MASTER_DATA, 0x01); + + // Set IMR. + pic.write_port(PIC_MASTER_DATA, 0xAB); + assert_eq!(pic.read_port(PIC_MASTER_DATA), 0xAB); + } + + #[test] + fn test_pic_multiple_master_irqs() { + let mut pic = Pic::new(); + + // Init and unmask all. + pic.write_port(PIC_MASTER_CMD, 0x11); + pic.write_port(PIC_MASTER_DATA, 0x20); + pic.write_port(PIC_MASTER_DATA, 0x04); + pic.write_port(PIC_MASTER_DATA, 0x01); + pic.write_port(PIC_MASTER_DATA, 0x00); // Unmask all. + + pic.raise_irq(3); + pic.raise_irq(1); + + // IRQ 1 is higher priority. + assert_eq!(pic.acknowledge(), Some(0x21)); + pic.write_port(PIC_MASTER_CMD, 0x20); // EOI. + + // Now IRQ 3. + assert_eq!(pic.acknowledge(), Some(0x23)); + pic.write_port(PIC_MASTER_CMD, 0x20); // EOI. + + assert!(!pic.has_pending()); + } + + #[test] + fn test_pic_init_resets_state() { + let mut pic = Pic::new(); + + // Set some state. + pic.master.irr = 0xFF; + pic.master.isr = 0xFF; + pic.master.imr = 0xFF; + + // Re-init should reset IRR, ISR, IMR. + pic.write_port(PIC_MASTER_CMD, 0x11); + assert_eq!(pic.master.irr, 0); + assert_eq!(pic.master.isr, 0); + assert_eq!(pic.master.imr, 0); + } + + #[test] + fn test_pic_io_handler_read() { + let pic = Pic::new(); + // Reading data port returns IMR (0xFF initially). + let val = pic.io_read(PIC_MASTER_DATA, 1); + assert_eq!(val, 0xFF); + } + + #[test] + fn test_pic_masked_irq_not_pending() { + let mut pic = Pic::new(); + + // Init master, mask IRQ 0. + pic.write_port(PIC_MASTER_CMD, 0x11); + pic.write_port(PIC_MASTER_DATA, 0x20); + pic.write_port(PIC_MASTER_DATA, 0x04); + pic.write_port(PIC_MASTER_DATA, 0x01); + pic.write_port(PIC_MASTER_DATA, 0x01); // Mask IRQ 0. + + pic.raise_irq(0); + assert!(!pic.has_pending(), "masked IRQ should not be pending"); + } +} diff --git a/src/vmm/src/windows/devices/pit.rs b/src/vmm/src/windows/devices/pit.rs new file mode 100644 index 000000000..79ef191ce --- /dev/null +++ b/src/vmm/src/windows/devices/pit.rs @@ -0,0 +1,648 @@ +//! 8254 PIT (Programmable Interval Timer) emulation. +//! +//! Emulates the three counters of the 8254/8253 PIT at I/O ports 0x40-0x43: +//! - Counter 0 (port 0x40): System timer, connected to PIC IRQ 0. +//! - Counter 1 (port 0x41): DRAM refresh (not emulated, returns 0). +//! - Counter 2 (port 0x42): PC speaker (not emulated, returns 0). +//! - Control word (port 0x43): Mode/command register. +//! +//! The PIT oscillator runs at 1,193,182 Hz. The kernel programs a reload +//! value and the counter counts down; when it reaches zero, it fires IRQ 0 +//! and reloads. +//! +//! Only counter 0 modes 2 (rate generator) and 3 (square wave) are emulated, +//! as these are the only modes Linux uses for the system timer. + +/// PIT I/O port: Counter 0 data. +pub const PIT_COUNTER0: u16 = 0x40; +/// PIT I/O port: Counter 1 data. +pub const PIT_COUNTER1: u16 = 0x41; +/// PIT I/O port: Counter 2 data. +pub const PIT_COUNTER2: u16 = 0x42; +/// PIT I/O port: Control word register. +pub const PIT_COMMAND: u16 = 0x43; + +/// PIT oscillator frequency in Hz. +pub const PIT_FREQUENCY: u64 = 1_193_182; + +/// Nanoseconds per PIT tick (approximately 838.1 ns). +/// Calculated as 1_000_000_000 / 1_193_182 ≈ 838. +/// We use fixed-point math in tick() for accuracy. +const NS_PER_SEC: u64 = 1_000_000_000; + +/// Counter operating mode. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum CounterMode { + /// Mode 0: Interrupt on terminal count. + InterruptOnTerminal, + /// Mode 2: Rate generator (periodic, fires on reload). + RateGenerator, + /// Mode 3: Square wave generator (periodic). + SquareWave, + /// Other modes (not emulated). + Other(u8), +} + +/// Access mode for reading/writing counter values. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AccessMode { + /// Low byte only. + Low, + /// High byte only. + High, + /// Low byte then high byte. + LoThenHi, +} + +/// Read/write state for two-byte access mode. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum RwState { + /// Next byte is the low byte. + Low, + /// Next byte is the high byte. + High, +} + +/// State for a single PIT counter. +#[derive(Debug)] +struct PitCounter { + /// Reload value (what the counter reloads to after reaching zero). + reload: u16, + /// Whether the reload value has been fully written. + reload_ready: bool, + /// Operating mode. + mode: CounterMode, + /// Access mode (lo, hi, or lo-hi byte). + access: AccessMode, + /// Write state for two-byte writes. + write_state: RwState, + /// Read state for two-byte reads. + read_state: RwState, + /// Latched count value (for latch command). + latched_value: Option, + /// Low byte of partial reload write. + write_low: u8, + /// Accumulated nanoseconds of elapsed time (for fractional ticks). + ns_accumulator: u64, +} + +impl PitCounter { + fn new() -> Self { + PitCounter { + reload: 0, + reload_ready: false, + mode: CounterMode::Other(0), + access: AccessMode::LoThenHi, + write_state: RwState::Low, + read_state: RwState::Low, + latched_value: None, + write_low: 0, + ns_accumulator: 0, + } + } + + /// Set the counter mode and access mode from a control word. + fn set_control(&mut self, mode: CounterMode, access: AccessMode) { + self.mode = mode; + self.access = access; + self.write_state = RwState::Low; + self.read_state = RwState::Low; + self.reload_ready = false; + } + + /// Write a data byte to this counter's data port. + fn write_data(&mut self, data: u8) { + match self.access { + AccessMode::Low => { + self.reload = data as u16; + self.reload_ready = true; + self.ns_accumulator = 0; + } + AccessMode::High => { + self.reload = (data as u16) << 8; + self.reload_ready = true; + self.ns_accumulator = 0; + } + AccessMode::LoThenHi => match self.write_state { + RwState::Low => { + self.write_low = data; + self.write_state = RwState::High; + } + RwState::High => { + self.reload = self.write_low as u16 | ((data as u16) << 8); + self.write_state = RwState::Low; + self.reload_ready = true; + self.ns_accumulator = 0; + } + }, + } + } + + /// Read a data byte from this counter's data port. + fn read_data(&mut self) -> u8 { + let value = self.latched_value.unwrap_or(self.reload); + + match self.access { + AccessMode::Low => { + self.latched_value = None; + value as u8 + } + AccessMode::High => { + self.latched_value = None; + (value >> 8) as u8 + } + AccessMode::LoThenHi => match self.read_state { + RwState::Low => { + self.read_state = RwState::High; + value as u8 + } + RwState::High => { + self.read_state = RwState::Low; + self.latched_value = None; + (value >> 8) as u8 + } + }, + } + } + + /// Latch the current count value for reading. + fn latch(&mut self) { + if self.latched_value.is_none() { + self.latched_value = Some(self.reload); + } + } + + /// Advance the counter by `elapsed_ns` nanoseconds. + /// + /// Returns the number of times the counter reached zero (fired). + fn tick(&mut self, elapsed_ns: u64) -> u64 { + if !self.reload_ready || self.reload == 0 { + return 0; + } + + match self.mode { + CounterMode::RateGenerator | CounterMode::SquareWave => { + // Accumulate elapsed time. + self.ns_accumulator += elapsed_ns; + + // Calculate how many PIT ticks have elapsed. + // ticks = accumulated_ns * PIT_FREQUENCY / NS_PER_SEC + // To avoid overflow, use u128 for intermediate calculation. + let total_ticks = + (self.ns_accumulator as u128 * PIT_FREQUENCY as u128) / NS_PER_SEC as u128; + + // How many full reload cycles is that? + let fires = total_ticks / self.reload as u128; + + // Subtract consumed nanoseconds (keep remainder in accumulator). + // consumed_ns = fires * reload * NS_PER_SEC / PIT_FREQUENCY + let consumed_ns = + (fires * self.reload as u128 * NS_PER_SEC as u128) / PIT_FREQUENCY as u128; + self.ns_accumulator -= consumed_ns as u64; + + fires as u64 + } + CounterMode::InterruptOnTerminal => { + // Mode 0: fires once when count reaches zero. + self.ns_accumulator += elapsed_ns; + let total_ticks = + (self.ns_accumulator as u128 * PIT_FREQUENCY as u128) / NS_PER_SEC as u128; + if total_ticks >= self.reload as u128 { + self.reload_ready = false; // One-shot: stop after firing. + self.ns_accumulator = 0; + 1 + } else { + 0 + } + } + CounterMode::Other(_) => 0, + } + } +} + +/// 8254 PIT emulation. +pub struct Pit { + counters: [PitCounter; 3], +} + +impl Default for Pit { + fn default() -> Self { + Self::new() + } +} + +impl Pit { + /// Create a new PIT with all counters in their initial state. + pub fn new() -> Self { + Pit { + counters: [PitCounter::new(), PitCounter::new(), PitCounter::new()], + } + } + + /// Check if the given I/O port belongs to the PIT. + pub fn handles_port(&self, port: u16) -> bool { + (PIT_COUNTER0..=PIT_COMMAND).contains(&port) + } + + /// Write to a PIT I/O port. + pub fn write_port(&mut self, port: u16, data: u8) { + match port { + PIT_COUNTER0 => self.counters[0].write_data(data), + PIT_COUNTER1 => self.counters[1].write_data(data), + PIT_COUNTER2 => self.counters[2].write_data(data), + PIT_COMMAND => self.write_command(data), + _ => {} + } + } + + /// Read from a PIT I/O port. + pub fn read_port(&mut self, port: u16) -> u8 { + match port { + PIT_COUNTER0 => self.counters[0].read_data(), + PIT_COUNTER1 => self.counters[1].read_data(), + PIT_COUNTER2 => self.counters[2].read_data(), + PIT_COMMAND => 0, // Command register is write-only. + _ => 0, + } + } + + /// Advance counter 0 by `elapsed_ns` nanoseconds. + /// + /// Returns the number of times counter 0 fired (should raise IRQ 0 + /// for each fire). + pub fn tick(&mut self, elapsed_ns: u64) -> u64 { + self.counters[0].tick(elapsed_ns) + } + + /// Parse and apply a control word written to port 0x43. + fn write_command(&mut self, data: u8) { + let counter_idx = ((data >> 6) & 0x03) as usize; + let access_bits = (data >> 4) & 0x03; + let mode_bits = (data >> 1) & 0x07; + + // Counter 3 is invalid (read-back command in 8254, not emulated). + if counter_idx >= 3 { + return; + } + + let access = match access_bits { + 0 => { + // Latch command: latch the current count. + self.counters[counter_idx].latch(); + return; + } + 1 => AccessMode::Low, + 2 => AccessMode::High, + 3 => AccessMode::LoThenHi, + _ => unreachable!(), + }; + + let mode = match mode_bits { + 0 => CounterMode::InterruptOnTerminal, + 2 | 6 => CounterMode::RateGenerator, + 3 | 7 => CounterMode::SquareWave, + m => CounterMode::Other(m), + }; + + self.counters[counter_idx].set_control(mode, access); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ---- PitCounter unit tests ---- + + #[test] + fn test_counter_initial_state() { + let counter = PitCounter::new(); + assert_eq!(counter.reload, 0); + assert!(!counter.reload_ready); + assert_eq!(counter.ns_accumulator, 0); + } + + #[test] + fn test_counter_tick_not_ready() { + let mut counter = PitCounter::new(); + assert_eq!(counter.tick(1_000_000), 0, "should not fire when not ready"); + } + + #[test] + fn test_counter_rate_generator_fires() { + let mut counter = PitCounter::new(); + counter.mode = CounterMode::RateGenerator; + counter.reload = 1193; // ~1000 Hz (1 ms period) + counter.reload_ready = true; + + // 1 ms = 1_000_000 ns, should fire ~1 time. + let fires = counter.tick(1_000_000); + assert_eq!(fires, 1); + } + + #[test] + fn test_counter_rate_generator_multiple_fires() { + let mut counter = PitCounter::new(); + counter.mode = CounterMode::RateGenerator; + counter.reload = 1193; // ~1000 Hz + counter.reload_ready = true; + + // 10 ms = 10_000_000 ns, should fire ~10 times. + let fires = counter.tick(10_000_000); + assert!( + fires >= 9 && fires <= 11, + "expected ~10 fires, got {}", + fires + ); + } + + #[test] + fn test_counter_accumulates_remainder() { + let mut counter = PitCounter::new(); + counter.mode = CounterMode::RateGenerator; + counter.reload = 11932; // ~100 Hz (~10.0005 ms period) + counter.reload_ready = true; + + // Tick 5ms — not enough for one full period. + let fires1 = counter.tick(5_000_000); + assert_eq!(fires1, 0); + + // Tick another 6ms — total 11ms, should fire once. + let fires2 = counter.tick(6_000_000); + assert_eq!(fires2, 1); + } + + #[test] + fn test_counter_square_wave_fires() { + let mut counter = PitCounter::new(); + counter.mode = CounterMode::SquareWave; + counter.reload = 1193; + counter.reload_ready = true; + + let fires = counter.tick(1_000_000); + assert_eq!(fires, 1); + } + + #[test] + fn test_counter_mode0_fires_once() { + let mut counter = PitCounter::new(); + counter.mode = CounterMode::InterruptOnTerminal; + counter.reload = 1193; + counter.reload_ready = true; + + let fires1 = counter.tick(1_000_000); + assert_eq!(fires1, 1); + + // Mode 0 is one-shot: should not fire again. + let fires2 = counter.tick(1_000_000); + assert_eq!(fires2, 0); + } + + #[test] + fn test_counter_zero_reload_no_fire() { + let mut counter = PitCounter::new(); + counter.mode = CounterMode::RateGenerator; + counter.reload = 0; // Zero reload should not fire. + counter.reload_ready = true; + + assert_eq!(counter.tick(1_000_000), 0); + } + + #[test] + fn test_counter_write_lo_hi_byte() { + let mut counter = PitCounter::new(); + counter.access = AccessMode::LoThenHi; + counter.mode = CounterMode::RateGenerator; + + // Write low byte first, then high byte. + counter.write_data(0x00); // Low byte. + assert!(!counter.reload_ready); + + counter.write_data(0x10); // High byte → reload = 0x1000. + assert!(counter.reload_ready); + assert_eq!(counter.reload, 0x1000); + } + + #[test] + fn test_counter_write_lo_byte_only() { + let mut counter = PitCounter::new(); + counter.access = AccessMode::Low; + counter.mode = CounterMode::RateGenerator; + + counter.write_data(0x42); + assert!(counter.reload_ready); + assert_eq!(counter.reload, 0x42); + } + + #[test] + fn test_counter_write_hi_byte_only() { + let mut counter = PitCounter::new(); + counter.access = AccessMode::High; + counter.mode = CounterMode::RateGenerator; + + counter.write_data(0x42); + assert!(counter.reload_ready); + assert_eq!(counter.reload, 0x4200); + } + + #[test] + fn test_counter_read_lo_hi_byte() { + let mut counter = PitCounter::new(); + counter.access = AccessMode::LoThenHi; + counter.reload = 0x1234; + + let lo = counter.read_data(); + assert_eq!(lo, 0x34); + + let hi = counter.read_data(); + assert_eq!(hi, 0x12); + } + + #[test] + fn test_counter_latch() { + let mut counter = PitCounter::new(); + counter.access = AccessMode::LoThenHi; + counter.reload = 0xABCD; + + counter.latch(); + assert_eq!(counter.latched_value, Some(0xABCD)); + + // Read should return latched value. + let lo = counter.read_data(); + assert_eq!(lo, 0xCD); + let hi = counter.read_data(); + assert_eq!(hi, 0xAB); + + // Latched value should be consumed. + assert_eq!(counter.latched_value, None); + } + + #[test] + fn test_counter_latch_only_once() { + let mut counter = PitCounter::new(); + counter.reload = 0x1111; + + counter.latch(); + counter.reload = 0x2222; // Change after latch. + counter.latch(); // Should NOT overwrite first latch. + + assert_eq!(counter.latched_value, Some(0x1111)); + } + + // ---- Pit (full device) tests ---- + + #[test] + fn test_pit_handles_port() { + let pit = Pit::new(); + assert!(pit.handles_port(PIT_COUNTER0)); + assert!(pit.handles_port(PIT_COUNTER1)); + assert!(pit.handles_port(PIT_COUNTER2)); + assert!(pit.handles_port(PIT_COMMAND)); + assert!(!pit.handles_port(0x44)); + assert!(!pit.handles_port(0x3F)); + } + + #[test] + fn test_pit_program_counter0_rate_generator() { + let mut pit = Pit::new(); + + // Program counter 0 in rate generator mode, lo-hi access. + // Control word: counter=0 (bits 7-6=00), access=lo-hi (bits 5-4=11), + // mode=2 (bits 3-1=010), BCD=0 (bit 0=0) + // = 0b_00_11_010_0 = 0x34 + pit.write_port(PIT_COMMAND, 0x34); + + // Write reload value: 11932 = 0x2E9C (100 Hz, 10ms period). + pit.write_port(PIT_COUNTER0, 0x9C); // Low byte. + pit.write_port(PIT_COUNTER0, 0x2E); // High byte. + + assert_eq!(pit.counters[0].reload, 0x2E9C); + assert!(pit.counters[0].reload_ready); + + // Tick 11ms — one period is ~10.0005ms, so 11ms is enough for one fire. + let fires = pit.tick(11_000_000); + assert_eq!(fires, 1); + } + + #[test] + fn test_pit_program_counter0_square_wave() { + let mut pit = Pit::new(); + + // Counter 0, lo-hi, mode 3 (square wave), binary. + // = 0b_00_11_011_0 = 0x36 + pit.write_port(PIT_COMMAND, 0x36); + + // Reload 11932 = ~100 Hz (~10.0005ms period). + pit.write_port(PIT_COUNTER0, 0x9C); + pit.write_port(PIT_COUNTER0, 0x2E); + + let fires = pit.tick(11_000_000); + assert_eq!(fires, 1); + } + + #[test] + fn test_pit_latch_command() { + let mut pit = Pit::new(); + + // Program counter 0. + pit.write_port(PIT_COMMAND, 0x34); + pit.write_port(PIT_COUNTER0, 0x00); + pit.write_port(PIT_COUNTER0, 0x10); // reload = 0x1000 + + // Latch counter 0: control word with access=00. + pit.write_port(PIT_COMMAND, 0x00); + + // Read latched value. + let lo = pit.read_port(PIT_COUNTER0); + let hi = pit.read_port(PIT_COUNTER0); + let val = lo as u16 | ((hi as u16) << 8); + assert_eq!(val, 0x1000); + } + + #[test] + fn test_pit_command_register_read_is_zero() { + let mut pit = Pit::new(); + assert_eq!(pit.read_port(PIT_COMMAND), 0); + } + + #[test] + fn test_pit_counter1_counter2_ignored() { + let mut pit = Pit::new(); + + // Programming counters 1 and 2 shouldn't affect tick(). + pit.write_port(PIT_COMMAND, 0x74); // Counter 1, lo-hi, mode 2. + pit.write_port(PIT_COUNTER1, 0x00); + pit.write_port(PIT_COUNTER1, 0x01); + + // Tick should only look at counter 0. + assert_eq!(pit.tick(10_000_000), 0); + } + + #[test] + fn test_pit_no_fire_before_program() { + let mut pit = Pit::new(); + assert_eq!( + pit.tick(100_000_000), + 0, + "should not fire when unprogrammed" + ); + } + + #[test] + fn test_pit_linux_typical_1000hz() { + let mut pit = Pit::new(); + + // Linux HZ=1000 programs PIT with reload = 1193 (≈1ms period). + pit.write_port(PIT_COMMAND, 0x34); + pit.write_port(PIT_COUNTER0, (1193 & 0xFF) as u8); + pit.write_port(PIT_COUNTER0, (1193 >> 8) as u8); + + // 1 second = 1_000_000_000 ns → should fire ~1000 times. + let fires = pit.tick(1_000_000_000); + assert!( + fires >= 998 && fires <= 1002, + "expected ~1000 fires for HZ=1000, got {}", + fires + ); + } + + #[test] + fn test_pit_linux_typical_100hz() { + let mut pit = Pit::new(); + + // Linux HZ=100 programs PIT with reload = 11932 (≈10ms period). + pit.write_port(PIT_COMMAND, 0x34); + pit.write_port(PIT_COUNTER0, (11932 & 0xFF) as u8); + pit.write_port(PIT_COUNTER0, (11932 >> 8) as u8); + + // 1 second → should fire ~100 times. + let fires = pit.tick(1_000_000_000); + assert!( + fires >= 99 && fires <= 101, + "expected ~100 fires for HZ=100, got {}", + fires + ); + } + + #[test] + fn test_pit_incremental_ticks() { + let mut pit = Pit::new(); + + // HZ=100: reload = 11932. + pit.write_port(PIT_COMMAND, 0x34); + pit.write_port(PIT_COUNTER0, (11932 & 0xFF) as u8); + pit.write_port(PIT_COUNTER0, (11932 >> 8) as u8); + + // Tick in small increments (1ms each) for 100ms total. + let mut total_fires = 0u64; + for _ in 0..100 { + total_fires += pit.tick(1_000_000); + } + // 100ms at HZ=100 → should fire ~10 times. + assert!( + total_fires >= 9 && total_fires <= 11, + "expected ~10 fires over 100ms, got {}", + total_fires + ); + } +} diff --git a/src/vmm/src/windows/devices/serial.rs b/src/vmm/src/windows/devices/serial.rs new file mode 100644 index 000000000..f188354ad --- /dev/null +++ b/src/vmm/src/windows/devices/serial.rs @@ -0,0 +1,381 @@ +//! 16550 UART serial console emulation. +//! +//! Emulates a basic 16550 UART at I/O ports 0x3F8-0x3FF (COM1). +//! Provides serial console output from the guest kernel/userspace. +//! +//! Register layout (base = 0x3F8): +//! +0 (THR/RBR): Transmit/Receive buffer +//! +1 (IER): Interrupt Enable Register +//! +2 (IIR/FCR): Interrupt Identification / FIFO Control +//! +3 (LCR): Line Control Register +//! +4 (MCR): Modem Control Register +//! +5 (LSR): Line Status Register +//! +6 (MSR): Modem Status Register +//! +7 (SCR): Scratch Register +//! +//! When DLAB (bit 7 of LCR) is set: +//! +0 (DLL): Divisor Latch Low +//! +1 (DLH): Divisor Latch High + +use std::io::Write; +use std::sync::Mutex; + +use super::super::vcpu::IoHandler; + +/// COM1 base I/O port address. +pub const COM1_BASE: u16 = 0x3F8; + +/// COM1 I/O port range (8 registers). +pub const COM1_SIZE: u16 = 8; + +/// Line Status Register bit flags. +const LSR_DATA_READY: u8 = 0x01; +const LSR_THR_EMPTY: u8 = 0x20; +const LSR_IDLE: u8 = 0x40; + +/// Interrupt Identification Register values. +const IIR_NO_INTERRUPT: u8 = 0x01; +const IIR_THRE: u8 = 0x02; // Transmitter Holding Register Empty +const IIR_FIFO_ENABLED: u8 = 0xC0; + +/// IER bit: Transmitter Holding Register Empty interrupt. +const IER_THRE: u8 = 0x02; + +/// Serial port state. +struct SerialState { + /// Interrupt Enable Register. + ier: u8, + /// Line Control Register. + lcr: u8, + /// Modem Control Register. + mcr: u8, + /// Line Status Register. + lsr: u8, + /// Modem Status Register. + msr: u8, + /// Scratch register. + scr: u8, + /// Divisor Latch Low byte. + dll: u8, + /// Divisor Latch High byte. + dlh: u8, + /// Output sink. + output: Box, + /// THRE interrupt pending (set after THR write when IER THRE bit is set). + thre_pending: bool, +} + +/// 16550 UART emulation. +pub struct Serial { + base_port: u16, + state: Mutex, +} + +impl Serial { + /// Create a new serial port emulation at the given base I/O port. + pub fn new(base_port: u16, output: Box) -> Self { + Serial { + base_port, + state: Mutex::new(SerialState { + ier: 0, + lcr: 0, + mcr: 0, + lsr: LSR_THR_EMPTY | LSR_IDLE, // Transmitter is ready + msr: 0, + scr: 0, + dll: 0, + dlh: 0, + output, + thre_pending: false, + }), + } + } + + /// Create a serial port that writes to stdout. + pub fn stdout(base_port: u16) -> Self { + Self::new(base_port, Box::new(std::io::stdout())) + } + + /// Check if the given I/O port is within this serial port's range. + pub fn handles_port(&self, port: u16) -> bool { + port >= self.base_port && port < self.base_port + COM1_SIZE + } + + /// Check if the serial device has a pending interrupt. + pub fn has_interrupt(&self) -> bool { + self.state.lock().unwrap().thre_pending + } + + /// Handle an I/O port read. + pub fn read(&self, port: u16) -> u8 { + let offset = port - self.base_port; + let mut state = self.state.lock().unwrap(); + let dlab = (state.lcr & 0x80) != 0; + + match offset { + 0 => { + if dlab { + state.dll + } else { + // RBR — receive buffer (no input support yet, return 0) + state.lsr &= !LSR_DATA_READY; + 0 + } + } + 1 => { + if dlab { + state.dlh + } else { + state.ier + } + } + 2 => { + // IIR — check for pending interrupt + if state.thre_pending { + state.thre_pending = false; + IIR_THRE | IIR_FIFO_ENABLED + } else { + IIR_NO_INTERRUPT | IIR_FIFO_ENABLED + } + } + 3 => state.lcr, + 4 => state.mcr, + 5 => { + let lsr = state.lsr; + // Reading LSR clears some bits + state.lsr &= !(LSR_DATA_READY); + lsr + } + 6 => state.msr, + 7 => state.scr, + _ => 0, + } + } + + /// Handle an I/O port write. + pub fn write(&self, port: u16, data: u8) { + let offset = port - self.base_port; + let mut state = self.state.lock().unwrap(); + let dlab = (state.lcr & 0x80) != 0; + + match offset { + 0 => { + if dlab { + state.dll = data; + } else { + // THR — transmit holding register: output the character + let _ = state.output.write_all(&[data]); + let _ = state.output.flush(); + // THR is always ready (we write synchronously) + state.lsr |= LSR_THR_EMPTY | LSR_IDLE; + // Signal THRE interrupt if enabled + if state.ier & IER_THRE != 0 { + state.thre_pending = true; + } + } + } + 1 => { + if dlab { + state.dlh = data; + } else { + let old_ier = state.ier; + state.ier = data & 0x0F; // Only lower 4 bits valid + // Enabling THRE interrupt when THR is already empty triggers it + if (state.ier & IER_THRE != 0) + && (old_ier & IER_THRE == 0) + && (state.lsr & LSR_THR_EMPTY != 0) + { + state.thre_pending = true; + } + } + } + 2 => { + // FCR — FIFO control (we acknowledge but don't implement FIFO) + } + 3 => state.lcr = data, + 4 => state.mcr = data & 0x1F, // Only lower 5 bits valid + 5 => {} // LSR is read-only + 6 => {} // MSR is read-only + 7 => state.scr = data, + _ => {} + } + } +} + +impl IoHandler for Serial { + fn io_read(&self, port: u16, _size: u8) -> u32 { + self.read(port) as u32 + } + + fn io_write(&self, port: u16, _size: u8, data: u32) { + self.write(port, data as u8); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{Arc, Mutex as StdMutex}; + + /// A test output sink that captures written bytes. + struct CaptureOutput { + buffer: Arc>>, + } + + impl CaptureOutput { + fn new() -> (Self, Arc>>) { + let buffer = Arc::new(StdMutex::new(Vec::new())); + ( + CaptureOutput { + buffer: buffer.clone(), + }, + buffer, + ) + } + } + + impl Write for CaptureOutput { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.buffer.lock().unwrap().extend_from_slice(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + fn create_test_serial() -> (Serial, Arc>>) { + let (output, buffer) = CaptureOutput::new(); + let serial = Serial::new(COM1_BASE, Box::new(output)); + (serial, buffer) + } + + #[test] + fn test_serial_handles_port() { + let (serial, _) = create_test_serial(); + assert!(serial.handles_port(COM1_BASE)); + assert!(serial.handles_port(COM1_BASE + 7)); + assert!(!serial.handles_port(COM1_BASE - 1)); + assert!(!serial.handles_port(COM1_BASE + 8)); + } + + #[test] + fn test_serial_lsr_initially_ready() { + let (serial, _) = create_test_serial(); + let lsr = serial.read(COM1_BASE + 5); + assert_ne!(lsr & LSR_THR_EMPTY, 0, "THR should be empty initially"); + assert_ne!(lsr & LSR_IDLE, 0, "transmitter should be idle initially"); + } + + #[test] + fn test_serial_write_character() { + let (serial, buffer) = create_test_serial(); + + serial.write(COM1_BASE, b'H'); + serial.write(COM1_BASE, b'i'); + + let captured = buffer.lock().unwrap(); + assert_eq!(&*captured, b"Hi"); + } + + #[test] + fn test_serial_write_string() { + let (serial, buffer) = create_test_serial(); + + for &byte in b"Hello, VM!\n" { + serial.write(COM1_BASE, byte); + } + + let captured = buffer.lock().unwrap(); + assert_eq!(std::str::from_utf8(&captured).unwrap(), "Hello, VM!\n"); + } + + #[test] + fn test_serial_scratch_register() { + let (serial, _) = create_test_serial(); + + serial.write(COM1_BASE + 7, 0x42); + assert_eq!(serial.read(COM1_BASE + 7), 0x42); + + serial.write(COM1_BASE + 7, 0xFF); + assert_eq!(serial.read(COM1_BASE + 7), 0xFF); + } + + #[test] + fn test_serial_dlab_divisor_latch() { + let (serial, _) = create_test_serial(); + + // Set DLAB bit in LCR + serial.write(COM1_BASE + 3, 0x80); + + // Write divisor + serial.write(COM1_BASE, 0x01); // DLL + serial.write(COM1_BASE + 1, 0x00); // DLH + + // Read divisor back + assert_eq!(serial.read(COM1_BASE), 0x01); // DLL + assert_eq!(serial.read(COM1_BASE + 1), 0x00); // DLH + + // Clear DLAB + serial.write(COM1_BASE + 3, 0x03); // 8N1 + + // Now register 0 is THR/RBR again, not DLL + // Writing should output a character, not change the divisor + let (serial2, buffer2) = create_test_serial(); + serial2.write(COM1_BASE + 3, 0x03); // 8N1, DLAB=0 + serial2.write(COM1_BASE, b'X'); + let captured = buffer2.lock().unwrap(); + assert_eq!(&*captured, b"X"); + } + + #[test] + fn test_serial_ier_mask() { + let (serial, _) = create_test_serial(); + + // IER only uses lower 4 bits + serial.write(COM1_BASE + 1, 0xFF); + assert_eq!(serial.read(COM1_BASE + 1), 0x0F); + } + + #[test] + fn test_serial_mcr_mask() { + let (serial, _) = create_test_serial(); + + // MCR only uses lower 5 bits + serial.write(COM1_BASE + 4, 0xFF); + assert_eq!(serial.read(COM1_BASE + 4), 0x1F); + } + + #[test] + fn test_serial_iir_no_interrupt() { + let (serial, _) = create_test_serial(); + + let iir = serial.read(COM1_BASE + 2); + assert_ne!(iir & IIR_NO_INTERRUPT, 0, "no interrupt should be pending"); + } + + #[test] + fn test_serial_io_handler_trait() { + let (serial, buffer) = create_test_serial(); + + // Use through IoHandler trait + serial.io_write(COM1_BASE, 1, b'A' as u32); + serial.io_write(COM1_BASE, 1, b'B' as u32); + + let lsr = serial.io_read(COM1_BASE + 5, 1); + assert_ne!(lsr & LSR_THR_EMPTY as u32, 0); + + let captured = buffer.lock().unwrap(); + assert_eq!(&*captured, b"AB"); + } + + #[test] + fn test_serial_thr_stays_ready_after_write() { + let (serial, _) = create_test_serial(); + + serial.write(COM1_BASE, b'X'); + let lsr = serial.read(COM1_BASE + 5); + assert_ne!(lsr & LSR_THR_EMPTY, 0, "THR should be ready after write"); + } +} diff --git a/src/vmm/src/windows/devices/virtio/block.rs b/src/vmm/src/windows/devices/virtio/block.rs new file mode 100644 index 000000000..a544b3b8a --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/block.rs @@ -0,0 +1,635 @@ +//! Virtio-blk device backend (virtio spec v1.2 Section 5.2). +//! +//! Provides a file-backed block device that processes read, write, +//! and flush requests through the virtqueue. + +use super::disk::DiskBackend; +use super::mmio::VirtioDeviceBackend; +use super::queue::{Descriptor, GuestMemoryAccessor, Virtqueue}; + +/// Virtio device ID for block devices. +const VIRTIO_BLK_ID: u32 = 2; + +/// Block size in bytes (standard sector size). +const SECTOR_SIZE: u64 = 512; + +// Virtio-blk feature bits. +/// Device has a maximum size (not used for now). +#[allow(dead_code)] +const VIRTIO_BLK_F_SIZE_MAX: u32 = 1; +/// Device has a maximum segment size (not used for now). +#[allow(dead_code)] +const VIRTIO_BLK_F_SEG_MAX: u32 = 2; +/// Read-only device. +const VIRTIO_BLK_F_RO: u32 = 5; +/// VIRTIO_F_VERSION_1 — required for virtio 1.0+. +const VIRTIO_F_VERSION_1: u32 = 0; // Bit 32, goes in features page 1. + +// Virtio-blk request types. +const VIRTIO_BLK_T_IN: u32 = 0; // Read from disk. +const VIRTIO_BLK_T_OUT: u32 = 1; // Write to disk. +const VIRTIO_BLK_T_FLUSH: u32 = 4; // Flush. + +// Virtio-blk status values. +const VIRTIO_BLK_S_OK: u8 = 0; +const VIRTIO_BLK_S_IOERR: u8 = 1; +const VIRTIO_BLK_S_UNSUPP: u8 = 2; + +/// Virtio-blk device backed by a `DiskBackend`. +pub struct VirtioBlock { + disk: Box, + capacity: u64, // In sectors. + read_only: bool, +} + +impl VirtioBlock { + /// Create a new virtio-blk device from a disk backend. + /// + /// `read_only` marks the device as read-only (rejects write requests). + pub fn new(disk: Box, read_only: bool) -> Self { + let capacity = disk.capacity_bytes() / SECTOR_SIZE; + VirtioBlock { + disk, + capacity, + read_only, + } + } + + /// Get disk capacity in sectors. + pub fn capacity(&self) -> u64 { + self.capacity + } + + /// Process a single virtio-blk request from a descriptor chain. + fn process_request(&mut self, chain: &[Descriptor], mem: &dyn GuestMemoryAccessor) -> u8 { + // Minimum: header + status (flush has no data descriptor). + if chain.len() < 2 { + return VIRTIO_BLK_S_IOERR; + } + + // First descriptor: request header (device-readable). + let header_desc = &chain[0]; + if header_desc.len < 16 { + return VIRTIO_BLK_S_IOERR; + } + + let mut header_buf = [0u8; 16]; + if mem.read_at(header_desc.addr, &mut header_buf).is_err() { + return VIRTIO_BLK_S_IOERR; + } + + let req_type = + u32::from_le_bytes([header_buf[0], header_buf[1], header_buf[2], header_buf[3]]); + let sector = u64::from_le_bytes([ + header_buf[8], + header_buf[9], + header_buf[10], + header_buf[11], + header_buf[12], + header_buf[13], + header_buf[14], + header_buf[15], + ]); + + // Middle descriptors: data buffer(s) (may be empty for flush). + // Last descriptor: status byte (device-writable). + let data_descs = &chain[1..chain.len() - 1]; + + match req_type { + VIRTIO_BLK_T_IN => { + if data_descs.is_empty() { + return VIRTIO_BLK_S_IOERR; + } + self.handle_read(sector, data_descs, mem) + } + VIRTIO_BLK_T_OUT => { + if data_descs.is_empty() { + return VIRTIO_BLK_S_IOERR; + } + self.handle_write(sector, data_descs, mem) + } + VIRTIO_BLK_T_FLUSH => self.handle_flush(), + _ => VIRTIO_BLK_S_UNSUPP, + } + } + + fn handle_read( + &mut self, + sector: u64, + data_descs: &[Descriptor], + mem: &dyn GuestMemoryAccessor, + ) -> u8 { + let mut offset = sector * SECTOR_SIZE; + + for desc in data_descs { + if !desc.is_write() { + return VIRTIO_BLK_S_IOERR; // Data buffer must be device-writable for reads. + } + let mut buf = vec![0u8; desc.len as usize]; + if self.disk.read_at(offset, &mut buf).is_err() { + return VIRTIO_BLK_S_IOERR; + } + if mem.write_at(desc.addr, &buf).is_err() { + return VIRTIO_BLK_S_IOERR; + } + offset += desc.len as u64; + } + VIRTIO_BLK_S_OK + } + + fn handle_write( + &mut self, + sector: u64, + data_descs: &[Descriptor], + mem: &dyn GuestMemoryAccessor, + ) -> u8 { + if self.read_only { + return VIRTIO_BLK_S_IOERR; + } + + let mut offset = sector * SECTOR_SIZE; + + for desc in data_descs { + if desc.is_write() { + return VIRTIO_BLK_S_IOERR; // Data buffer must be device-readable for writes. + } + let mut buf = vec![0u8; desc.len as usize]; + if mem.read_at(desc.addr, &mut buf).is_err() { + return VIRTIO_BLK_S_IOERR; + } + if self.disk.write_at(offset, &buf).is_err() { + return VIRTIO_BLK_S_IOERR; + } + offset += desc.len as u64; + } + VIRTIO_BLK_S_OK + } + + fn handle_flush(&mut self) -> u8 { + if self.disk.flush().is_err() { + VIRTIO_BLK_S_IOERR + } else { + VIRTIO_BLK_S_OK + } + } +} + +impl VirtioDeviceBackend for VirtioBlock { + fn device_id(&self) -> u32 { + VIRTIO_BLK_ID + } + + fn device_features(&self, page: u32) -> u32 { + match page { + 0 => { + let mut features = 0u32; + if self.read_only { + features |= 1 << VIRTIO_BLK_F_RO; + } + features + } + 1 => 1 << VIRTIO_F_VERSION_1, // VIRTIO_F_VERSION_1 is bit 32 (page 1, bit 0). + _ => 0, + } + } + + fn read_config(&self, offset: u64) -> u32 { + // Config space: capacity (u64 at offset 0). + match offset { + 0 => self.capacity as u32, // Low 32 bits. + 4 => (self.capacity >> 32) as u32, // High 32 bits. + _ => 0, + } + } + + fn queue_notify( + &mut self, + _queue_idx: u32, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + let mut processed = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + // Write error status and continue. + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + }; + + let status = self.process_request(&chain, mem); + + // Write status byte to last descriptor. + if let Some(status_desc) = chain.last() { + let _ = mem.write_at(status_desc.addr, &[status]); + } + + // Calculate total bytes written to device-writable descriptors. + let total_written: u32 = chain.iter().filter(|d| d.is_write()).map(|d| d.len).sum(); + + let _ = queue.add_used(head, total_written, mem); + processed = true; + } + + processed + } + + fn num_queues(&self) -> usize { + 1 // Virtio-blk uses a single request queue. + } + + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + 256 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::disk::RawDiskBackend; + use super::super::super::error::WkrunError; + use std::cell::RefCell; + use std::fs::File; + use std::io::Write as IoWrite; + use tempfile::NamedTempFile; + + struct MockMem { + data: RefCell>, + } + + impl MockMem { + fn new(size: usize) -> Self { + MockMem { + data: RefCell::new(vec![0u8; size]), + } + } + + fn write_bytes(&self, addr: u64, bytes: &[u8]) { + let a = addr as usize; + let mut data = self.data.borrow_mut(); + data[a..a + bytes.len()].copy_from_slice(bytes); + } + + fn read_bytes(&self, addr: u64, len: usize) -> Vec { + let a = addr as usize; + let data = self.data.borrow(); + data[a..a + len].to_vec() + } + } + + impl GuestMemoryAccessor for MockMem { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> super::super::super::error::Result<()> { + let a = addr as usize; + let data = self.data.borrow(); + if a + buf.len() > data.len() { + return Err(WkrunError::Memory("out of bounds".into())); + } + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + fn write_at(&self, addr: u64, data: &[u8]) -> super::super::super::error::Result<()> { + let a = addr as usize; + let mut mem = self.data.borrow_mut(); + if a + data.len() > mem.len() { + return Err(WkrunError::Memory("out of bounds".into())); + } + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + fn create_test_disk(sectors: u64) -> NamedTempFile { + let mut f = NamedTempFile::new().unwrap(); + let data = vec![0u8; (sectors * SECTOR_SIZE) as usize]; + f.write_all(&data).unwrap(); + f.flush().unwrap(); + f + } + + fn create_disk_with_pattern(sectors: u64) -> NamedTempFile { + let mut f = NamedTempFile::new().unwrap(); + for sector in 0..sectors { + let pattern = vec![(sector & 0xFF) as u8; SECTOR_SIZE as usize]; + f.write_all(&pattern).unwrap(); + } + f.flush().unwrap(); + f + } + + fn open_raw_backend(tmp: &NamedTempFile, read_only: bool) -> Box { + let file = File::options() + .read(true) + .write(!read_only) + .open(tmp.path()) + .unwrap(); + Box::new(RawDiskBackend::new(file).unwrap()) + } + + // --- Construction --- + + #[test] + fn test_new_block_device() { + let tmp = create_test_disk(8); + let backend = open_raw_backend(&tmp, false); + let blk = VirtioBlock::new(backend, false); + assert_eq!(blk.capacity(), 8); + assert_eq!(blk.device_id(), VIRTIO_BLK_ID); + } + + #[test] + fn test_empty_disk_error() { + let tmp = NamedTempFile::new().unwrap(); + let file = File::open(tmp.path()).unwrap(); + assert!(RawDiskBackend::new(file).is_err()); + } + + #[test] + fn test_read_only_features() { + let tmp = create_test_disk(1); + let backend = open_raw_backend(&tmp, true); + let blk = VirtioBlock::new(backend, true); + let features = blk.device_features(0); + assert_ne!(features & (1 << VIRTIO_BLK_F_RO), 0); + } + + // --- Config space --- + + #[test] + fn test_config_capacity() { + let tmp = create_test_disk(1024); + let backend = open_raw_backend(&tmp, false); + let blk = VirtioBlock::new(backend, false); + assert_eq!(blk.read_config(0), 1024); // Low. + assert_eq!(blk.read_config(4), 0); // High. + } + + // --- Request processing (direct) --- + + #[test] + fn test_read_request() { + let tmp = create_disk_with_pattern(4); + let backend = open_raw_backend(&tmp, false); + let mut blk = VirtioBlock::new(backend, false); + let mem = MockMem::new(0x10000); + + // Write request header: type=IN, sector=2. + let mut header = [0u8; 16]; + header[0..4].copy_from_slice(&VIRTIO_BLK_T_IN.to_le_bytes()); + header[8..16].copy_from_slice(&2u64.to_le_bytes()); + mem.write_bytes(0x1000, &header); + + // Build descriptor chain. + let chain = vec![ + Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }, // Header (device-readable). + Descriptor { + addr: 0x2000, + len: 512, + flags: 2, + next: 0, + }, // Data (device-writable). + Descriptor { + addr: 0x3000, + len: 1, + flags: 2, + next: 0, + }, // Status (device-writable). + ]; + + let status = blk.process_request(&chain, &mem); + assert_eq!(status, VIRTIO_BLK_S_OK); + + // Check that data was read (sector 2 pattern = 0x02). + let data = mem.read_bytes(0x2000, 512); + assert!(data.iter().all(|&b| b == 0x02)); + } + + #[test] + fn test_write_request() { + let tmp = create_test_disk(4); + let backend = open_raw_backend(&tmp, false); + let mut blk = VirtioBlock::new(backend, false); + let mem = MockMem::new(0x10000); + + // Header: type=OUT, sector=1. + let mut header = [0u8; 16]; + header[0..4].copy_from_slice(&VIRTIO_BLK_T_OUT.to_le_bytes()); + header[8..16].copy_from_slice(&1u64.to_le_bytes()); + mem.write_bytes(0x1000, &header); + + // Data to write (device-readable). + let write_data = vec![0xABu8; 512]; + mem.write_bytes(0x2000, &write_data); + + let chain = vec![ + Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }, + Descriptor { + addr: 0x2000, + len: 512, + flags: 0, + next: 0, + }, // Device-readable. + Descriptor { + addr: 0x3000, + len: 1, + flags: 2, + next: 0, + }, // Status. + ]; + + let status = blk.process_request(&chain, &mem); + assert_eq!(status, VIRTIO_BLK_S_OK); + + // Verify by reading back. + let mut header2 = [0u8; 16]; + header2[0..4].copy_from_slice(&VIRTIO_BLK_T_IN.to_le_bytes()); + header2[8..16].copy_from_slice(&1u64.to_le_bytes()); + mem.write_bytes(0x4000, &header2); + + let read_chain = vec![ + Descriptor { + addr: 0x4000, + len: 16, + flags: 0, + next: 0, + }, + Descriptor { + addr: 0x5000, + len: 512, + flags: 2, + next: 0, + }, + Descriptor { + addr: 0x6000, + len: 1, + flags: 2, + next: 0, + }, + ]; + + let status2 = blk.process_request(&read_chain, &mem); + assert_eq!(status2, VIRTIO_BLK_S_OK); + let readback = mem.read_bytes(0x5000, 512); + assert!(readback.iter().all(|&b| b == 0xAB)); + } + + #[test] + fn test_write_rejected_on_read_only() { + let tmp = create_test_disk(4); + let backend = open_raw_backend(&tmp, false); + let mut blk = VirtioBlock::new(backend, true); + let mem = MockMem::new(0x10000); + + let mut header = [0u8; 16]; + header[0..4].copy_from_slice(&VIRTIO_BLK_T_OUT.to_le_bytes()); + mem.write_bytes(0x1000, &header); + + let chain = vec![ + Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }, + Descriptor { + addr: 0x2000, + len: 512, + flags: 0, + next: 0, + }, + Descriptor { + addr: 0x3000, + len: 1, + flags: 2, + next: 0, + }, + ]; + + let status = blk.process_request(&chain, &mem); + assert_eq!(status, VIRTIO_BLK_S_IOERR); + } + + #[test] + fn test_flush_request() { + let tmp = create_test_disk(4); + let backend = open_raw_backend(&tmp, false); + let mut blk = VirtioBlock::new(backend, false); + let mem = MockMem::new(0x10000); + + let mut header = [0u8; 16]; + header[0..4].copy_from_slice(&VIRTIO_BLK_T_FLUSH.to_le_bytes()); + mem.write_bytes(0x1000, &header); + + let chain = vec![ + Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }, + Descriptor { + addr: 0x3000, + len: 1, + flags: 2, + next: 0, + }, + ]; + + let status = blk.process_request(&chain, &mem); + assert_eq!(status, VIRTIO_BLK_S_OK); + } + + #[test] + fn test_unsupported_request_type() { + let tmp = create_test_disk(4); + let backend = open_raw_backend(&tmp, false); + let mut blk = VirtioBlock::new(backend, false); + let mem = MockMem::new(0x10000); + + let mut header = [0u8; 16]; + header[0..4].copy_from_slice(&99u32.to_le_bytes()); // Unknown type. + mem.write_bytes(0x1000, &header); + + let chain = vec![ + Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }, + Descriptor { + addr: 0x2000, + len: 512, + flags: 2, + next: 0, + }, + Descriptor { + addr: 0x3000, + len: 1, + flags: 2, + next: 0, + }, + ]; + + let status = blk.process_request(&chain, &mem); + assert_eq!(status, VIRTIO_BLK_S_UNSUPP); + } + + #[test] + fn test_short_chain_error() { + let tmp = create_test_disk(4); + let backend = open_raw_backend(&tmp, false); + let mut blk = VirtioBlock::new(backend, false); + let mem = MockMem::new(0x10000); + + let chain = vec![Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }]; + + let status = blk.process_request(&chain, &mem); + assert_eq!(status, VIRTIO_BLK_S_IOERR); + } + + // --- VirtioDeviceBackend trait --- + + #[test] + fn test_version_1_feature() { + let tmp = create_test_disk(1); + let backend = open_raw_backend(&tmp, false); + let blk = VirtioBlock::new(backend, false); + let features_page1 = blk.device_features(1); + assert_eq!(features_page1, 1); // Bit 0 of page 1 = VIRTIO_F_VERSION_1. + } + + #[test] + fn test_num_queues() { + let tmp = create_test_disk(1); + let backend = open_raw_backend(&tmp, false); + let blk = VirtioBlock::new(backend, false); + assert_eq!(blk.num_queues(), 1); + } + + #[test] + fn test_queue_max_size() { + let tmp = create_test_disk(1); + let backend = open_raw_backend(&tmp, false); + let blk = VirtioBlock::new(backend, false); + assert_eq!(blk.queue_max_size(0), 256); + } +} diff --git a/src/vmm/src/windows/devices/virtio/disk.rs b/src/vmm/src/windows/devices/virtio/disk.rs new file mode 100644 index 000000000..f98f3d4eb --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/disk.rs @@ -0,0 +1,1078 @@ +//! Disk backend abstraction for virtio-blk. +//! +//! Provides a `DiskBackend` trait to abstract block device I/O, +//! with implementations for raw files and qcow2 images. + +use std::fs::File; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::path::Path; + +use super::super::super::error::{Result, WkrunError}; +/// Disk format: raw file passthrough. +pub const DISK_FORMAT_RAW: u32 = 0; +/// Disk format: qcow2 image. +pub const DISK_FORMAT_QCOW2: u32 = 1; + +/// Abstract block device I/O. +/// +/// Backends translate guest sector reads/writes to the underlying +/// storage format (raw file, qcow2 image, etc.). +pub trait DiskBackend { + /// Read `buf.len()` bytes starting at `offset` into `buf`. + fn read_at(&mut self, offset: u64, buf: &mut [u8]) -> Result<()>; + + /// Write `buf` starting at `offset`. + fn write_at(&mut self, offset: u64, buf: &[u8]) -> Result<()>; + + /// Flush pending writes to stable storage. + fn flush(&mut self) -> Result<()>; + + /// Virtual disk size in bytes. + fn capacity_bytes(&self) -> u64; +} + +// --------------------------------------------------------------------------- +// Raw disk backend +// --------------------------------------------------------------------------- + +/// Raw file-backed disk — direct passthrough to the host file. +pub struct RawDiskBackend { + file: File, + capacity: u64, +} + +impl RawDiskBackend { + /// Wrap an open file as a raw disk backend. + /// + /// The file size must be > 0 (i.e., the file must not be empty). + pub fn new(file: File) -> Result { + let metadata = file + .metadata() + .map_err(|e| WkrunError::Device(format!("failed to get disk metadata: {}", e)))?; + let capacity = metadata.len(); + if capacity == 0 { + return Err(WkrunError::Device("disk file is empty".into())); + } + Ok(RawDiskBackend { file, capacity }) + } +} + +impl DiskBackend for RawDiskBackend { + fn read_at(&mut self, offset: u64, buf: &mut [u8]) -> Result<()> { + self.file + .seek(SeekFrom::Start(offset)) + .map_err(|e| WkrunError::Device(format!("disk seek failed: {}", e)))?; + self.file + .read_exact(buf) + .map_err(|e| WkrunError::Device(format!("disk read failed: {}", e)))?; + Ok(()) + } + + fn write_at(&mut self, offset: u64, buf: &[u8]) -> Result<()> { + self.file + .seek(SeekFrom::Start(offset)) + .map_err(|e| WkrunError::Device(format!("disk seek failed: {}", e)))?; + self.file + .write_all(buf) + .map_err(|e| WkrunError::Device(format!("disk write failed: {}", e)))?; + Ok(()) + } + + fn flush(&mut self) -> Result<()> { + self.file + .sync_all() + .map_err(|e| WkrunError::Device(format!("disk flush failed: {}", e)))?; + Ok(()) + } + + fn capacity_bytes(&self) -> u64 { + self.capacity + } +} + +// --------------------------------------------------------------------------- +// qcow2 disk backend +// --------------------------------------------------------------------------- + +/// qcow2 magic number: 'Q', 'F', 'I', 0xFB. +const QCOW2_MAGIC: u32 = 0x514649FB; + +/// Mask to extract the cluster-aligned file offset from an L1 or L2 entry. +/// Bits 55:9 — zeroes out the top flag bits and the sub-cluster offset. +const L2_OFFSET_MASK: u64 = 0x00FF_FFFF_FFFF_FE00; + +/// Parsed qcow2 header (fields common to v2 and v3). +#[derive(Debug)] +struct Qcow2Header { + #[allow(dead_code)] + version: u32, + cluster_bits: u32, + size: u64, // Virtual disk size in bytes. + l1_size: u32, // Number of entries in the L1 table. + l1_table_offset: u64, // File offset of the L1 table. + refcount_table_offset: u64, + refcount_table_clusters: u32, + refcount_order: u32, // log2(refcount bits); 4 means 16-bit refcounts. +} + +impl Qcow2Header { + /// Parse a qcow2 header from the first 104 bytes of the file. + fn parse(buf: &[u8; 104]) -> Result { + let magic = u32::from_be_bytes([buf[0], buf[1], buf[2], buf[3]]); + if magic != QCOW2_MAGIC { + return Err(WkrunError::Device(format!( + "not a qcow2 image: bad magic 0x{:08X}", + magic + ))); + } + + let version = u32::from_be_bytes([buf[4], buf[5], buf[6], buf[7]]); + if version != 2 && version != 3 { + return Err(WkrunError::Device(format!( + "unsupported qcow2 version: {}", + version + ))); + } + + // Backing file offset (u64 at 8) — must be zero (no backing files). + let backing_file_offset = u64::from_be_bytes(buf[8..16].try_into().unwrap()); + if backing_file_offset != 0 { + return Err(WkrunError::Device( + "qcow2 backing files are not supported".into(), + )); + } + + let cluster_bits = u32::from_be_bytes([buf[20], buf[21], buf[22], buf[23]]); + if !(9..=21).contains(&cluster_bits) { + return Err(WkrunError::Device(format!( + "invalid qcow2 cluster_bits: {}", + cluster_bits + ))); + } + + let size = u64::from_be_bytes(buf[24..32].try_into().unwrap()); + let crypt_method = u32::from_be_bytes([buf[32], buf[33], buf[34], buf[35]]); + if crypt_method != 0 { + return Err(WkrunError::Device( + "qcow2 encryption is not supported".into(), + )); + } + + let l1_size = u32::from_be_bytes([buf[36], buf[37], buf[38], buf[39]]); + let l1_table_offset = u64::from_be_bytes(buf[40..48].try_into().unwrap()); + let refcount_table_offset = u64::from_be_bytes(buf[48..56].try_into().unwrap()); + let refcount_table_clusters = u32::from_be_bytes([buf[56], buf[57], buf[58], buf[59]]); + + let nb_snapshots = u32::from_be_bytes([buf[60], buf[61], buf[62], buf[63]]); + if nb_snapshots != 0 { + return Err(WkrunError::Device( + "qcow2 snapshots are not supported".into(), + )); + } + + // v3 has refcount_order at offset 96; v2 defaults to 4 (16-bit). + let refcount_order = if version >= 3 { + u32::from_be_bytes([buf[96], buf[97], buf[98], buf[99]]) + } else { + 4 + }; + + Ok(Qcow2Header { + version, + cluster_bits, + size, + l1_size, + l1_table_offset, + refcount_table_offset, + refcount_table_clusters, + refcount_order, + }) + } +} + +/// qcow2 image backend with two-level L1/L2 table navigation. +/// +/// Supports reading and writing existing qcow2 images. New clusters +/// are allocated by appending to the end of the file (append-only). +/// No compression, encryption, snapshots, or backing file support. +struct Qcow2DiskBackend { + file: File, + header: Qcow2Header, + cluster_size: u64, + l2_entries_per_table: u64, + l1_table: Vec, + refcount_table: Vec, + next_free_cluster: u64, + read_only: bool, +} + +impl Qcow2DiskBackend { + /// Open a qcow2 image file and parse its metadata. + fn open(path: &Path, read_only: bool) -> Result { + let mut file = File::options() + .read(true) + .write(!read_only) + .open(path) + .map_err(|e| { + WkrunError::Device(format!( + "failed to open qcow2 disk '{}': {}", + path.display(), + e + )) + })?; + + // Read header. + let mut header_buf = [0u8; 104]; + file.read_exact(&mut header_buf) + .map_err(|e| WkrunError::Device(format!("failed to read qcow2 header: {}", e)))?; + let header = Qcow2Header::parse(&header_buf)?; + + let cluster_size = 1u64 << header.cluster_bits; + let l2_entries_per_table = cluster_size / 8; + + // Read L1 table. + let l1_byte_len = (header.l1_size as usize) * 8; + let mut l1_bytes = vec![0u8; l1_byte_len]; + file.seek(SeekFrom::Start(header.l1_table_offset)) + .map_err(|e| WkrunError::Device(format!("failed to seek to L1 table: {}", e)))?; + file.read_exact(&mut l1_bytes) + .map_err(|e| WkrunError::Device(format!("failed to read L1 table: {}", e)))?; + let l1_table: Vec = l1_bytes + .chunks_exact(8) + .map(|c| u64::from_be_bytes(c.try_into().unwrap())) + .collect(); + + // Read refcount table. + let refcount_entries = (header.refcount_table_clusters as u64 * cluster_size / 8) as usize; + let mut refcount_bytes = vec![0u8; refcount_entries * 8]; + file.seek(SeekFrom::Start(header.refcount_table_offset)) + .map_err(|e| WkrunError::Device(format!("failed to seek to refcount table: {}", e)))?; + file.read_exact(&mut refcount_bytes) + .map_err(|e| WkrunError::Device(format!("failed to read refcount table: {}", e)))?; + let refcount_table: Vec = refcount_bytes + .chunks_exact(8) + .map(|c| u64::from_be_bytes(c.try_into().unwrap())) + .collect(); + + // Determine next free cluster: end of file rounded up to cluster boundary. + let file_len = file + .seek(SeekFrom::End(0)) + .map_err(|e| WkrunError::Device(format!("failed to get qcow2 file size: {}", e)))?; + let next_free_cluster = file_len.div_ceil(cluster_size) * cluster_size; + + Ok(Qcow2DiskBackend { + file, + header, + cluster_size, + l2_entries_per_table, + l1_table, + refcount_table, + next_free_cluster, + read_only, + }) + } + + /// Resolve a guest byte offset to a host file offset. + /// Returns `None` if the cluster is unallocated. + fn resolve_offset(&mut self, guest_offset: u64) -> Result> { + let l1_index = (guest_offset / self.cluster_size / self.l2_entries_per_table) as usize; + let l2_index = ((guest_offset / self.cluster_size) % self.l2_entries_per_table) as usize; + let offset_in_cluster = guest_offset % self.cluster_size; + + if l1_index >= self.l1_table.len() { + return Ok(None); + } + + let l1_entry = self.l1_table[l1_index]; + let l2_table_offset = l1_entry & L2_OFFSET_MASK; + if l2_table_offset == 0 { + return Ok(None); + } + + // Read the L2 entry. + let l2_entry_file_offset = l2_table_offset + (l2_index as u64) * 8; + self.file + .seek(SeekFrom::Start(l2_entry_file_offset)) + .map_err(|e| WkrunError::Device(format!("qcow2: failed to seek L2 entry: {}", e)))?; + let mut entry_buf = [0u8; 8]; + self.file + .read_exact(&mut entry_buf) + .map_err(|e| WkrunError::Device(format!("qcow2: failed to read L2 entry: {}", e)))?; + let l2_entry = u64::from_be_bytes(entry_buf); + + let data_cluster_offset = l2_entry & L2_OFFSET_MASK; + if data_cluster_offset == 0 { + return Ok(None); + } + + Ok(Some(data_cluster_offset + offset_in_cluster)) + } + + /// Allocate a new cluster by appending to the file. + /// Updates refcount for the new cluster. + fn allocate_cluster(&mut self) -> Result { + let offset = self.allocate_raw_cluster()?; + self.set_refcount(offset, 1)?; + Ok(offset) + } + + /// Allocate a new cluster without updating refcounts. + /// Used internally to break recursion when allocating refcount blocks. + fn allocate_raw_cluster(&mut self) -> Result { + let offset = self.next_free_cluster; + let zeros = vec![0u8; self.cluster_size as usize]; + self.file + .seek(SeekFrom::Start(offset)) + .map_err(|e| WkrunError::Device(format!("qcow2: seek for alloc failed: {}", e)))?; + self.file + .write_all(&zeros) + .map_err(|e| WkrunError::Device(format!("qcow2: cluster alloc write failed: {}", e)))?; + self.next_free_cluster = offset + self.cluster_size; + Ok(offset) + } + + /// Set the refcount for a cluster at the given file offset. + /// + /// Navigates the two-level refcount table. If the refcount block + /// is missing, allocates one (using raw allocation to avoid recursion). + fn set_refcount(&mut self, cluster_offset: u64, count: u16) -> Result<()> { + let cluster_index = cluster_offset / self.cluster_size; + let refcount_bits = 1u32 << self.header.refcount_order; + let entries_per_block = self.cluster_size * 8 / refcount_bits as u64; + + let refcount_table_index = (cluster_index / entries_per_block) as usize; + let block_index = cluster_index % entries_per_block; + + if refcount_table_index >= self.refcount_table.len() { + // Refcount table too small — skip for now (append-only images + // with limited allocations rarely hit this). + return Ok(()); + } + + let mut block_offset = self.refcount_table[refcount_table_index]; + if block_offset == 0 { + // Allocate a new refcount block (raw — no recursive refcount update). + block_offset = self.allocate_raw_cluster()?; + self.refcount_table[refcount_table_index] = block_offset; + // Write updated refcount table entry back to disk. + let rt_entry_offset = + self.header.refcount_table_offset + (refcount_table_index as u64) * 8; + self.file + .seek(SeekFrom::Start(rt_entry_offset)) + .map_err(|e| { + WkrunError::Device(format!("qcow2: seek refcount table entry: {}", e)) + })?; + self.file + .write_all(&block_offset.to_be_bytes()) + .map_err(|e| { + WkrunError::Device(format!("qcow2: write refcount table entry: {}", e)) + })?; + } + + // Write the 16-bit refcount entry. + let entry_offset = block_offset + block_index * (refcount_bits as u64 / 8); + self.file + .seek(SeekFrom::Start(entry_offset)) + .map_err(|e| WkrunError::Device(format!("qcow2: seek refcount entry: {}", e)))?; + self.file + .write_all(&count.to_be_bytes()) + .map_err(|e| WkrunError::Device(format!("qcow2: write refcount entry: {}", e)))?; + + Ok(()) + } + + /// Ensure an L2 table exists for the given L1 index. Allocates if needed. + /// Returns the file offset of the L2 table. + fn ensure_l2_table(&mut self, l1_index: usize) -> Result { + let l1_entry = self.l1_table[l1_index]; + let l2_offset = l1_entry & L2_OFFSET_MASK; + if l2_offset != 0 { + return Ok(l2_offset); + } + + // Allocate a new L2 table cluster. + let new_l2_offset = self.allocate_cluster()?; + + // Update in-memory L1 table. + self.l1_table[l1_index] = new_l2_offset; + + // Write L1 entry back to disk. + let l1_entry_file_offset = self.header.l1_table_offset + (l1_index as u64) * 8; + self.file + .seek(SeekFrom::Start(l1_entry_file_offset)) + .map_err(|e| WkrunError::Device(format!("qcow2: seek L1 entry: {}", e)))?; + self.file + .write_all(&new_l2_offset.to_be_bytes()) + .map_err(|e| WkrunError::Device(format!("qcow2: write L1 entry: {}", e)))?; + + Ok(new_l2_offset) + } + + /// Ensure a data cluster exists for the given guest offset. + /// Allocates L2 table and/or data cluster if needed. + /// Returns the host file offset for the data. + fn ensure_data_cluster(&mut self, guest_offset: u64) -> Result { + let l1_index = (guest_offset / self.cluster_size / self.l2_entries_per_table) as usize; + let l2_index = ((guest_offset / self.cluster_size) % self.l2_entries_per_table) as usize; + let offset_in_cluster = guest_offset % self.cluster_size; + + if l1_index >= self.l1_table.len() { + return Err(WkrunError::Device(format!( + "qcow2: guest offset {} exceeds virtual size", + guest_offset + ))); + } + + let l2_table_offset = self.ensure_l2_table(l1_index)?; + + // Read the L2 entry. + let l2_entry_file_offset = l2_table_offset + (l2_index as u64) * 8; + self.file + .seek(SeekFrom::Start(l2_entry_file_offset)) + .map_err(|e| WkrunError::Device(format!("qcow2: seek L2 entry: {}", e)))?; + let mut entry_buf = [0u8; 8]; + self.file + .read_exact(&mut entry_buf) + .map_err(|e| WkrunError::Device(format!("qcow2: read L2 entry: {}", e)))?; + let l2_entry = u64::from_be_bytes(entry_buf); + let data_offset = l2_entry & L2_OFFSET_MASK; + + if data_offset != 0 { + return Ok(data_offset + offset_in_cluster); + } + + // Allocate a new data cluster. + let new_data_offset = self.allocate_cluster()?; + + // Write L2 entry back to disk. + self.file + .seek(SeekFrom::Start(l2_entry_file_offset)) + .map_err(|e| WkrunError::Device(format!("qcow2: seek L2 entry for write: {}", e)))?; + self.file + .write_all(&new_data_offset.to_be_bytes()) + .map_err(|e| WkrunError::Device(format!("qcow2: write L2 entry: {}", e)))?; + + Ok(new_data_offset + offset_in_cluster) + } +} + +impl DiskBackend for Qcow2DiskBackend { + fn read_at(&mut self, offset: u64, buf: &mut [u8]) -> Result<()> { + let mut pos = 0usize; + let mut guest_offset = offset; + + while pos < buf.len() { + let offset_in_cluster = guest_offset % self.cluster_size; + let remaining_in_cluster = (self.cluster_size - offset_in_cluster) as usize; + let chunk_len = remaining_in_cluster.min(buf.len() - pos); + + match self.resolve_offset(guest_offset)? { + Some(host_offset) => { + self.file.seek(SeekFrom::Start(host_offset)).map_err(|e| { + WkrunError::Device(format!("qcow2: read seek failed: {}", e)) + })?; + self.file + .read_exact(&mut buf[pos..pos + chunk_len]) + .map_err(|e| WkrunError::Device(format!("qcow2: read failed: {}", e)))?; + } + None => { + // Unallocated cluster — return zeros. + buf[pos..pos + chunk_len].fill(0); + } + } + + pos += chunk_len; + guest_offset += chunk_len as u64; + } + + Ok(()) + } + + fn write_at(&mut self, offset: u64, buf: &[u8]) -> Result<()> { + if self.read_only { + return Err(WkrunError::Device( + "qcow2: write rejected on read-only disk".into(), + )); + } + + let mut pos = 0usize; + let mut guest_offset = offset; + + while pos < buf.len() { + let offset_in_cluster = guest_offset % self.cluster_size; + let remaining_in_cluster = (self.cluster_size - offset_in_cluster) as usize; + let chunk_len = remaining_in_cluster.min(buf.len() - pos); + + let host_offset = self.ensure_data_cluster(guest_offset)?; + + self.file + .seek(SeekFrom::Start(host_offset)) + .map_err(|e| WkrunError::Device(format!("qcow2: write seek failed: {}", e)))?; + self.file + .write_all(&buf[pos..pos + chunk_len]) + .map_err(|e| WkrunError::Device(format!("qcow2: write failed: {}", e)))?; + + pos += chunk_len; + guest_offset += chunk_len as u64; + } + + Ok(()) + } + + fn flush(&mut self) -> Result<()> { + self.file + .sync_all() + .map_err(|e| WkrunError::Device(format!("qcow2: flush failed: {}", e)))?; + Ok(()) + } + + fn capacity_bytes(&self) -> u64 { + self.header.size + } +} + +// --------------------------------------------------------------------------- +// Factory +// --------------------------------------------------------------------------- + +/// Open a disk backend based on the specified format. +/// +/// - `DISK_FORMAT_RAW` (0): raw file passthrough +/// - `DISK_FORMAT_QCOW2` (1): qcow2 image with copy-on-write +pub fn open_disk_backend( + path: &Path, + format: u32, + read_only: bool, +) -> Result> { + match format { + DISK_FORMAT_RAW => { + let file = File::options() + .read(true) + .write(!read_only) + .open(path) + .map_err(|e| { + WkrunError::Device(format!("failed to open disk '{}': {}", path.display(), e)) + })?; + Ok(Box::new(RawDiskBackend::new(file)?)) + } + DISK_FORMAT_QCOW2 => { + let backend = Qcow2DiskBackend::open(path, read_only)?; + Ok(Box::new(backend)) + } + _ => Err(WkrunError::Device(format!( + "unsupported disk format: {}", + format + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write as IoWrite; + use tempfile::NamedTempFile; + + fn create_raw_file(size: usize) -> NamedTempFile { + let mut f = NamedTempFile::new().unwrap(); + f.write_all(&vec![0u8; size]).unwrap(); + f.flush().unwrap(); + f + } + + fn create_raw_file_with_pattern(sectors: u64) -> NamedTempFile { + let mut f = NamedTempFile::new().unwrap(); + for sector in 0..sectors { + let pattern = vec![(sector & 0xFF) as u8; 512]; + f.write_all(&pattern).unwrap(); + } + f.flush().unwrap(); + f + } + + // --- RawDiskBackend --- + + #[test] + fn test_raw_backend_capacity() { + let tmp = create_raw_file(4096); + let file = File::open(tmp.path()).unwrap(); + let backend = RawDiskBackend::new(file).unwrap(); + assert_eq!(backend.capacity_bytes(), 4096); + } + + #[test] + fn test_raw_backend_empty_file_error() { + let tmp = NamedTempFile::new().unwrap(); + let file = File::open(tmp.path()).unwrap(); + assert!(RawDiskBackend::new(file).is_err()); + } + + #[test] + fn test_raw_backend_read_at() { + let tmp = create_raw_file_with_pattern(4); + let file = File::options() + .read(true) + .write(true) + .open(tmp.path()) + .unwrap(); + let mut backend = RawDiskBackend::new(file).unwrap(); + + let mut buf = [0u8; 512]; + backend.read_at(512 * 2, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0x02)); + } + + #[test] + fn test_raw_backend_write_at() { + let tmp = create_raw_file(2048); + let file = File::options() + .read(true) + .write(true) + .open(tmp.path()) + .unwrap(); + let mut backend = RawDiskBackend::new(file).unwrap(); + + let data = vec![0xABu8; 512]; + backend.write_at(512, &data).unwrap(); + + let mut buf = [0u8; 512]; + backend.read_at(512, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0xAB)); + } + + #[test] + fn test_raw_backend_flush() { + let tmp = create_raw_file(512); + let file = File::options() + .read(true) + .write(true) + .open(tmp.path()) + .unwrap(); + let mut backend = RawDiskBackend::new(file).unwrap(); + backend.flush().unwrap(); + } + + // --- open_disk_backend factory --- + + #[test] + fn test_factory_raw_format() { + let tmp = create_raw_file(1024); + let backend = open_disk_backend(tmp.path(), DISK_FORMAT_RAW, false).unwrap(); + assert_eq!(backend.capacity_bytes(), 1024); + } + + #[test] + fn test_factory_invalid_format() { + let tmp = create_raw_file(1024); + let result = open_disk_backend(tmp.path(), 99, false); + assert!(result.is_err()); + } + + // ----------------------------------------------------------------------- + // qcow2 test helpers + // ----------------------------------------------------------------------- + + /// Create a minimal qcow2 v2 image programmatically. + /// + /// Layout (cluster_size = 512 for small tests): + /// Cluster 0: header + /// Cluster 1: refcount table (1 entry pointing to cluster 2) + /// Cluster 2: refcount block (refcounts for clusters 0..N) + /// Cluster 3: L1 table + /// [Cluster 4+: optional pre-allocated L2 + data] + /// + /// `preallocated` is a list of (guest_byte_offset, data) pairs to + /// write into the image at construction time. + fn create_test_qcow2( + virtual_size: u64, + cluster_bits: u32, + preallocated: &[(u64, &[u8])], + ) -> NamedTempFile { + let cluster_size = 1u64 << cluster_bits; + let l2_entries = cluster_size / 8; + + // Calculate L1 table size. + let l1_entries = virtual_size.div_ceil(cluster_size * l2_entries) as u32; + + // Fixed layout: + // Cluster 0: header + // Cluster 1: refcount table + // Cluster 2: refcount block + // Cluster 3: L1 table (may span multiple clusters but 1 for small tests) + let refcount_table_offset = cluster_size; + let refcount_block_offset = cluster_size * 2; + let l1_table_offset = cluster_size * 3; + let mut next_cluster = cluster_size * 4; // First free cluster. + + // Collect allocations needed for preallocated data. + struct PreallocInfo { + l2_idx: usize, + l2_cluster: u64, + data_cluster: u64, + data: Vec, + data_offset_in_cluster: u64, + } + + let mut l2_clusters: std::collections::HashMap = + std::collections::HashMap::new(); + let mut allocs = Vec::new(); + + for &(guest_offset, data) in preallocated { + let l1_idx = (guest_offset / cluster_size / l2_entries) as usize; + let l2_idx = ((guest_offset / cluster_size) % l2_entries) as usize; + let offset_in_cluster = guest_offset % cluster_size; + + let l2_cluster = *l2_clusters.entry(l1_idx).or_insert_with(|| { + let c = next_cluster; + next_cluster += cluster_size; + c + }); + + let data_cluster = next_cluster; + next_cluster += cluster_size; + + allocs.push(PreallocInfo { + l2_idx, + l2_cluster, + data_cluster, + data: data.to_vec(), + data_offset_in_cluster: offset_in_cluster, + }); + } + + let total_clusters = next_cluster / cluster_size; + let file_size = next_cluster; + + // Build the file. + let mut f = NamedTempFile::new().unwrap(); + let mut image = vec![0u8; file_size as usize]; + + // --- Header (cluster 0) --- + // Magic. + image[0..4].copy_from_slice(&QCOW2_MAGIC.to_be_bytes()); + // Version = 2. + image[4..8].copy_from_slice(&2u32.to_be_bytes()); + // Backing file offset = 0. + image[8..16].copy_from_slice(&0u64.to_be_bytes()); + // Backing file size = 0. + image[16..20].copy_from_slice(&0u32.to_be_bytes()); + // Cluster bits. + image[20..24].copy_from_slice(&cluster_bits.to_be_bytes()); + // Virtual size. + image[24..32].copy_from_slice(&virtual_size.to_be_bytes()); + // Crypt method = 0. + image[32..36].copy_from_slice(&0u32.to_be_bytes()); + // L1 size. + image[36..40].copy_from_slice(&l1_entries.to_be_bytes()); + // L1 table offset. + image[40..48].copy_from_slice(&l1_table_offset.to_be_bytes()); + // Refcount table offset. + image[48..56].copy_from_slice(&refcount_table_offset.to_be_bytes()); + // Refcount table clusters = 1. + image[56..60].copy_from_slice(&1u32.to_be_bytes()); + // Nb snapshots = 0. + image[60..64].copy_from_slice(&0u32.to_be_bytes()); + + // --- Refcount table (cluster 1) --- + // Single entry pointing to refcount block at cluster 2. + let rt_off = refcount_table_offset as usize; + image[rt_off..rt_off + 8].copy_from_slice(&refcount_block_offset.to_be_bytes()); + + // --- Refcount block (cluster 2) --- + // Set refcount=1 for all allocated clusters (16-bit BE entries). + let rb_off = refcount_block_offset as usize; + for i in 0..total_clusters { + let entry_off = rb_off + (i as usize) * 2; + image[entry_off..entry_off + 2].copy_from_slice(&1u16.to_be_bytes()); + } + + // --- L1 table (cluster 3) --- + for (&l1_idx, &l2_cluster) in &l2_clusters { + let entry_off = l1_table_offset as usize + l1_idx * 8; + image[entry_off..entry_off + 8].copy_from_slice(&l2_cluster.to_be_bytes()); + } + + // --- L2 tables + data clusters --- + for alloc in &allocs { + // Write L2 entry. + let l2_entry_off = alloc.l2_cluster as usize + alloc.l2_idx * 8; + image[l2_entry_off..l2_entry_off + 8] + .copy_from_slice(&alloc.data_cluster.to_be_bytes()); + + // Write data. + let data_off = alloc.data_cluster as usize + alloc.data_offset_in_cluster as usize; + let end = data_off + alloc.data.len(); + image[data_off..end].copy_from_slice(&alloc.data); + } + + f.write_all(&image).unwrap(); + f.flush().unwrap(); + f + } + + // ----------------------------------------------------------------------- + // qcow2 header parsing + // ----------------------------------------------------------------------- + + #[test] + fn test_qcow2_header_valid_v2() { + let tmp = create_test_qcow2(1024 * 1024, 16, &[]); + let backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + assert_eq!(backend.header.version, 2); + assert_eq!(backend.header.cluster_bits, 16); + assert_eq!(backend.capacity_bytes(), 1024 * 1024); + } + + #[test] + fn test_qcow2_header_bad_magic() { + let mut tmp = NamedTempFile::new().unwrap(); + let mut data = vec![0u8; 512]; + data[0..4].copy_from_slice(&0xDEADBEEFu32.to_be_bytes()); + tmp.write_all(&data).unwrap(); + tmp.flush().unwrap(); + + let err = Qcow2DiskBackend::open(tmp.path(), false).err().unwrap(); + assert!(err.to_string().contains("bad magic"), "error was: {}", err); + } + + #[test] + fn test_qcow2_header_bad_version() { + let mut tmp = NamedTempFile::new().unwrap(); + let mut data = vec![0u8; 512]; + data[0..4].copy_from_slice(&QCOW2_MAGIC.to_be_bytes()); + data[4..8].copy_from_slice(&1u32.to_be_bytes()); // Version 1. + tmp.write_all(&data).unwrap(); + tmp.flush().unwrap(); + + let err = Qcow2DiskBackend::open(tmp.path(), false).err().unwrap(); + assert!(err.to_string().contains("version"), "error was: {}", err); + } + + #[test] + fn test_qcow2_header_backing_file_rejected() { + let mut tmp = NamedTempFile::new().unwrap(); + let mut data = vec![0u8; 512]; + data[0..4].copy_from_slice(&QCOW2_MAGIC.to_be_bytes()); + data[4..8].copy_from_slice(&2u32.to_be_bytes()); + data[8..16].copy_from_slice(&100u64.to_be_bytes()); // Backing file offset. + tmp.write_all(&data).unwrap(); + tmp.flush().unwrap(); + + let err = Qcow2DiskBackend::open(tmp.path(), false).err().unwrap(); + assert!(err.to_string().contains("backing"), "error was: {}", err); + } + + #[test] + fn test_qcow2_header_encryption_rejected() { + let mut tmp = NamedTempFile::new().unwrap(); + let mut data = vec![0u8; 512]; + data[0..4].copy_from_slice(&QCOW2_MAGIC.to_be_bytes()); + data[4..8].copy_from_slice(&2u32.to_be_bytes()); + data[8..16].copy_from_slice(&0u64.to_be_bytes()); // No backing. + data[20..24].copy_from_slice(&16u32.to_be_bytes()); // cluster_bits. + data[24..32].copy_from_slice(&(1024u64 * 1024).to_be_bytes()); + data[32..36].copy_from_slice(&1u32.to_be_bytes()); // Encrypted! + tmp.write_all(&data).unwrap(); + tmp.flush().unwrap(); + + let err = Qcow2DiskBackend::open(tmp.path(), false).err().unwrap(); + assert!(err.to_string().contains("encryption"), "error was: {}", err); + } + + // ----------------------------------------------------------------------- + // qcow2 reads + // ----------------------------------------------------------------------- + + #[test] + fn test_qcow2_read_unallocated_returns_zeros() { + // 1MB image with no preallocated data, cluster_bits=9 (512B clusters). + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + let mut buf = [0xFFu8; 512]; + backend.read_at(0, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0)); + } + + #[test] + fn test_qcow2_read_allocated_cluster() { + let pattern = vec![0xABu8; 128]; + let tmp = create_test_qcow2(1024 * 1024, 9, &[(512, &pattern)]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + let mut buf = [0u8; 128]; + backend.read_at(512, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0xAB)); + } + + #[test] + fn test_qcow2_read_cross_cluster_boundary() { + // Two adjacent clusters with different data. + let data0 = vec![0x11u8; 512]; + let data1 = vec![0x22u8; 512]; + let tmp = create_test_qcow2(1024 * 1024, 9, &[(0, &data0), (512, &data1)]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + // Read 256 bytes spanning the boundary (last 128 of cluster 0 + first 128 of cluster 1). + let mut buf = [0u8; 256]; + backend.read_at(384, &mut buf).unwrap(); + assert!(buf[..128].iter().all(|&b| b == 0x11)); + assert!(buf[128..].iter().all(|&b| b == 0x22)); + } + + #[test] + fn test_qcow2_capacity() { + let tmp = create_test_qcow2(2 * 1024 * 1024, 16, &[]); + let backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + assert_eq!(backend.capacity_bytes(), 2 * 1024 * 1024); + } + + // ----------------------------------------------------------------------- + // qcow2 writes + // ----------------------------------------------------------------------- + + #[test] + fn test_qcow2_write_allocates_cluster() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + let data = vec![0xCDu8; 256]; + backend.write_at(0, &data).unwrap(); + + let mut buf = [0u8; 256]; + backend.read_at(0, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0xCD)); + } + + #[test] + fn test_qcow2_write_read_roundtrip() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + // Write different patterns at different offsets. + backend.write_at(0, &[0x11; 512]).unwrap(); + backend.write_at(512, &[0x22; 512]).unwrap(); + backend.write_at(1024, &[0x33; 512]).unwrap(); + + let mut buf0 = [0u8; 512]; + let mut buf1 = [0u8; 512]; + let mut buf2 = [0u8; 512]; + backend.read_at(0, &mut buf0).unwrap(); + backend.read_at(512, &mut buf1).unwrap(); + backend.read_at(1024, &mut buf2).unwrap(); + + assert!(buf0.iter().all(|&b| b == 0x11)); + assert!(buf1.iter().all(|&b| b == 0x22)); + assert!(buf2.iter().all(|&b| b == 0x33)); + } + + #[test] + fn test_qcow2_write_partial_cluster() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + // Write 100 bytes in the middle of cluster 0. + backend.write_at(200, &[0xBB; 100]).unwrap(); + + // Verify: first 200 bytes = zeros, next 100 = 0xBB, rest = zeros. + let mut buf = [0u8; 512]; + backend.read_at(0, &mut buf).unwrap(); + assert!(buf[..200].iter().all(|&b| b == 0x00)); + assert!(buf[200..300].iter().all(|&b| b == 0xBB)); + assert!(buf[300..].iter().all(|&b| b == 0x00)); + } + + #[test] + fn test_qcow2_write_cross_cluster_boundary() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + // Write 256 bytes spanning cluster boundary (cluster_size=512). + let data = vec![0xEE; 256]; + backend.write_at(384, &data).unwrap(); + + let mut buf = [0u8; 256]; + backend.read_at(384, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0xEE)); + + // Verify untouched parts. + let mut before = [0u8; 384]; + backend.read_at(0, &mut before).unwrap(); + assert!(before.iter().all(|&b| b == 0x00)); + + let mut after = [0u8; 128]; + backend.read_at(640, &mut after).unwrap(); + assert!(after.iter().all(|&b| b == 0x00)); + } + + #[test] + fn test_qcow2_write_same_cluster_no_realloc() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + backend.write_at(0, &[0x11; 256]).unwrap(); + let free_before = backend.next_free_cluster; + + // Write again to the same cluster — should not allocate new clusters. + backend.write_at(256, &[0x22; 256]).unwrap(); + assert_eq!(backend.next_free_cluster, free_before); + + // Verify both writes persisted. + let mut buf = [0u8; 512]; + backend.read_at(0, &mut buf).unwrap(); + assert!(buf[..256].iter().all(|&b| b == 0x11)); + assert!(buf[256..].iter().all(|&b| b == 0x22)); + } + + #[test] + fn test_qcow2_l2_table_allocation() { + // Use cluster_bits=9 (512B), virtual_size=1MB. + // L2 entries per table = 512/8 = 64. + // So each L1 entry covers 64*512 = 32768 bytes. + // Writing at offset 32768 requires L1 index=1 (new L2 table). + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + + let data = vec![0xAA; 512]; + backend.write_at(32768, &data).unwrap(); + + let mut buf = [0u8; 512]; + backend.read_at(32768, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0xAA)); + } + + #[test] + fn test_qcow2_read_only_rejects_writes() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), true).unwrap(); + + let result = backend.write_at(0, &[0x11; 512]); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("read-only"), "error was: {}", err); + } + + #[test] + fn test_qcow2_flush() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = Qcow2DiskBackend::open(tmp.path(), false).unwrap(); + backend.write_at(0, &[0x42; 512]).unwrap(); + backend.flush().unwrap(); + } + + // ----------------------------------------------------------------------- + // Factory: qcow2 dispatch + // ----------------------------------------------------------------------- + + #[test] + fn test_factory_qcow2_format() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let mut backend = open_disk_backend(tmp.path(), DISK_FORMAT_QCOW2, false).unwrap(); + assert_eq!(backend.capacity_bytes(), 1024 * 1024); + + // Write + read through the factory-created backend. + backend.write_at(0, &[0x99; 512]).unwrap(); + let mut buf = [0u8; 512]; + backend.read_at(0, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0x99)); + } +} diff --git a/src/vmm/src/windows/devices/virtio/mmio.rs b/src/vmm/src/windows/devices/virtio/mmio.rs new file mode 100644 index 000000000..45f302889 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/mmio.rs @@ -0,0 +1,663 @@ +//! Virtio-MMIO transport (virtio spec v1.2 Section 4.2). +//! +//! Register file at a memory-mapped I/O address. The guest accesses +//! device registers via MMIO reads/writes which trigger VM exits. + +use super::queue::{GuestMemoryAccessor, Virtqueue}; + +/// MMIO base address for the first virtio device. +/// Placed above guest RAM (256MB) and below the 4GB identity map. +pub const VIRTIO_MMIO_BASE: u64 = 0xD000_0000; + +/// Size of the MMIO register region (512 bytes covers all registers + config). +pub const VIRTIO_MMIO_SIZE: u64 = 0x200; + +// Virtio-MMIO register offsets (virtio spec 4.2.2). +const MAGIC_VALUE: u64 = 0x000; +const VERSION: u64 = 0x004; +const DEVICE_ID: u64 = 0x008; +const VENDOR_ID: u64 = 0x00C; +const DEVICE_FEATURES: u64 = 0x010; +const DEVICE_FEATURES_SEL: u64 = 0x014; +const DRIVER_FEATURES: u64 = 0x020; +const DRIVER_FEATURES_SEL: u64 = 0x024; +const QUEUE_SEL: u64 = 0x030; +const QUEUE_NUM_MAX: u64 = 0x034; +const QUEUE_NUM: u64 = 0x038; +const QUEUE_READY: u64 = 0x044; +const QUEUE_NOTIFY: u64 = 0x050; +const INTERRUPT_STATUS: u64 = 0x060; +const INTERRUPT_ACK: u64 = 0x064; +const STATUS: u64 = 0x070; +const QUEUE_DESC_LOW: u64 = 0x080; +const QUEUE_DESC_HIGH: u64 = 0x084; +const QUEUE_AVAIL_LOW: u64 = 0x090; +const QUEUE_AVAIL_HIGH: u64 = 0x094; +const QUEUE_USED_LOW: u64 = 0x0A0; +const QUEUE_USED_HIGH: u64 = 0x0A4; +const CONFIG_GENERATION: u64 = 0x0FC; +const CONFIG_SPACE: u64 = 0x100; + +// Virtio device status bits (virtio spec 2.1) — used in tests. +#[cfg(test)] +const STATUS_ACK: u32 = 1; +#[cfg(test)] +const STATUS_DRIVER: u32 = 2; +#[cfg(test)] +const STATUS_FEATURES_OK: u32 = 8; +#[cfg(test)] +const STATUS_DRIVER_OK: u32 = 4; + +/// Magic value identifying a virtio-MMIO device ("virt" in little-endian). +const VIRTIO_MMIO_MAGIC: u32 = 0x7472_6976; + +/// Virtio-MMIO version (2 = virtio 1.0+). +const VIRTIO_MMIO_VERSION: u32 = 2; + +/// Vendor ID (0 = no vendor). +const VIRTIO_VENDOR_ID: u32 = 0; + +// Interrupt status bits. +const INTERRUPT_USED_RING: u32 = 1; + +/// Backend trait that specific virtio devices implement. +pub trait VirtioDeviceBackend { + /// Virtio device ID (e.g., 2 for block). + fn device_id(&self) -> u32; + + /// Return device feature bits for the given feature page (0 or 1). + fn device_features(&self, page: u32) -> u32; + + /// Read a 32-bit value from the device config space at the given offset. + fn read_config(&self, offset: u64) -> u32; + + /// Handle a queue notification (guest made buffers available). + /// + /// Returns `true` if the device processed buffers and an interrupt + /// should be raised. + fn queue_notify( + &mut self, + queue_idx: u32, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool; + + /// Number of virtqueues this device uses. + fn num_queues(&self) -> usize; + + /// Maximum queue size for the given queue index. + fn queue_max_size(&self, queue_idx: u32) -> u16; + + /// Poll for host-initiated events (e.g., incoming network/vsock data). + /// + /// Called from the vCPU run loop. Returns `true` if an interrupt + /// should be raised (device placed data in the used ring). + /// Default: no host-initiated events (suitable for block devices). + fn poll(&mut self, _queues: &mut [Virtqueue], _mem: &dyn GuestMemoryAccessor) -> bool { + false + } +} + +/// Virtio-MMIO device wrapping a backend. +pub struct VirtioMmioDevice { + backend: D, + queues: Vec, + /// Currently selected queue index (via QUEUE_SEL). + queue_sel: u32, + /// Device status register. + status: u32, + /// Device feature selection page. + device_features_sel: u32, + /// Driver feature selection page. + driver_features_sel: u32, + /// Driver-acknowledged feature bits (page 0 and page 1). + driver_features: [u32; 2], + /// Interrupt status register. + interrupt_status: u32, +} + +impl VirtioMmioDevice { + /// Create a new MMIO device wrapping the given backend. + pub fn new(backend: D) -> Self { + let num_queues = backend.num_queues(); + let mut queues = Vec::with_capacity(num_queues); + for i in 0..num_queues { + queues.push(Virtqueue::new(backend.queue_max_size(i as u32))); + } + + VirtioMmioDevice { + backend, + queues, + queue_sel: 0, + status: 0, + device_features_sel: 0, + driver_features_sel: 0, + driver_features: [0; 2], + interrupt_status: 0, + } + } + + /// Get a reference to the backend. + pub fn backend(&self) -> &D { + &self.backend + } + + /// Get a mutable reference to the backend. + pub fn backend_mut(&mut self) -> &mut D { + &mut self.backend + } + + /// Get the current interrupt status (non-zero = interrupt pending). + pub fn interrupt_status(&self) -> u32 { + self.interrupt_status + } + + /// Handle an MMIO read at the given offset from the device base. + pub fn read(&self, offset: u64, size: u8) -> u32 { + // All MMIO register reads are 32-bit in virtio-MMIO v2. + if size != 4 && offset < CONFIG_SPACE { + return 0; + } + + match offset { + MAGIC_VALUE => VIRTIO_MMIO_MAGIC, + VERSION => VIRTIO_MMIO_VERSION, + DEVICE_ID => self.backend.device_id(), + VENDOR_ID => VIRTIO_VENDOR_ID, + DEVICE_FEATURES => self.backend.device_features(self.device_features_sel), + QUEUE_NUM_MAX => { + if let Some(q) = self.current_queue() { + q.max_size() as u32 + } else { + 0 + } + } + QUEUE_READY => { + if let Some(q) = self.current_queue() { + q.is_ready() as u32 + } else { + 0 + } + } + INTERRUPT_STATUS => self.interrupt_status, + STATUS => self.status, + CONFIG_GENERATION => 0, // Config doesn't change dynamically. + off if off >= CONFIG_SPACE => self.backend.read_config(off - CONFIG_SPACE), + _ => 0, + } + } + + /// Handle an MMIO write at the given offset from the device base. + /// + /// `mem` is needed for queue_notify to process descriptor chains. + /// Returns `true` if an interrupt should be raised. + pub fn write( + &mut self, + offset: u64, + value: u32, + size: u8, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + // All MMIO register writes are 32-bit in virtio-MMIO v2. + if size != 4 { + return false; + } + + match offset { + DEVICE_FEATURES_SEL => { + self.device_features_sel = value; + } + DRIVER_FEATURES => { + let sel = self.driver_features_sel as usize; + if sel < self.driver_features.len() { + self.driver_features[sel] = value; + } + } + DRIVER_FEATURES_SEL => { + self.driver_features_sel = value; + } + QUEUE_SEL => { + self.queue_sel = value; + } + QUEUE_NUM => { + if let Some(q) = self.current_queue_mut() { + q.set_size(value as u16); + } + } + QUEUE_READY => { + if let Some(q) = self.current_queue_mut() { + q.set_ready(value == 1); + } + } + QUEUE_NOTIFY => { + return self.handle_queue_notify(value, mem); + } + INTERRUPT_ACK => { + self.interrupt_status &= !value; + } + STATUS => { + self.handle_status_write(value); + } + QUEUE_DESC_LOW => { + if let Some(q) = self.current_queue_mut() { + let high = 0u64; // Will be combined in set_desc_table. + q.set_desc_table(value as u64 | high); + } + } + QUEUE_DESC_HIGH => { + // High bits for descriptor table address (typically 0 for < 4GB). + } + QUEUE_AVAIL_LOW => { + if let Some(q) = self.current_queue_mut() { + q.set_avail_ring(value as u64); + } + } + QUEUE_AVAIL_HIGH => { + // High bits for avail ring address (typically 0). + } + QUEUE_USED_LOW => { + if let Some(q) = self.current_queue_mut() { + q.set_used_ring(value as u64); + } + } + QUEUE_USED_HIGH => { + // High bits for used ring address (typically 0). + } + _ => {} + } + false + } + + /// Poll the backend for host-initiated events. + /// + /// Returns `true` if an interrupt should be raised. + pub fn poll(&mut self, mem: &dyn GuestMemoryAccessor) -> bool { + let raised = self.backend.poll(&mut self.queues, mem); + if raised { + self.interrupt_status |= INTERRUPT_USED_RING; + } + raised + } + + fn current_queue(&self) -> Option<&Virtqueue> { + self.queues.get(self.queue_sel as usize) + } + + fn current_queue_mut(&mut self) -> Option<&mut Virtqueue> { + self.queues.get_mut(self.queue_sel as usize) + } + + fn handle_queue_notify(&mut self, queue_idx: u32, mem: &dyn GuestMemoryAccessor) -> bool { + let idx = queue_idx as usize; + if idx >= self.queues.len() { + return false; + } + + // Split borrow: take queue out, call backend, put it back. + let raised = self + .backend + .queue_notify(queue_idx, &mut self.queues[idx], mem); + + if raised { + self.interrupt_status |= INTERRUPT_USED_RING; + } + + raised + } + + fn handle_status_write(&mut self, value: u32) { + if value == 0 { + // Device reset. + self.status = 0; + self.queue_sel = 0; + self.interrupt_status = 0; + self.device_features_sel = 0; + self.driver_features_sel = 0; + self.driver_features = [0; 2]; + for q in &mut self.queues { + q.reset(); + } + return; + } + // Status can only be set by ORing new bits in. + self.status = value; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::queue::GuestMemoryAccessor; + use super::super::super::error::Result; + use std::cell::RefCell; + + /// Null backend for testing the MMIO transport layer. + struct NullBackend; + + impl VirtioDeviceBackend for NullBackend { + fn device_id(&self) -> u32 { + 0 // Invalid/null device. + } + fn device_features(&self, _page: u32) -> u32 { + 0 + } + fn read_config(&self, _offset: u64) -> u32 { + 0 + } + fn queue_notify( + &mut self, + _queue_idx: u32, + _queue: &mut Virtqueue, + _mem: &dyn GuestMemoryAccessor, + ) -> bool { + false + } + fn num_queues(&self) -> usize { + 1 + } + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + 256 + } + } + + /// Test backend that tracks notifications. + struct TestBackend { + notify_count: RefCell, + } + + impl TestBackend { + fn new() -> Self { + TestBackend { + notify_count: RefCell::new(0), + } + } + } + + impl VirtioDeviceBackend for TestBackend { + fn device_id(&self) -> u32 { + 2 // Block device. + } + fn device_features(&self, page: u32) -> u32 { + if page == 0 { + 0x1234 + } else { + 0 + } + } + fn read_config(&self, offset: u64) -> u32 { + if offset == 0 { + 1024 + } else { + 0 + } // Capacity low. + } + fn queue_notify( + &mut self, + _queue_idx: u32, + _queue: &mut Virtqueue, + _mem: &dyn GuestMemoryAccessor, + ) -> bool { + *self.notify_count.borrow_mut() += 1; + true // Raise interrupt. + } + fn num_queues(&self) -> usize { + 1 + } + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + 128 + } + } + + struct MockMem(RefCell>); + impl MockMem { + fn new(size: usize) -> Self { + MockMem(RefCell::new(vec![0u8; size])) + } + } + impl GuestMemoryAccessor for MockMem { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { + let a = addr as usize; + let data = self.0.borrow(); + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { + let a = addr as usize; + let mut mem = self.0.borrow_mut(); + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + // --- Magic and identification --- + + #[test] + fn test_magic_value() { + let dev = VirtioMmioDevice::new(NullBackend); + assert_eq!(dev.read(MAGIC_VALUE, 4), VIRTIO_MMIO_MAGIC); + } + + #[test] + fn test_version() { + let dev = VirtioMmioDevice::new(NullBackend); + assert_eq!(dev.read(VERSION, 4), 2); + } + + #[test] + fn test_device_id() { + let dev = VirtioMmioDevice::new(TestBackend::new()); + assert_eq!(dev.read(DEVICE_ID, 4), 2); // Block device. + } + + #[test] + fn test_vendor_id() { + let dev = VirtioMmioDevice::new(NullBackend); + assert_eq!(dev.read(VENDOR_ID, 4), 0); + } + + // --- Device features --- + + #[test] + fn test_device_features_page0() { + let mut dev = VirtioMmioDevice::new(TestBackend::new()); + let mem = MockMem::new(64); + dev.write(DEVICE_FEATURES_SEL, 0, 4, &mem); + assert_eq!(dev.read(DEVICE_FEATURES, 4), 0x1234); + } + + #[test] + fn test_device_features_page1() { + let mut dev = VirtioMmioDevice::new(TestBackend::new()); + let mem = MockMem::new(64); + dev.write(DEVICE_FEATURES_SEL, 1, 4, &mem); + assert_eq!(dev.read(DEVICE_FEATURES, 4), 0); + } + + // --- Queue configuration --- + + #[test] + fn test_queue_max_size() { + let dev = VirtioMmioDevice::new(TestBackend::new()); + assert_eq!(dev.read(QUEUE_NUM_MAX, 4), 128); + } + + #[test] + fn test_queue_ready() { + let mut dev = VirtioMmioDevice::new(NullBackend); + let mem = MockMem::new(64); + assert_eq!(dev.read(QUEUE_READY, 4), 0); + dev.write(QUEUE_READY, 1, 4, &mem); + assert_eq!(dev.read(QUEUE_READY, 4), 1); + } + + // --- Status state machine --- + + #[test] + fn test_status_ack() { + let mut dev = VirtioMmioDevice::new(NullBackend); + let mem = MockMem::new(64); + assert_eq!(dev.read(STATUS, 4), 0); + dev.write(STATUS, STATUS_ACK, 4, &mem); + assert_eq!(dev.read(STATUS, 4), STATUS_ACK); + } + + #[test] + fn test_status_progression() { + let mut dev = VirtioMmioDevice::new(NullBackend); + let mem = MockMem::new(64); + dev.write(STATUS, STATUS_ACK, 4, &mem); + dev.write(STATUS, STATUS_ACK | STATUS_DRIVER, 4, &mem); + dev.write( + STATUS, + STATUS_ACK | STATUS_DRIVER | STATUS_FEATURES_OK, + 4, + &mem, + ); + dev.write( + STATUS, + STATUS_ACK | STATUS_DRIVER | STATUS_FEATURES_OK | STATUS_DRIVER_OK, + 4, + &mem, + ); + assert_eq!( + dev.read(STATUS, 4), + STATUS_ACK | STATUS_DRIVER | STATUS_FEATURES_OK | STATUS_DRIVER_OK + ); + } + + #[test] + fn test_status_reset() { + let mut dev = VirtioMmioDevice::new(NullBackend); + let mem = MockMem::new(64); + dev.write(STATUS, STATUS_ACK | STATUS_DRIVER, 4, &mem); + assert_ne!(dev.read(STATUS, 4), 0); + dev.write(STATUS, 0, 4, &mem); // Reset. + assert_eq!(dev.read(STATUS, 4), 0); + } + + // --- Interrupt handling --- + + #[test] + fn test_interrupt_on_notify() { + let mut dev = VirtioMmioDevice::new(TestBackend::new()); + let mem = MockMem::new(64); + + assert_eq!(dev.read(INTERRUPT_STATUS, 4), 0); + + // Notify queue 0. + let raised = dev.write(QUEUE_NOTIFY, 0, 4, &mem); + assert!(raised); + assert_eq!(dev.read(INTERRUPT_STATUS, 4), INTERRUPT_USED_RING); + } + + #[test] + fn test_interrupt_ack() { + let mut dev = VirtioMmioDevice::new(TestBackend::new()); + let mem = MockMem::new(64); + + dev.write(QUEUE_NOTIFY, 0, 4, &mem); + assert_eq!(dev.read(INTERRUPT_STATUS, 4), INTERRUPT_USED_RING); + + // Acknowledge the interrupt. + dev.write(INTERRUPT_ACK, INTERRUPT_USED_RING, 4, &mem); + assert_eq!(dev.read(INTERRUPT_STATUS, 4), 0); + } + + // --- Config space --- + + #[test] + fn test_config_space_read() { + let dev = VirtioMmioDevice::new(TestBackend::new()); + // Offset 0x100 = config space offset 0 → capacity low = 1024. + assert_eq!(dev.read(CONFIG_SPACE, 4), 1024); + } + + // --- Non-32-bit access --- + + #[test] + fn test_non_32bit_read_returns_zero() { + let dev = VirtioMmioDevice::new(NullBackend); + // Reading magic with size != 4 should return 0. + assert_eq!(dev.read(MAGIC_VALUE, 1), 0); + assert_eq!(dev.read(MAGIC_VALUE, 2), 0); + } + + #[test] + fn test_non_32bit_write_ignored() { + let mut dev = VirtioMmioDevice::new(NullBackend); + let mem = MockMem::new(64); + dev.write(STATUS, STATUS_ACK, 2, &mem); // Wrong size. + assert_eq!(dev.read(STATUS, 4), 0); // Should be unchanged. + } + + // --- Invalid queue selection --- + + #[test] + fn test_invalid_queue_sel() { + let mut dev = VirtioMmioDevice::new(NullBackend); + let mem = MockMem::new(64); + dev.write(QUEUE_SEL, 99, 4, &mem); + assert_eq!(dev.read(QUEUE_NUM_MAX, 4), 0); // No such queue. + } + + // --- Poll --- + + #[test] + fn test_poll_default_returns_false() { + let mut dev = VirtioMmioDevice::new(NullBackend); + let mem = MockMem::new(64); + assert!(!dev.poll(&mem)); + assert_eq!(dev.interrupt_status(), 0); + } + + /// Backend that returns true from poll(). + struct PollBackend; + + impl VirtioDeviceBackend for PollBackend { + fn device_id(&self) -> u32 { + 19 + } + fn device_features(&self, _page: u32) -> u32 { + 0 + } + fn read_config(&self, _offset: u64) -> u32 { + 0 + } + fn queue_notify( + &mut self, + _queue_idx: u32, + _queue: &mut Virtqueue, + _mem: &dyn GuestMemoryAccessor, + ) -> bool { + false + } + fn num_queues(&self) -> usize { + 1 + } + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + 128 + } + fn poll(&mut self, _queues: &mut [Virtqueue], _mem: &dyn GuestMemoryAccessor) -> bool { + true + } + } + + #[test] + fn test_poll_sets_interrupt_status() { + let mut dev = VirtioMmioDevice::new(PollBackend); + let mem = MockMem::new(64); + let raised = dev.poll(&mem); + assert!(raised); + assert_eq!(dev.interrupt_status(), INTERRUPT_USED_RING); + } + + #[test] + fn test_poll_interrupt_can_be_acked() { + let mut dev = VirtioMmioDevice::new(PollBackend); + let mem = MockMem::new(64); + dev.poll(&mem); + assert_eq!(dev.interrupt_status(), INTERRUPT_USED_RING); + dev.write(INTERRUPT_ACK, INTERRUPT_USED_RING, 4, &mem); + assert_eq!(dev.interrupt_status(), 0); + } +} diff --git a/src/vmm/src/windows/devices/virtio/mod.rs b/src/vmm/src/windows/devices/virtio/mod.rs new file mode 100644 index 000000000..b3be92879 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/mod.rs @@ -0,0 +1,16 @@ +//! Virtio device emulation. +//! +//! Implements the virtio specification (v1.2) over the MMIO transport +//! for paravirtualized device I/O. Currently supports: +//! - virtio-blk: block device (file-backed disk) +//! - virtio-vsock: socket transport (host TCP <-> guest AF_VSOCK) +//! - virtio-9p: filesystem sharing (host directory <-> guest 9P mount) +//! - virtio-net: network device (userspace proxy via passt/gvproxy) + +pub mod block; +pub mod disk; +pub mod mmio; +pub mod net; +pub mod p9; +pub mod queue; +pub mod vsock; diff --git a/src/vmm/src/windows/devices/virtio/net.rs b/src/vmm/src/windows/devices/virtio/net.rs new file mode 100644 index 000000000..b7c63ab9b --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/net.rs @@ -0,0 +1,893 @@ +//! Virtio-net device backend (virtio spec v1.2 Section 5.1). +//! +//! Provides a network device backed by a userspace networking proxy +//! (passt/gvproxy) via a stream socket. The wire protocol uses +//! length-prefixed Ethernet frames: `[4-byte BE length][frame bytes]`. +//! +//! Queue layout: +//! Queue 0 (RX): host -> guest (device writes, guest reads) +//! Queue 1 (TX): guest -> host (guest writes, device reads) + +use std::collections::VecDeque; +use std::io::{self, Read, Write}; + +use super::mmio::VirtioDeviceBackend; +use super::queue::{GuestMemoryAccessor, Virtqueue}; + +/// Virtio device ID for network devices. +const VIRTIO_NET_ID: u32 = 1; + +/// VIRTIO_NET_F_MAC — device has given MAC address (bit 5). +const VIRTIO_NET_F_MAC: u32 = 5; + +/// VIRTIO_NET_F_STATUS — device provides link status (bit 16). +const VIRTIO_NET_F_STATUS: u32 = 16; + +/// VIRTIO_F_VERSION_1 — bit 32 (page 1, bit 0). +const VIRTIO_F_VERSION_1_BIT: u32 = 0; + +/// Number of queues: RX and TX (no control queue). +const NUM_QUEUES: usize = 2; + +/// Queue index constants. +const RX_QUEUE: usize = 0; +const TX_QUEUE: usize = 1; + +/// Maximum queue size. +const QUEUE_MAX_SIZE: u16 = 256; + +/// Size of struct virtio_net_hdr_v1 in bytes. +const VIRTIO_NET_HDR_SIZE: usize = 12; + +/// Network link status: up. +const VIRTIO_NET_S_LINK_UP: u16 = 1; + +/// Transport trait for pluggable networking backends. +/// +/// Both Unix socket and TCP transports use the passt/gvproxy wire +/// protocol: each frame is `[4-byte big-endian length][frame bytes]`. +pub trait NetTransport: Send { + /// Try to receive a complete Ethernet frame. Returns `None` if no + /// complete frame is available (non-blocking). + fn recv_frame(&mut self) -> Option>; + + /// Send an Ethernet frame, length-prefixed. + fn send_frame(&mut self, frame: &[u8]) -> io::Result<()>; +} + +/// Receive state machine for length-prefixed framing. +enum RecvState { + /// Waiting for the 4-byte length header; `bytes_read` bytes read so far. + LenPending { bytes_read: usize, buf: [u8; 4] }, + /// Length header complete, reading `frame_len` bytes of frame body. + BodyPending { + frame_len: usize, + buf: Vec, + bytes_read: usize, + }, +} + +impl Default for RecvState { + fn default() -> Self { + RecvState::LenPending { + bytes_read: 0, + buf: [0u8; 4], + } + } +} + +/// Unix stream socket transport (macOS/Linux). +#[cfg(unix)] +pub struct UnixStreamTransport { + stream: std::os::unix::net::UnixStream, + state: RecvState, +} + +#[cfg(unix)] +impl UnixStreamTransport { + /// Wrap a non-blocking Unix stream socket. + pub fn new(stream: std::os::unix::net::UnixStream) -> io::Result { + stream.set_nonblocking(true)?; + Ok(UnixStreamTransport { + stream, + state: RecvState::default(), + }) + } +} + +#[cfg(unix)] +impl NetTransport for UnixStreamTransport { + fn recv_frame(&mut self) -> Option> { + recv_frame_from(&mut self.stream, &mut self.state) + } + + fn send_frame(&mut self, frame: &[u8]) -> io::Result<()> { + send_frame_to(&mut self.stream, frame) + } +} + +/// TCP transport (cross-platform, used on Windows). +pub struct TcpTransport { + stream: std::net::TcpStream, + state: RecvState, +} + +impl TcpTransport { + /// Wrap a non-blocking TCP stream. + pub fn new(stream: std::net::TcpStream) -> io::Result { + stream.set_nonblocking(true)?; + Ok(TcpTransport { + stream, + state: RecvState::default(), + }) + } +} + +impl NetTransport for TcpTransport { + fn recv_frame(&mut self) -> Option> { + recv_frame_from(&mut self.stream, &mut self.state) + } + + fn send_frame(&mut self, frame: &[u8]) -> io::Result<()> { + send_frame_to(&mut self.stream, frame) + } +} + +/// Shared recv implementation using the state machine. +fn recv_frame_from(reader: &mut R, state: &mut RecvState) -> Option> { + loop { + match state { + RecvState::LenPending { bytes_read, buf } => { + match reader.read(&mut buf[*bytes_read..]) { + Ok(0) => return None, // EOF + Ok(n) => { + *bytes_read += n; + if *bytes_read == 4 { + let frame_len = u32::from_be_bytes(*buf) as usize; + if frame_len == 0 || frame_len > 65536 { + // Invalid frame, reset. + *state = RecvState::default(); + return None; + } + *state = RecvState::BodyPending { + frame_len, + buf: vec![0u8; frame_len], + bytes_read: 0, + }; + // Continue loop to read body. + } + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => return None, + Err(_) => return None, + } + } + RecvState::BodyPending { + frame_len, + buf, + bytes_read, + } => { + match reader.read(&mut buf[*bytes_read..]) { + Ok(0) => return None, // EOF + Ok(n) => { + *bytes_read += n; + if *bytes_read == *frame_len { + let frame = std::mem::take(buf); + *state = RecvState::default(); + return Some(frame); + } + // Continue loop to read more. + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => return None, + Err(_) => return None, + } + } + } + } +} + +/// Shared send implementation: 4-byte BE length + frame bytes. +fn send_frame_to(writer: &mut W, frame: &[u8]) -> io::Result<()> { + let len = (frame.len() as u32).to_be_bytes(); + writer.write_all(&len)?; + writer.write_all(frame)?; + Ok(()) +} + +/// Generate a MAC address deterministically from a seed. +/// +/// The first three bytes are `52:54:00` (QEMU/KVM OUI prefix). +/// The remaining bytes are derived from `seed`. +pub fn generate_mac(seed: u32) -> [u8; 6] { + let b = seed.to_le_bytes(); + [0x52, 0x54, 0x00, b[0], b[1], b[2]] +} + +/// Virtio-net device backed by a userspace networking proxy. +pub struct VirtioNet { + /// MAC address exposed to the guest. + mac: [u8; 6], + /// Network transport (socket to passt/gvproxy). + transport: Option>, + /// Frames waiting for RX queue space. + rx_pending: VecDeque>, +} + +impl VirtioNet { + /// Create a new virtio-net device with the given MAC and transport. + pub fn new(mac: [u8; 6], transport: Option>) -> Self { + VirtioNet { + mac, + transport, + rx_pending: VecDeque::new(), + } + } + + /// Get the MAC address. + pub fn mac(&self) -> &[u8; 6] { + &self.mac + } + + /// Process the TX queue: read frames from guest, send to transport. + fn process_tx(&mut self, queue: &mut Virtqueue, mem: &dyn GuestMemoryAccessor) -> bool { + let mut processed = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + }; + + if chain.is_empty() { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + + // Collect all data from device-readable descriptors. + let mut data = Vec::new(); + for desc in &chain { + if !desc.is_write() { + let mut buf = vec![0u8; desc.len as usize]; + if mem.read_at(desc.addr, &mut buf).is_ok() { + data.extend_from_slice(&buf); + } + } + } + + // First VIRTIO_NET_HDR_SIZE bytes are the virtio_net_hdr — strip it. + if data.len() > VIRTIO_NET_HDR_SIZE { + let frame = &data[VIRTIO_NET_HDR_SIZE..]; + if let Some(ref mut transport) = self.transport { + let _ = transport.send_frame(frame); + } + } + + let _ = queue.add_used(head, 0, mem); + processed = true; + } + + processed + } + + /// Inject pending frames into the RX queue. + fn inject_rx(&mut self, rx_queue: &mut Virtqueue, mem: &dyn GuestMemoryAccessor) -> bool { + let mut injected = false; + + while !self.rx_pending.is_empty() { + let head = match rx_queue.pop_avail(mem) { + Ok(Some(h)) => h, + _ => break, // No available RX buffers. + }; + + let chain = match rx_queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + let _ = rx_queue.add_used(head, 0, mem); + injected = true; + continue; + } + }; + + let frame = self.rx_pending.pop_front().unwrap(); + + // Prepend a zero virtio_net_hdr. + let hdr = [0u8; VIRTIO_NET_HDR_SIZE]; + let total_data: Vec = hdr.iter().chain(frame.iter()).copied().collect(); + + let mut offset = 0; + let mut total_written = 0u32; + for desc in &chain { + if !desc.is_write() { + continue; + } + let remaining = total_data.len().saturating_sub(offset); + let to_write = remaining.min(desc.len as usize); + if to_write > 0 { + let _ = mem.write_at(desc.addr, &total_data[offset..offset + to_write]); + offset += to_write; + total_written += to_write as u32; + } + } + + let _ = rx_queue.add_used(head, total_written, mem); + injected = true; + } + + injected + } +} + +impl VirtioDeviceBackend for VirtioNet { + fn device_id(&self) -> u32 { + VIRTIO_NET_ID + } + + fn device_features(&self, page: u32) -> u32 { + match page { + 0 => (1 << VIRTIO_NET_F_MAC) | (1 << VIRTIO_NET_F_STATUS), + 1 => 1 << VIRTIO_F_VERSION_1_BIT, + _ => 0, + } + } + + fn read_config(&self, offset: u64) -> u32 { + // Config space layout (virtio spec 5.1.4): + // offset 0: mac[0..3] (4 bytes as u32 LE) + // offset 4: mac[4..5] + status (u16 each, packed as u32 LE) + // offset 6: status (u16) — but guest typically reads at offset 4 + match offset { + 0 => u32::from_le_bytes([self.mac[0], self.mac[1], self.mac[2], self.mac[3]]), + 4 => { + // mac[4], mac[5], status_lo, status_hi + let status = VIRTIO_NET_S_LINK_UP; + u32::from_le_bytes([ + self.mac[4], + self.mac[5], + (status & 0xFF) as u8, + ((status >> 8) & 0xFF) as u8, + ]) + } + _ => 0, + } + } + + fn queue_notify( + &mut self, + queue_idx: u32, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + match queue_idx as usize { + TX_QUEUE => self.process_tx(queue, mem), + _ => false, + } + } + + fn num_queues(&self) -> usize { + NUM_QUEUES + } + + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + QUEUE_MAX_SIZE + } + + fn poll(&mut self, queues: &mut [Virtqueue], mem: &dyn GuestMemoryAccessor) -> bool { + // Drain available frames from the transport. + if let Some(ref mut transport) = self.transport { + while let Some(frame) = transport.recv_frame() { + self.rx_pending.push_back(frame); + } + } + + // Inject pending frames into the RX queue. + if queues.len() > RX_QUEUE { + self.inject_rx(&mut queues[RX_QUEUE], mem) + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::queue::Virtqueue; + use super::super::super::error::{Result, WkrunError}; + use std::cell::RefCell; + + struct MockMem { + data: RefCell>, + } + + impl MockMem { + fn new(size: usize) -> Self { + MockMem { + data: RefCell::new(vec![0u8; size]), + } + } + + fn write_bytes(&self, addr: u64, bytes: &[u8]) { + let a = addr as usize; + let mut data = self.data.borrow_mut(); + data[a..a + bytes.len()].copy_from_slice(bytes); + } + + fn read_bytes(&self, addr: u64, len: usize) -> Vec { + let a = addr as usize; + let data = self.data.borrow(); + data[a..a + len].to_vec() + } + + fn write_u16_at(&self, addr: u64, val: u16) { + self.write_bytes(addr, &val.to_le_bytes()); + } + + fn write_u32_at(&self, addr: u64, val: u32) { + self.write_bytes(addr, &val.to_le_bytes()); + } + + fn write_u64_at(&self, addr: u64, val: u64) { + self.write_bytes(addr, &val.to_le_bytes()); + } + } + + impl GuestMemoryAccessor for MockMem { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { + let a = addr as usize; + let data = self.data.borrow(); + if a + buf.len() > data.len() { + return Err(WkrunError::Memory("out of bounds".into())); + } + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { + let a = addr as usize; + let mut mem = self.data.borrow_mut(); + if a + data.len() > mem.len() { + return Err(WkrunError::Memory("out of bounds".into())); + } + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + // Memory layout for tests. + const DESC_TABLE: u64 = 0x0000; + const DESC_SIZE: u64 = 16; + const AVAIL_RING: u64 = 0x0800; + const USED_RING: u64 = 0x1000; + const BUF_BASE: u64 = 0x2000; + + fn setup_queue(max_size: u16) -> Virtqueue { + let mut q = Virtqueue::new(max_size); + q.set_size(max_size); + q.set_desc_table(DESC_TABLE); + q.set_avail_ring(AVAIL_RING); + q.set_used_ring(USED_RING); + q.set_ready(true); + q + } + + fn write_descriptor(mem: &MockMem, index: u16, addr: u64, len: u32, flags: u16, next: u16) { + let base = DESC_TABLE + index as u64 * DESC_SIZE; + mem.write_u64_at(base, addr); + mem.write_u32_at(base + 8, len); + mem.write_u16_at(base + 12, flags); + mem.write_u16_at(base + 14, next); + } + + fn push_avail(mem: &MockMem, ring_idx: u16, desc_head: u16) { + let entry_off = AVAIL_RING + 4 + (ring_idx as u64) * 2; + mem.write_u16_at(entry_off, desc_head); + mem.write_u16_at(AVAIL_RING + 2, ring_idx + 1); + } + + /// Mock transport with shared state for inspecting sent frames + /// and injecting received frames after the transport is owned by VirtioNet. + struct SharedMockTransport { + sent: std::sync::Arc>>>, + recv_queue: std::sync::Arc>>>, + } + + impl SharedMockTransport { + fn new() -> (Self, SharedMockHandle) { + let sent = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let recv_queue = std::sync::Arc::new(std::sync::Mutex::new(VecDeque::new())); + let handle = SharedMockHandle { + sent: sent.clone(), + recv_queue: recv_queue.clone(), + }; + (SharedMockTransport { sent, recv_queue }, handle) + } + } + + impl NetTransport for SharedMockTransport { + fn recv_frame(&mut self) -> Option> { + self.recv_queue.lock().unwrap().pop_front() + } + + fn send_frame(&mut self, frame: &[u8]) -> io::Result<()> { + self.sent.lock().unwrap().push(frame.to_vec()); + Ok(()) + } + } + + struct SharedMockHandle { + sent: std::sync::Arc>>>, + recv_queue: std::sync::Arc>>>, + } + + impl SharedMockHandle { + fn push_recv(&self, frame: Vec) { + self.recv_queue.lock().unwrap().push_back(frame); + } + + fn sent_frames(&self) -> Vec> { + self.sent.lock().unwrap().clone() + } + } + + fn test_mac() -> [u8; 6] { + [0x52, 0x54, 0x00, 0x12, 0x34, 0x56] + } + + // --- Device identity --- + + #[test] + fn test_device_id() { + let dev = VirtioNet::new(test_mac(), None); + assert_eq!(dev.device_id(), 1); + } + + #[test] + fn test_num_queues() { + let dev = VirtioNet::new(test_mac(), None); + assert_eq!(dev.num_queues(), 2); + } + + #[test] + fn test_queue_max_size() { + let dev = VirtioNet::new(test_mac(), None); + assert_eq!(dev.queue_max_size(0), 256); + assert_eq!(dev.queue_max_size(1), 256); + } + + #[test] + fn test_features_page0() { + let dev = VirtioNet::new(test_mac(), None); + let features = dev.device_features(0); + assert_ne!(features & (1 << VIRTIO_NET_F_MAC), 0); + assert_ne!(features & (1 << VIRTIO_NET_F_STATUS), 0); + } + + #[test] + fn test_features_page1() { + let dev = VirtioNet::new(test_mac(), None); + assert_eq!(dev.device_features(1), 1); // VIRTIO_F_VERSION_1 + } + + // --- Config space --- + + #[test] + fn test_config_mac_offset_0() { + let mac = [0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF]; + let dev = VirtioNet::new(mac, None); + let val = dev.read_config(0); + assert_eq!(val, u32::from_le_bytes([0xAA, 0xBB, 0xCC, 0xDD])); + } + + #[test] + fn test_config_mac_offset_4() { + let mac = [0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF]; + let dev = VirtioNet::new(mac, None); + let val = dev.read_config(4); + // mac[4]=0xEE, mac[5]=0xFF, status=0x0001 (LINK_UP) + assert_eq!(val, u32::from_le_bytes([0xEE, 0xFF, 0x01, 0x00])); + } + + #[test] + fn test_config_status_link_up() { + let dev = VirtioNet::new(test_mac(), None); + let val = dev.read_config(4); + // Status is in bytes 2-3 of the u32 at offset 4. + let status = (val >> 16) as u16; + assert_eq!(status, VIRTIO_NET_S_LINK_UP); + } + + // --- TX queue --- + + #[test] + fn test_tx_sends_frame() { + let (transport, handle) = SharedMockTransport::new(); + let mut dev = VirtioNet::new(test_mac(), Some(Box::new(transport))); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(256); + + // Write virtio_net_hdr (12 zero bytes) + Ethernet frame to guest memory. + let mut tx_data = vec![0u8; VIRTIO_NET_HDR_SIZE]; + let frame = b"\xff\xff\xff\xff\xff\xff\x52\x54\x00\x12\x34\x56\x08\x00hello"; + tx_data.extend_from_slice(frame); + mem.write_bytes(BUF_BASE, &tx_data); + + // Single descriptor: header + frame (device-readable). + write_descriptor(&mem, 0, BUF_BASE, tx_data.len() as u32, 0, 0); + push_avail(&mem, 0, 0); + + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); + + let sent = handle.sent_frames(); + assert_eq!(sent.len(), 1); + assert_eq!(sent[0], frame); // virtio_net_hdr stripped. + } + + #[test] + fn test_tx_empty_chain_skipped() { + let mut dev = VirtioNet::new(test_mac(), None); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(256); + + write_descriptor(&mem, 0, BUF_BASE, 0, 0, 0); + push_avail(&mem, 0, 0); + + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); + } + + #[test] + fn test_tx_short_header_skipped() { + let (transport, handle) = SharedMockTransport::new(); + let mut dev = VirtioNet::new(test_mac(), Some(Box::new(transport))); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(256); + + // Only 8 bytes — shorter than virtio_net_hdr. + mem.write_bytes(BUF_BASE, &[0u8; 8]); + write_descriptor(&mem, 0, BUF_BASE, 8, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + assert!(handle.sent_frames().is_empty()); // Nothing sent. + } + + // --- RX queue --- + + #[test] + fn test_rx_inject_frame() { + let mut dev = VirtioNet::new(test_mac(), None); + let mem = MockMem::new(0x10000); + let mut rx_queue = setup_queue(256); + + let frame = b"\xff\xff\xff\xff\xff\xff\x52\x54\x00\x12\x34\x56\x08\x00data".to_vec(); + dev.rx_pending.push_back(frame.clone()); + + // RX buffer (device-writable). + write_descriptor(&mem, 0, BUF_BASE, 1500, 2, 0); // WRITE flag = 2 + push_avail(&mem, 0, 0); + + let injected = dev.inject_rx(&mut rx_queue, &mem); + assert!(injected); + + // Check: 12-byte zero header + frame. + let hdr = mem.read_bytes(BUF_BASE, VIRTIO_NET_HDR_SIZE); + assert_eq!(hdr, vec![0u8; VIRTIO_NET_HDR_SIZE]); + let written_frame = mem.read_bytes(BUF_BASE + VIRTIO_NET_HDR_SIZE as u64, frame.len()); + assert_eq!(written_frame, frame); + } + + #[test] + fn test_rx_no_buffers_stays_pending() { + let mut dev = VirtioNet::new(test_mac(), None); + let mem = MockMem::new(0x10000); + let mut rx_queue = setup_queue(256); + // Don't push any available buffers. + + dev.rx_pending.push_back(b"frame1".to_vec()); + let injected = dev.inject_rx(&mut rx_queue, &mem); + assert!(!injected); + assert_eq!(dev.rx_pending.len(), 1); + } + + #[test] + fn test_rx_multiple_frames() { + let mut dev = VirtioNet::new(test_mac(), None); + let mem = MockMem::new(0x10000); + let mut rx_queue = setup_queue(256); + + dev.rx_pending.push_back(b"frame1".to_vec()); + dev.rx_pending.push_back(b"frame2".to_vec()); + + // Two RX buffers. + write_descriptor(&mem, 0, BUF_BASE, 1500, 2, 0); + push_avail(&mem, 0, 0); + write_descriptor(&mem, 1, BUF_BASE + 0x1000, 1500, 2, 0); + push_avail(&mem, 1, 1); + + let injected = dev.inject_rx(&mut rx_queue, &mem); + assert!(injected); + assert!(dev.rx_pending.is_empty()); + + // Check first frame. + let f1 = mem.read_bytes(BUF_BASE + VIRTIO_NET_HDR_SIZE as u64, 6); + assert_eq!(f1, b"frame1"); + + // Check second frame. + let f2 = mem.read_bytes(BUF_BASE + 0x1000 + VIRTIO_NET_HDR_SIZE as u64, 6); + assert_eq!(f2, b"frame2"); + } + + // --- Poll --- + + #[test] + fn test_poll_reads_transport() { + let (transport, handle) = SharedMockTransport::new(); + let mut dev = VirtioNet::new(test_mac(), Some(Box::new(transport))); + let mem = MockMem::new(0x10000); + + handle.push_recv(b"incoming_frame".to_vec()); + + // Set up RX buffer. + write_descriptor(&mem, 0, BUF_BASE, 1500, 2, 0); + push_avail(&mem, 0, 0); + + let mut queues = vec![setup_queue(256), setup_queue(256)]; + // Point RX queue to our descriptors. + queues[0].set_desc_table(DESC_TABLE); + queues[0].set_avail_ring(AVAIL_RING); + queues[0].set_used_ring(USED_RING); + + let raised = dev.poll(&mut queues, &mem); + assert!(raised); + + // Frame should be in RX queue: 12-byte hdr + "incoming_frame". + let total_len = VIRTIO_NET_HDR_SIZE + 14; + let written = mem.read_bytes(BUF_BASE, total_len); + assert_eq!(&written[..VIRTIO_NET_HDR_SIZE], &[0u8; VIRTIO_NET_HDR_SIZE]); + assert_eq!(&written[VIRTIO_NET_HDR_SIZE..], b"incoming_frame"); + } + + #[test] + fn test_poll_no_data() { + let (transport, _handle) = SharedMockTransport::new(); + let mut dev = VirtioNet::new(test_mac(), Some(Box::new(transport))); + let mem = MockMem::new(0x10000); + + let mut queues = vec![setup_queue(256), setup_queue(256)]; + let raised = dev.poll(&mut queues, &mem); + assert!(!raised); + } + + // --- Frame length prefix encoding/decoding --- + + #[test] + fn test_frame_length_prefix_encode() { + let mut buf = Vec::new(); + let frame = b"test frame data"; + send_frame_to(&mut buf, frame).unwrap(); + + assert_eq!(buf.len(), 4 + frame.len()); + let len = u32::from_be_bytes([buf[0], buf[1], buf[2], buf[3]]); + assert_eq!(len, frame.len() as u32); + assert_eq!(&buf[4..], frame); + } + + #[test] + fn test_frame_length_prefix_decode() { + let frame = b"hello ethernet"; + let mut wire = Vec::new(); + wire.extend_from_slice(&(frame.len() as u32).to_be_bytes()); + wire.extend_from_slice(frame); + + let mut state = RecvState::default(); + let mut cursor = io::Cursor::new(wire); + let result = recv_frame_from(&mut cursor, &mut state); + assert_eq!(result, Some(frame.to_vec())); + } + + // --- No transport --- + + #[test] + fn test_new_without_transport() { + let mut dev = VirtioNet::new(test_mac(), None); + let mem = MockMem::new(0x10000); + + // TX should silently drop. + let mut tx_data = vec![0u8; VIRTIO_NET_HDR_SIZE]; + tx_data.extend_from_slice(b"dropped"); + mem.write_bytes(BUF_BASE, &tx_data); + write_descriptor(&mem, 0, BUF_BASE, tx_data.len() as u32, 0, 0); + push_avail(&mem, 0, 0); + let mut tx_queue = setup_queue(256); + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); + + // Poll with no transport = false. + let mut queues = vec![setup_queue(256), setup_queue(256)]; + assert!(!dev.poll(&mut queues, &mem)); + } + + // --- MAC generation --- + + #[test] + fn test_mac_generation() { + let mac = generate_mac(42); + assert_eq!(mac[0], 0x52); + assert_eq!(mac[1], 0x54); + assert_eq!(mac[2], 0x00); + // Remaining bytes from seed. + let b = 42u32.to_le_bytes(); + assert_eq!(mac[3], b[0]); + assert_eq!(mac[4], b[1]); + assert_eq!(mac[5], b[2]); + } + + #[test] + fn test_mac_generation_different_seeds() { + let mac1 = generate_mac(1); + let mac2 = generate_mac(2); + // Same OUI prefix. + assert_eq!(&mac1[..3], &mac2[..3]); + // Different generated portion. + assert_ne!(&mac1[3..], &mac2[3..]); + } + + // --- TX with chained descriptors --- + + #[test] + fn test_tx_chained_descriptors() { + let (transport, handle) = SharedMockTransport::new(); + let mut dev = VirtioNet::new(test_mac(), Some(Box::new(transport))); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(256); + + // Descriptor 0: virtio_net_hdr (device-readable), chained to 1. + let hdr = [0u8; VIRTIO_NET_HDR_SIZE]; + mem.write_bytes(BUF_BASE, &hdr); + write_descriptor( + &mem, + 0, + BUF_BASE, + VIRTIO_NET_HDR_SIZE as u32, + 1, // NEXT flag + 1, + ); + + // Descriptor 1: Ethernet frame (device-readable). + let frame = b"ethernet_frame_data"; + mem.write_bytes(BUF_BASE + 0x1000, frame); + write_descriptor(&mem, 1, BUF_BASE + 0x1000, frame.len() as u32, 0, 0); + + push_avail(&mem, 0, 0); + + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); + + let sent = handle.sent_frames(); + assert_eq!(sent.len(), 1); + assert_eq!(sent[0], frame); + } + + // --- Queue notify dispatch --- + + #[test] + fn test_queue_notify_rx_returns_false() { + let mut dev = VirtioNet::new(test_mac(), None); + let mem = MockMem::new(0x10000); + let mut rx_queue = setup_queue(256); + // Notify on RX queue should do nothing. + assert!(!dev.queue_notify(0, &mut rx_queue, &mem)); + } + + #[test] + fn test_queue_notify_invalid_queue() { + let mut dev = VirtioNet::new(test_mac(), None); + let mem = MockMem::new(0x10000); + let mut queue = setup_queue(256); + assert!(!dev.queue_notify(99, &mut queue, &mem)); + } +} diff --git a/src/vmm/src/windows/devices/virtio/p9/filesystem.rs b/src/vmm/src/windows/devices/virtio/p9/filesystem.rs new file mode 100644 index 000000000..a68149df4 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/p9/filesystem.rs @@ -0,0 +1,1089 @@ +//! Host filesystem backend for 9P2000.L. +//! +//! Maps 9P operations to `std::fs` operations on a shared host directory. +//! Each FID maps to an open file or directory path. Security: all paths +//! are resolved relative to the root directory; traversal outside is rejected. + +use std::collections::HashMap; +use std::fs::{self, File, OpenOptions}; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::path::{Path, PathBuf}; + +use super::protocol::{ByteWriter, P9Attr, Qid, QT_DIR, QT_FILE, QT_SYMLINK}; + +/// Linux errno constants used in Rlerror responses. +pub const ENOENT: u32 = 2; +pub const EIO: u32 = 5; +pub const EBADF: u32 = 9; +pub const EACCES: u32 = 13; +pub const EEXIST: u32 = 17; +pub const ENOTDIR: u32 = 20; +pub const EINVAL: u32 = 22; +pub const ENOSPC: u32 = 28; +pub const ENOTEMPTY: u32 = 39; + +/// Linux open flags. +const O_RDONLY: u32 = 0; +const O_WRONLY: u32 = 1; +const O_RDWR: u32 = 2; +const O_CREAT: u32 = 0o100; +const O_TRUNC: u32 = 0o1000; +const O_APPEND: u32 = 0o2000; + +/// Getattr request mask bits (P9_GETATTR_*). +const P9_GETATTR_MODE: u64 = 0x00000001; +const P9_GETATTR_NLINK: u64 = 0x00000002; +const P9_GETATTR_UID: u64 = 0x00000004; +const P9_GETATTR_GID: u64 = 0x00000008; +const P9_GETATTR_RDEV: u64 = 0x00000010; +const P9_GETATTR_ATIME: u64 = 0x00000020; +const P9_GETATTR_MTIME: u64 = 0x00000040; +const P9_GETATTR_CTIME: u64 = 0x00000080; +const P9_GETATTR_SIZE: u64 = 0x00000200; +const P9_GETATTR_BLOCKS: u64 = 0x00000400; +const P9_GETATTR_BTIME: u64 = 0x00000800; +const P9_GETATTR_GEN: u64 = 0x00001000; +const P9_GETATTR_DATA_VERSION: u64 = 0x00002000; +/// Convenience mask for "all basic fields". +const P9_GETATTR_BASIC: u64 = P9_GETATTR_MODE + | P9_GETATTR_NLINK + | P9_GETATTR_UID + | P9_GETATTR_GID + | P9_GETATTR_RDEV + | P9_GETATTR_ATIME + | P9_GETATTR_MTIME + | P9_GETATTR_CTIME + | P9_GETATTR_SIZE + | P9_GETATTR_BLOCKS + | P9_GETATTR_BTIME + | P9_GETATTR_GEN + | P9_GETATTR_DATA_VERSION; + +/// Setattr valid bits. +const P9_SETATTR_MODE: u32 = 0x00000001; +const P9_SETATTR_SIZE: u32 = 0x00000008; + +/// Unlinkat flags. +const AT_REMOVEDIR: u32 = 0x200; + +/// FID state: tracks an open file or directory path. +struct FidState { + path: PathBuf, + file: Option, +} + +/// Host filesystem backend for 9P. +pub struct P9Filesystem { + root: PathBuf, + fids: HashMap, + read_only: bool, + msize: u32, + /// Path-to-QID-path cache for consistent QID.path values (used on non-Unix). + #[cfg(not(unix))] + qid_cache: HashMap, + /// Next synthetic QID path ID (used on non-Unix when inode not available). + #[cfg(not(unix))] + next_qid_path: u64, +} + +impl P9Filesystem { + pub fn new(root: PathBuf, read_only: bool) -> Self { + P9Filesystem { + root, + fids: HashMap::new(), + read_only, + msize: 0, + #[cfg(not(unix))] + qid_cache: HashMap::new(), + #[cfg(not(unix))] + next_qid_path: 1, + } + } + + /// Get the current msize. + pub fn msize(&self) -> u32 { + self.msize + } + + /// Negotiate protocol version. Returns negotiated msize. + pub fn version(&mut self, client_msize: u32) -> u32 { + self.msize = client_msize.min(65536); + // Release all fids on version (per spec). + self.fids.clear(); + self.msize + } + + /// Attach: bind `fid` to the root directory. + pub fn attach(&mut self, fid: u32) -> Result { + let meta = fs::metadata(&self.root).map_err(|_| ENOENT)?; + let qid = self.make_qid(&self.root.clone(), &meta); + self.fids.insert( + fid, + FidState { + path: self.root.clone(), + file: None, + }, + ); + Ok(qid) + } + + /// Walk: resolve path components from `fid` into `newfid`. + pub fn walk(&mut self, fid: u32, newfid: u32, names: &[String]) -> Result, u32> { + let base_path = self.fids.get(&fid).ok_or(EBADF)?.path.clone(); + + if names.is_empty() { + // Clone fid. + self.fids.insert( + newfid, + FidState { + path: base_path, + file: None, + }, + ); + return Ok(Vec::new()); + } + + let mut current = base_path; + let mut qids = Vec::with_capacity(names.len()); + + for name in names { + if name == ".." || name.contains('/') || name.contains('\\') { + return Err(ENOENT); + } + current = current.join(name); + + // Security: verify the resolved path is under root. + if !self.is_under_root(¤t) { + return Err(EACCES); + } + + let meta = fs::metadata(¤t).map_err(|_| ENOENT)?; + qids.push(self.make_qid(¤t, &meta)); + } + + self.fids.insert( + newfid, + FidState { + path: current, + file: None, + }, + ); + + Ok(qids) + } + + /// Open a file for I/O. + pub fn lopen(&mut self, fid: u32, flags: u32) -> Result<(Qid, u32), u32> { + // Clone path to release borrow on self.fids before calling other &mut self methods. + let path = self.fids.get(&fid).ok_or(EBADF)?.path.clone(); + let meta = fs::metadata(&path).map_err(|_| ENOENT)?; + + if meta.is_dir() { + let qid = self.make_qid_from_parts(&path, &meta); + let iounit = self.iounit(); + return Ok((qid, iounit)); + } + + if self.read_only && (flags & 0x3) != O_RDONLY { + return Err(EACCES); + } + + let file = self.open_file(&path, flags)?; + let qid = self.make_qid_from_parts(&path, &meta); + let iounit = self.iounit(); + self.fids.get_mut(&fid).ok_or(EBADF)?.file = Some(file); + Ok((qid, iounit)) + } + + /// Create and open a new file. + pub fn lcreate( + &mut self, + fid: u32, + name: &str, + _flags: u32, + _mode: u32, + _gid: u32, + ) -> Result<(Qid, u32), u32> { + if self.read_only { + return Err(EACCES); + } + + let dir_path = self.fids.get(&fid).ok_or(EBADF)?.path.clone(); + let file_path = dir_path.join(name); + + if !self.is_under_root(&file_path) { + return Err(EACCES); + } + + let file = OpenOptions::new() + .read(true) + .write(true) + .create_new(true) + .open(&file_path) + .map_err(|e| match e.kind() { + std::io::ErrorKind::AlreadyExists => EEXIST, + std::io::ErrorKind::PermissionDenied => EACCES, + _ => EIO, + })?; + + // Set permissions on Unix. + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = fs::set_permissions(&file_path, fs::Permissions::from_mode(_mode)); + } + + let meta = file.metadata().map_err(|_| EIO)?; + let qid = self.make_qid(&file_path, &meta); + let iounit = self.iounit(); + + // Fid now points to the new file. + let state = self.fids.get_mut(&fid).ok_or(EBADF)?; + state.path = file_path; + state.file = Some(file); + + Ok((qid, iounit)) + } + + /// Read from an open file. + pub fn read(&mut self, fid: u32, offset: u64, count: u32) -> Result, u32> { + let state = self.fids.get_mut(&fid).ok_or(EBADF)?; + let file = state.file.as_mut().ok_or(EBADF)?; + + file.seek(SeekFrom::Start(offset)).map_err(|_| EIO)?; + + let max_read = count.min(self.msize.saturating_sub(11)) as usize; // 11 = header(7) + count(4) + let mut buf = vec![0u8; max_read]; + let n = file.read(&mut buf).map_err(|_| EIO)?; + buf.truncate(n); + Ok(buf) + } + + /// Write to an open file. + pub fn write(&mut self, fid: u32, offset: u64, data: &[u8]) -> Result { + if self.read_only { + return Err(EACCES); + } + + let state = self.fids.get_mut(&fid).ok_or(EBADF)?; + let file = state.file.as_mut().ok_or(EBADF)?; + + file.seek(SeekFrom::Start(offset)).map_err(|_| EIO)?; + file.write_all(data).map_err(|_| ENOSPC)?; + Ok(data.len() as u32) + } + + /// Read directory entries. + pub fn readdir(&mut self, fid: u32, offset: u64, count: u32) -> Result, u32> { + let state = self.fids.get(&fid).ok_or(EBADF)?; + let entries: Vec<_> = fs::read_dir(&state.path) + .map_err(|_| ENOTDIR)? + .filter_map(|e| e.ok()) + .collect(); + + let max_size = count.min(self.msize.saturating_sub(11)) as usize; + let mut w = ByteWriter::with_capacity(max_size); + let mut entry_offset = offset; + + for entry in entries.iter().skip(offset as usize) { + let name = entry.file_name(); + let name_str = name.to_string_lossy(); + let meta = match entry.metadata() { + Ok(m) => m, + Err(_) => continue, + }; + + let qid = self.make_qid(&entry.path(), &meta); + let dtype = if meta.is_dir() { 4u8 } else { 8u8 }; + + // Readdir entry: qid[13] + offset[8] + type[1] + name[s] + let entry_size = 13 + 8 + 1 + 2 + name_str.len(); + if w.len() + entry_size > max_size { + break; + } + + entry_offset += 1; + qid.write_to(&mut w); + w.put_u64(entry_offset); + w.put_u8(dtype); + w.put_string(&name_str); + } + + Ok(w.into_bytes()) + } + + /// Get file attributes. + pub fn getattr(&mut self, fid: u32, request_mask: u64) -> Result { + let state = self.fids.get(&fid).ok_or(EBADF)?; + let meta = fs::metadata(&state.path).map_err(|_| ENOENT)?; + let qid = self.make_qid(&state.path.clone(), &meta); + + let valid = request_mask & P9_GETATTR_BASIC; + + let mode = self.metadata_mode(&meta); + let size = meta.len(); + let blksize = 4096u64; + let blocks = size.div_ceil(512); + + // Timestamps. + let (mtime_sec, mtime_nsec) = self.metadata_mtime(&meta); + let (atime_sec, atime_nsec) = self.metadata_atime(&meta); + let (ctime_sec, ctime_nsec) = (mtime_sec, mtime_nsec); // Approximate. + + let nlink = self.metadata_nlink(&meta); + + Ok(P9Attr { + valid, + qid, + mode, + uid: 0, + gid: 0, + nlink, + rdev: 0, + size, + blksize, + blocks, + atime_sec, + atime_nsec, + mtime_sec, + mtime_nsec, + ctime_sec, + ctime_nsec, + btime_sec: 0, + btime_nsec: 0, + gen: 0, + data_version: 0, + }) + } + + /// Set file attributes. + pub fn setattr( + &mut self, + fid: u32, + valid: u32, + mode: u32, + _uid: u32, + _gid: u32, + size: u64, + ) -> Result<(), u32> { + if self.read_only { + return Err(EACCES); + } + + let state = self.fids.get(&fid).ok_or(EBADF)?; + + if valid & P9_SETATTR_MODE != 0 { + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let perms = fs::Permissions::from_mode(mode); + fs::set_permissions(&state.path, perms).map_err(|_| EIO)?; + } + #[cfg(not(unix))] + let _ = mode; + } + + if valid & P9_SETATTR_SIZE != 0 { + let file = OpenOptions::new() + .write(true) + .open(&state.path) + .map_err(|_| EIO)?; + file.set_len(size).map_err(|_| EIO)?; + } + + Ok(()) + } + + /// Release a fid. + pub fn clunk(&mut self, fid: u32) -> Result<(), u32> { + self.fids.remove(&fid).ok_or(EBADF)?; + Ok(()) + } + + /// Create a directory. + pub fn mkdir(&mut self, dfid: u32, name: &str, _mode: u32, _gid: u32) -> Result { + if self.read_only { + return Err(EACCES); + } + + let dir_path = self.fids.get(&dfid).ok_or(EBADF)?.path.clone(); + let new_path = dir_path.join(name); + + if !self.is_under_root(&new_path) { + return Err(EACCES); + } + + fs::create_dir(&new_path).map_err(|e| match e.kind() { + std::io::ErrorKind::AlreadyExists => EEXIST, + std::io::ErrorKind::PermissionDenied => EACCES, + _ => EIO, + })?; + + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = fs::set_permissions(&new_path, fs::Permissions::from_mode(_mode)); + } + + let meta = fs::metadata(&new_path).map_err(|_| EIO)?; + Ok(self.make_qid(&new_path, &meta)) + } + + /// Rename a file or directory. + pub fn renameat( + &mut self, + olddirfid: u32, + oldname: &str, + newdirfid: u32, + newname: &str, + ) -> Result<(), u32> { + if self.read_only { + return Err(EACCES); + } + + let old_dir = self.fids.get(&olddirfid).ok_or(EBADF)?.path.clone(); + let new_dir = self.fids.get(&newdirfid).ok_or(EBADF)?.path.clone(); + + let old_path = old_dir.join(oldname); + let new_path = new_dir.join(newname); + + if !self.is_under_root(&old_path) || !self.is_under_root(&new_path) { + return Err(EACCES); + } + + fs::rename(&old_path, &new_path).map_err(|e| match e.kind() { + std::io::ErrorKind::NotFound => ENOENT, + std::io::ErrorKind::PermissionDenied => EACCES, + _ => EIO, + })?; + + Ok(()) + } + + /// Delete a file or directory. + pub fn unlinkat(&mut self, dirfid: u32, name: &str, flags: u32) -> Result<(), u32> { + if self.read_only { + return Err(EACCES); + } + + let dir_path = self.fids.get(&dirfid).ok_or(EBADF)?.path.clone(); + let target = dir_path.join(name); + + if !self.is_under_root(&target) { + return Err(EACCES); + } + + if flags & AT_REMOVEDIR != 0 { + fs::remove_dir(&target).map_err(|e| match e.kind() { + std::io::ErrorKind::NotFound => ENOENT, + _ => { + // Check if directory is not empty. + if let Ok(mut entries) = fs::read_dir(&target) { + if entries.next().is_some() { + return ENOTEMPTY; + } + } + EIO + } + })?; + } else { + fs::remove_file(&target).map_err(|e| match e.kind() { + std::io::ErrorKind::NotFound => ENOENT, + std::io::ErrorKind::PermissionDenied => EACCES, + _ => EIO, + })?; + } + + Ok(()) + } + + /// Flush cached data to disk. + pub fn fsync(&mut self, fid: u32) -> Result<(), u32> { + let state = self.fids.get_mut(&fid).ok_or(EBADF)?; + if let Some(ref file) = state.file { + // sync_all may fail on read-only files (especially on Windows). + // This is harmless — there's nothing to flush for read-only handles. + let _ = file.sync_all(); + } + Ok(()) + } + + // -- Internal helpers -- + + /// I/O unit size: max data per read/write. + fn iounit(&self) -> u32 { + self.msize.saturating_sub(24) // Conservative: header + read/write overhead. + } + + /// Verify that `path` resolves under the root directory. + fn is_under_root(&self, path: &Path) -> bool { + // Use canonicalize if the path exists; otherwise check components. + if let Ok(canonical) = fs::canonicalize(path) { + if let Ok(root_canonical) = fs::canonicalize(&self.root) { + return canonical.starts_with(&root_canonical); + } + } + // Path doesn't exist yet (e.g., for create). Check the parent. + if let Some(parent) = path.parent() { + if let Ok(canonical_parent) = fs::canonicalize(parent) { + if let Ok(root_canonical) = fs::canonicalize(&self.root) { + return canonical_parent.starts_with(&root_canonical); + } + } + } + false + } + + /// Generate a QID from file metadata. + fn make_qid(&mut self, path: &Path, meta: &fs::Metadata) -> Qid { + self.make_qid_from_parts(path, meta) + } + + fn make_qid_from_parts(&mut self, path: &Path, meta: &fs::Metadata) -> Qid { + let qtype = if meta.is_dir() { + QT_DIR + } else if meta.file_type().is_symlink() { + QT_SYMLINK + } else { + QT_FILE + }; + + let qid_path = self.resolve_qid_path(path, meta); + + let (mtime_sec, _) = self.metadata_mtime(meta); + let version = mtime_sec as u32; + + Qid { + qtype, + version, + path: qid_path, + } + } + + /// Get a unique QID path value for a file. + fn resolve_qid_path(&mut self, path: &Path, meta: &fs::Metadata) -> u64 { + // On Unix: use inode number directly. + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + let _ = path; // suppress unused on non-unix + meta.ino() + } + + // On non-Unix: use a cache mapping canonical paths to synthetic IDs. + #[cfg(not(unix))] + { + let _ = meta; + let canonical = fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf()); + if let Some(&id) = self.qid_cache.get(&canonical) { + id + } else { + let id = self.next_qid_path; + self.next_qid_path += 1; + self.qid_cache.insert(canonical, id); + id + } + } + } + + /// Extract file mode from metadata. + fn metadata_mode(&self, meta: &fs::Metadata) -> u32 { + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + meta.mode() + } + #[cfg(not(unix))] + { + let mut mode = 0o644u32; + if meta.is_dir() { + mode = 0o755 | 0o040000; // S_IFDIR + } else { + mode |= 0o100000; // S_IFREG + } + if meta.permissions().readonly() { + mode &= !0o222; // Remove write bits. + } + mode + } + } + + /// Extract mtime from metadata as (seconds, nanoseconds). + fn metadata_mtime(&self, meta: &fs::Metadata) -> (u64, u64) { + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + (meta.mtime() as u64, meta.mtime_nsec() as u64) + } + #[cfg(not(unix))] + { + meta.modified() + .ok() + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| (d.as_secs(), d.subsec_nanos() as u64)) + .unwrap_or((0, 0)) + } + } + + /// Extract atime from metadata as (seconds, nanoseconds). + fn metadata_atime(&self, meta: &fs::Metadata) -> (u64, u64) { + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + (meta.atime() as u64, meta.atime_nsec() as u64) + } + #[cfg(not(unix))] + { + meta.accessed() + .ok() + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| (d.as_secs(), d.subsec_nanos() as u64)) + .unwrap_or((0, 0)) + } + } + + /// Extract nlink from metadata. + fn metadata_nlink(&self, meta: &fs::Metadata) -> u64 { + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + meta.nlink() + } + #[cfg(not(unix))] + { + let _ = meta; + 1 + } + } + + /// Open a file with Linux open flags mapped to Rust OpenOptions. + fn open_file(&self, path: &Path, flags: u32) -> Result { + let access = flags & 0x3; + let mut opts = OpenOptions::new(); + + match access { + O_RDONLY => { + opts.read(true); + } + O_WRONLY => { + opts.write(true); + } + O_RDWR => { + opts.read(true).write(true); + } + _ => { + opts.read(true); + } + } + + if flags & O_CREAT != 0 { + opts.create(true); + } + if flags & O_TRUNC != 0 { + opts.truncate(true); + } + if flags & O_APPEND != 0 { + opts.append(true); + } + + opts.open(path).map_err(|e| match e.kind() { + std::io::ErrorKind::NotFound => ENOENT, + std::io::ErrorKind::PermissionDenied => EACCES, + _ => EIO, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write as IoWrite; + use tempfile::TempDir; + + fn setup() -> (TempDir, P9Filesystem) { + let tmp = TempDir::new().unwrap(); + let mut fs = P9Filesystem::new(tmp.path().to_path_buf(), false); + fs.version(8192); + (tmp, fs) + } + + fn setup_readonly() -> (TempDir, P9Filesystem) { + let tmp = TempDir::new().unwrap(); + let mut fs = P9Filesystem::new(tmp.path().to_path_buf(), true); + fs.version(8192); + (tmp, fs) + } + + fn create_file(dir: &Path, name: &str, content: &[u8]) { + let path = dir.join(name); + let mut f = File::create(&path).unwrap(); + f.write_all(content).unwrap(); + } + + fn create_subdir(dir: &Path, name: &str) { + fs::create_dir(dir.join(name)).unwrap(); + } + + // -- version -- + + #[test] + fn test_version_negotiates_msize() { + let tmp = TempDir::new().unwrap(); + let mut fs = P9Filesystem::new(tmp.path().to_path_buf(), false); + let msize = fs.version(65536); + assert_eq!(msize, 65536); + assert_eq!(fs.msize(), 65536); + } + + #[test] + fn test_version_caps_msize() { + let tmp = TempDir::new().unwrap(); + let mut fs = P9Filesystem::new(tmp.path().to_path_buf(), false); + let msize = fs.version(1_000_000); + assert_eq!(msize, 65536); // Capped. + } + + // -- attach -- + + #[test] + fn test_attach_returns_dir_qid() { + let (_tmp, mut fs) = setup(); + let qid = fs.attach(0).unwrap(); + assert_eq!(qid.qtype, QT_DIR); + assert_ne!(qid.path, 0); + } + + // -- walk -- + + #[test] + fn test_walk_empty_clones_fid() { + let (_tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + let qids = fs.walk(0, 1, &[]).unwrap(); + assert!(qids.is_empty()); + } + + #[test] + fn test_walk_single_file() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "hello.txt", b"hello"); + fs.attach(0).unwrap(); + let qids = fs.walk(0, 1, &["hello.txt".to_string()]).unwrap(); + assert_eq!(qids.len(), 1); + assert_eq!(qids[0].qtype, QT_FILE); + } + + #[test] + fn test_walk_multiple_components() { + let (tmp, mut fs) = setup(); + create_subdir(tmp.path(), "a"); + create_subdir(&tmp.path().join("a"), "b"); + create_file(&tmp.path().join("a").join("b"), "c.txt", b"content"); + fs.attach(0).unwrap(); + let qids = fs + .walk( + 0, + 1, + &["a".to_string(), "b".to_string(), "c.txt".to_string()], + ) + .unwrap(); + assert_eq!(qids.len(), 3); + assert_eq!(qids[0].qtype, QT_DIR); + assert_eq!(qids[1].qtype, QT_DIR); + assert_eq!(qids[2].qtype, QT_FILE); + } + + #[test] + fn test_walk_nonexistent_returns_error() { + let (_tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + let result = fs.walk(0, 1, &["nonexistent".to_string()]); + assert_eq!(result, Err(ENOENT)); + } + + #[test] + fn test_walk_dotdot_rejected() { + let (_tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + let result = fs.walk(0, 1, &["..".to_string()]); + assert_eq!(result, Err(ENOENT)); + } + + #[test] + fn test_walk_bad_fid() { + let (_tmp, mut fs) = setup(); + let result = fs.walk(99, 1, &["foo".to_string()]); + assert_eq!(result, Err(EBADF)); + } + + // -- lopen + read + write -- + + #[test] + fn test_lopen_and_read() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "data.txt", b"hello world"); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["data.txt".to_string()]).unwrap(); + let (qid, iounit) = fs.lopen(1, O_RDONLY).unwrap(); + assert_eq!(qid.qtype, QT_FILE); + assert!(iounit > 0); + + let data = fs.read(1, 0, 4096).unwrap(); + assert_eq!(data, b"hello world"); + } + + #[test] + fn test_read_with_offset() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "data.txt", b"hello world"); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["data.txt".to_string()]).unwrap(); + fs.lopen(1, O_RDONLY).unwrap(); + + let data = fs.read(1, 6, 4096).unwrap(); + assert_eq!(data, b"world"); + } + + #[test] + fn test_write_and_readback() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "out.txt", b""); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["out.txt".to_string()]).unwrap(); + fs.lopen(1, O_RDWR).unwrap(); + + let written = fs.write(1, 0, b"test data").unwrap(); + assert_eq!(written, 9); + + let data = fs.read(1, 0, 4096).unwrap(); + assert_eq!(data, b"test data"); + } + + // -- readdir -- + + #[test] + fn test_readdir_lists_entries() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "a.txt", b""); + create_file(tmp.path(), "b.txt", b""); + create_subdir(tmp.path(), "subdir"); + fs.attach(0).unwrap(); + fs.lopen(0, O_RDONLY).unwrap(); + + let data = fs.readdir(0, 0, 8192).unwrap(); + // Should contain directory entries for a.txt, b.txt, subdir. + assert!(!data.is_empty()); + } + + #[test] + fn test_readdir_offset_skips() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "a.txt", b""); + create_file(tmp.path(), "b.txt", b""); + create_file(tmp.path(), "c.txt", b""); + fs.attach(0).unwrap(); + + let full = fs.readdir(0, 0, 8192).unwrap(); + let partial = fs.readdir(0, 1, 8192).unwrap(); + // Partial should be smaller (skipped first entry). + assert!(partial.len() < full.len()); + } + + // -- getattr -- + + #[test] + fn test_getattr_file() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "test.txt", b"12345"); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["test.txt".to_string()]).unwrap(); + + let attr = fs.getattr(1, 0x3FFF).unwrap(); + assert_eq!(attr.qid.qtype, QT_FILE); + assert_eq!(attr.size, 5); + assert!(attr.valid != 0); + } + + #[test] + fn test_getattr_dir() { + let (_tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + + let attr = fs.getattr(0, 0x3FFF).unwrap(); + assert_eq!(attr.qid.qtype, QT_DIR); + } + + // -- clunk -- + + #[test] + fn test_clunk_releases_fid() { + let (_tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + fs.clunk(0).unwrap(); + // Fid 0 no longer valid. + assert_eq!(fs.walk(0, 1, &[]), Err(EBADF)); + } + + #[test] + fn test_clunk_bad_fid() { + let (_tmp, mut fs) = setup(); + assert_eq!(fs.clunk(99), Err(EBADF)); + } + + // -- mkdir -- + + #[test] + fn test_mkdir_creates_directory() { + let (tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + let qid = fs.mkdir(0, "newdir", 0o755, 0).unwrap(); + assert_eq!(qid.qtype, QT_DIR); + assert!(tmp.path().join("newdir").is_dir()); + } + + #[test] + fn test_mkdir_already_exists() { + let (tmp, mut fs) = setup(); + create_subdir(tmp.path(), "existing"); + fs.attach(0).unwrap(); + assert_eq!(fs.mkdir(0, "existing", 0o755, 0), Err(EEXIST)); + } + + // -- lcreate -- + + #[test] + fn test_lcreate_creates_file() { + let (tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + let (qid, iounit) = fs.lcreate(0, "new.txt", O_RDWR, 0o644, 0).unwrap(); + assert_eq!(qid.qtype, QT_FILE); + assert!(iounit > 0); + assert!(tmp.path().join("new.txt").exists()); + } + + #[test] + fn test_lcreate_already_exists() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "exists.txt", b""); + fs.attach(0).unwrap(); + assert_eq!(fs.lcreate(0, "exists.txt", O_RDWR, 0o644, 0), Err(EEXIST)); + } + + // -- renameat -- + + #[test] + fn test_renameat() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "old.txt", b"data"); + fs.attach(0).unwrap(); + // Clone fid for newfid. + fs.walk(0, 1, &[]).unwrap(); + fs.renameat(0, "old.txt", 1, "new.txt").unwrap(); + assert!(!tmp.path().join("old.txt").exists()); + assert!(tmp.path().join("new.txt").exists()); + } + + // -- unlinkat -- + + #[test] + fn test_unlinkat_file() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "del.txt", b""); + fs.attach(0).unwrap(); + fs.unlinkat(0, "del.txt", 0).unwrap(); + assert!(!tmp.path().join("del.txt").exists()); + } + + #[test] + fn test_unlinkat_dir() { + let (tmp, mut fs) = setup(); + create_subdir(tmp.path(), "rmdir"); + fs.attach(0).unwrap(); + fs.unlinkat(0, "rmdir", AT_REMOVEDIR).unwrap(); + assert!(!tmp.path().join("rmdir").exists()); + } + + #[test] + fn test_unlinkat_nonempty_dir() { + let (tmp, mut fs) = setup(); + create_subdir(tmp.path(), "notempty"); + create_file(&tmp.path().join("notempty"), "file.txt", b""); + fs.attach(0).unwrap(); + assert_eq!(fs.unlinkat(0, "notempty", AT_REMOVEDIR), Err(ENOTEMPTY)); + } + + // -- fsync -- + + #[test] + fn test_fsync_open_file() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "sync.txt", b"data"); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["sync.txt".to_string()]).unwrap(); + fs.lopen(1, O_RDONLY).unwrap(); + fs.fsync(1).unwrap(); + } + + // -- read-only mode -- + + #[test] + fn test_readonly_blocks_write() { + let (tmp, mut fs) = setup_readonly(); + create_file(tmp.path(), "file.txt", b"data"); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["file.txt".to_string()]).unwrap(); + // Open for write should fail. + assert_eq!(fs.lopen(1, O_WRONLY), Err(EACCES)); + } + + #[test] + fn test_readonly_blocks_mkdir() { + let (_tmp, mut fs) = setup_readonly(); + fs.attach(0).unwrap(); + assert_eq!(fs.mkdir(0, "new", 0o755, 0), Err(EACCES)); + } + + #[test] + fn test_readonly_blocks_unlink() { + let (tmp, mut fs) = setup_readonly(); + create_file(tmp.path(), "nodel.txt", b""); + fs.attach(0).unwrap(); + assert_eq!(fs.unlinkat(0, "nodel.txt", 0), Err(EACCES)); + } + + #[test] + fn test_readonly_allows_read() { + let (tmp, mut fs) = setup_readonly(); + create_file(tmp.path(), "readable.txt", b"hello"); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["readable.txt".to_string()]).unwrap(); + fs.lopen(1, O_RDONLY).unwrap(); + let data = fs.read(1, 0, 4096).unwrap(); + assert_eq!(data, b"hello"); + } + + // -- path traversal security -- + + #[test] + fn test_walk_slash_rejected() { + let (_tmp, mut fs) = setup(); + fs.attach(0).unwrap(); + let result = fs.walk(0, 1, &["a/b".to_string()]); + assert_eq!(result, Err(ENOENT)); + } + + // -- setattr -- + + #[test] + fn test_setattr_truncate() { + let (tmp, mut fs) = setup(); + create_file(tmp.path(), "trunc.txt", b"hello world"); + fs.attach(0).unwrap(); + fs.walk(0, 1, &["trunc.txt".to_string()]).unwrap(); + fs.setattr(1, P9_SETATTR_SIZE, 0, 0, 0, 5).unwrap(); + + // Verify truncation. + let content = std::fs::read(tmp.path().join("trunc.txt")).unwrap(); + assert_eq!(content, b"hello"); + } +} diff --git a/src/vmm/src/windows/devices/virtio/p9/mod.rs b/src/vmm/src/windows/devices/virtio/p9/mod.rs new file mode 100644 index 000000000..5402c6704 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/p9/mod.rs @@ -0,0 +1,820 @@ +//! Virtio-9p device backend (virtio spec v1.2 Section 5.11). +//! +//! Provides a 9P2000.L filesystem share between guest and host. +//! The guest mounts the share via `mount -t 9p -o trans=virtio,version=9p2000.L `. +//! +//! Queue layout: +//! Queue 0 (request): bidirectional 9P messages + +pub mod filesystem; +pub mod protocol; + +use std::path::PathBuf; + +use super::super::mmio::VirtioDeviceBackend; +use super::super::queue::{GuestMemoryAccessor, Virtqueue}; + +use self::filesystem::P9Filesystem; +use self::protocol::*; + +/// Virtio device ID for 9P transport (spec Section 5.11). +const VIRTIO_9P_ID: u32 = 9; + +/// VIRTIO_F_VERSION_1 — bit 32 (page 1, bit 0). +const VIRTIO_F_VERSION_1_BIT: u32 = 0; + +/// VIRTIO_9P_MOUNT_TAG feature bit (page 0, bit 0). +const VIRTIO_9P_MOUNT_TAG_BIT: u32 = 0; + +/// Maximum queue size. +const QUEUE_MAX_SIZE: u16 = 128; + +/// Virtio-9p device with host filesystem backend. +pub struct Virtio9p { + /// Mount tag visible to the guest (max 255 bytes). + tag: String, + /// Filesystem backend. + fs: P9Filesystem, +} + +impl Virtio9p { + /// Create a new 9p device sharing `root_path` on the host. + /// + /// `tag` is the mount tag the guest uses to identify this share. + /// `root_path` is the host directory to expose. + /// `read_only` controls whether writes are permitted. + pub fn new(tag: &str, root_path: PathBuf, read_only: bool) -> Self { + Virtio9p { + tag: tag.to_string(), + fs: P9Filesystem::new(root_path, read_only), + } + } + + /// Get the mount tag. + pub fn tag(&self) -> &str { + &self.tag + } + + /// Process a single 9P request from a descriptor chain. + /// + /// Returns the response bytes to write back, and the total bytes + /// consumed from readable descriptors. + fn process_request(&mut self, request: &[u8]) -> Vec { + let mut r = ByteReader::new(request); + + let hdr = match P9Header::read_from(&mut r) { + Some(h) => h, + None => return build_response(P9_RLERROR, 0, |w| write_rlerror(w, filesystem::EIO)), + }; + + let body = &request[P9_HEADER_SIZE..]; + let req = match parse_request(hdr.msg_type, body) { + Some(r) => r, + None => { + return build_response(P9_RLERROR, hdr.tag, |w| { + write_rlerror(w, filesystem::EINVAL) + }) + } + }; + + self.dispatch(hdr.tag, req) + } + + /// Dispatch a parsed request to the filesystem backend. + fn dispatch(&mut self, tag: u16, req: P9Request) -> Vec { + match req { + P9Request::Tversion { msize, version } => { + if version != "9P2000.L" { + return build_response(P9_RVERSION, tag, |w| { + write_rversion(w, msize, "unknown"); + }); + } + let negotiated = self.fs.version(msize); + build_response(P9_RVERSION, tag, |w| { + write_rversion(w, negotiated, "9P2000.L"); + }) + } + + P9Request::Tattach { fid, .. } => match self.fs.attach(fid) { + Ok(qid) => build_response(P9_RATTACH, tag, |w| write_rattach(w, &qid)), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Twalk { fid, newfid, names } => match self.fs.walk(fid, newfid, &names) { + Ok(qids) => build_response(P9_RWALK, tag, |w| write_rwalk(w, &qids)), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tlopen { fid, flags } => match self.fs.lopen(fid, flags) { + Ok((qid, iounit)) => { + build_response(P9_RLOPEN, tag, |w| write_rlopen(w, &qid, iounit)) + } + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tlcreate { + fid, + name, + flags, + mode, + gid, + } => match self.fs.lcreate(fid, &name, flags, mode, gid) { + Ok((qid, iounit)) => { + build_response(P9_RLCREATE, tag, |w| write_rlcreate(w, &qid, iounit)) + } + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tread { fid, offset, count } => match self.fs.read(fid, offset, count) { + Ok(data) => build_response(P9_RREAD, tag, |w| write_rread(w, &data)), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Twrite { + fid, offset, data, .. + } => match self.fs.write(fid, offset, &data) { + Ok(count) => build_response(P9_RWRITE, tag, |w| write_rwrite(w, count)), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Treaddir { fid, offset, count } => { + match self.fs.readdir(fid, offset, count) { + Ok(data) => build_response(P9_RREADDIR, tag, |w| write_rreaddir(w, &data)), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + } + } + + P9Request::Tgetattr { fid, request_mask } => match self.fs.getattr(fid, request_mask) { + Ok(attr) => build_response(P9_RGETATTR, tag, |w| write_rgetattr(w, &attr)), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tsetattr { + fid, + valid, + mode, + uid, + gid, + size, + .. + } => match self.fs.setattr(fid, valid, mode, uid, gid, size) { + Ok(()) => build_response(P9_RSETATTR, tag, write_rsetattr), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tclunk { fid } => match self.fs.clunk(fid) { + Ok(()) => build_response(P9_RCLUNK, tag, write_rclunk), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tflush { .. } => build_response(P9_RFLUSH, tag, write_rflush), + + P9Request::Tmkdir { + dfid, + name, + mode, + gid, + } => match self.fs.mkdir(dfid, &name, mode, gid) { + Ok(qid) => build_response(P9_RMKDIR, tag, |w| write_rmkdir(w, &qid)), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Trenameat { + olddirfid, + oldname, + newdirfid, + newname, + } => match self.fs.renameat(olddirfid, &oldname, newdirfid, &newname) { + Ok(()) => build_response(P9_RRENAMEAT, tag, write_rrenameat), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tunlinkat { + dirfid, + name, + flags, + } => match self.fs.unlinkat(dirfid, &name, flags) { + Ok(()) => build_response(P9_RUNLINKAT, tag, write_runlinkat), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + + P9Request::Tfsync { fid } => match self.fs.fsync(fid) { + Ok(()) => build_response(P9_RFSYNC, tag, write_rfsync), + Err(e) => build_response(P9_RLERROR, tag, |w| write_rlerror(w, e)), + }, + } + } +} + +impl VirtioDeviceBackend for Virtio9p { + fn device_id(&self) -> u32 { + VIRTIO_9P_ID + } + + fn device_features(&self, page: u32) -> u32 { + match page { + 0 => 1 << VIRTIO_9P_MOUNT_TAG_BIT, + 1 => 1 << VIRTIO_F_VERSION_1_BIT, + _ => 0, + } + } + + fn read_config(&self, offset: u64) -> u32 { + // Config space layout: + // offset 0: tag_len (u16) — only low 16 bits of the u32 read + // offset 2+: tag bytes (padded to u32 alignment) + let tag_bytes = self.tag.as_bytes(); + let tag_len = tag_bytes.len() as u16; + + if offset == 0 { + // tag_len at offset 0 (u16) + first 2 bytes of tag at offset 2. + let mut val = tag_len as u32; + if !tag_bytes.is_empty() { + val |= (tag_bytes[0] as u32) << 16; + } + if tag_bytes.len() > 1 { + val |= (tag_bytes[1] as u32) << 24; + } + val + } else { + // Subsequent 4-byte reads into the tag string. + // offset is relative to config space start. + // tag starts at byte 2 within config space. + let tag_start = offset as usize - 2; + let mut bytes = [0u8; 4]; + for (i, byte) in bytes.iter_mut().enumerate() { + let tidx = tag_start + i; + if tidx < tag_bytes.len() { + *byte = tag_bytes[tidx]; + } + } + u32::from_le_bytes(bytes) + } + } + + fn queue_notify( + &mut self, + _queue_idx: u32, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + let mut processed = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + }; + + if chain.is_empty() { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + + // Collect request from device-readable descriptors. + let mut request = Vec::new(); + for desc in &chain { + if !desc.is_write() { + let mut buf = vec![0u8; desc.len as usize]; + if mem.read_at(desc.addr, &mut buf).is_ok() { + request.extend_from_slice(&buf); + } + } + } + + // Process the 9P request. + let response = self.process_request(&request); + + // Write response to device-writable descriptors. + let mut offset = 0; + let mut total_written = 0u32; + for desc in &chain { + if !desc.is_write() { + continue; + } + let remaining = response.len().saturating_sub(offset); + let to_write = remaining.min(desc.len as usize); + if to_write > 0 { + let _ = mem.write_at(desc.addr, &response[offset..offset + to_write]); + offset += to_write; + total_written += to_write as u32; + } + } + + let _ = queue.add_used(head, total_written, mem); + processed = true; + } + + processed + } + + fn num_queues(&self) -> usize { + 1 // Single request queue. + } + + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + QUEUE_MAX_SIZE + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::super::queue::Virtqueue; + use super::super::super::super::error::Result; + use std::cell::RefCell; + use std::io::Write as IoWrite; + use tempfile::TempDir; + + struct MockMem { + data: RefCell>, + } + + impl MockMem { + fn new(size: usize) -> Self { + MockMem { + data: RefCell::new(vec![0u8; size]), + } + } + + fn write_bytes(&self, addr: u64, bytes: &[u8]) { + let a = addr as usize; + let mut data = self.data.borrow_mut(); + data[a..a + bytes.len()].copy_from_slice(bytes); + } + + fn read_bytes(&self, addr: u64, len: usize) -> Vec { + let a = addr as usize; + let data = self.data.borrow(); + data[a..a + len].to_vec() + } + + fn write_u16_at(&self, addr: u64, val: u16) { + self.write_bytes(addr, &val.to_le_bytes()); + } + + fn write_u32_at(&self, addr: u64, val: u32) { + self.write_bytes(addr, &val.to_le_bytes()); + } + + fn write_u64_at(&self, addr: u64, val: u64) { + self.write_bytes(addr, &val.to_le_bytes()); + } + } + + impl GuestMemoryAccessor for MockMem { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { + let a = addr as usize; + let data = self.data.borrow(); + if a + buf.len() > data.len() { + return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + } + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { + let a = addr as usize; + let mut mem = self.data.borrow_mut(); + if a + data.len() > mem.len() { + return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + } + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + // Memory layout for tests. + const DESC_TABLE: u64 = 0x0000; + const DESC_SIZE: u64 = 16; + const AVAIL_RING: u64 = 0x0800; + const USED_RING: u64 = 0x1000; + const BUF_BASE: u64 = 0x2000; + const RESP_BASE: u64 = 0x4000; + + fn setup_queue(max_size: u16) -> Virtqueue { + let mut q = Virtqueue::new(max_size); + q.set_size(max_size); + q.set_desc_table(DESC_TABLE); + q.set_avail_ring(AVAIL_RING); + q.set_used_ring(USED_RING); + q.set_ready(true); + q + } + + fn write_descriptor(mem: &MockMem, index: u16, addr: u64, len: u32, flags: u16, next: u16) { + let base = DESC_TABLE + index as u64 * DESC_SIZE; + mem.write_u64_at(base, addr); + mem.write_u32_at(base + 8, len); + mem.write_u16_at(base + 12, flags); + mem.write_u16_at(base + 14, next); + } + + fn push_avail(mem: &MockMem, ring_idx: u16, desc_head: u16) { + let entry_off = AVAIL_RING + 4 + (ring_idx as u64) * 2; + mem.write_u16_at(entry_off, desc_head); + mem.write_u16_at(AVAIL_RING + 2, ring_idx + 1); + } + + fn create_test_device(tmp: &TempDir) -> Virtio9p { + Virtio9p::new("hostshare", tmp.path().to_path_buf(), false) + } + + /// Submit a request through the virtqueue and return the response bytes. + fn submit_request( + dev: &mut Virtio9p, + mem: &MockMem, + queue: &mut Virtqueue, + request: &[u8], + avail_idx: u16, + ) -> Vec { + let desc_base = avail_idx * 2; + mem.write_bytes(BUF_BASE, request); + + // Descriptor 0: request (device-readable), chained to 1. + write_descriptor( + mem, + desc_base, + BUF_BASE, + request.len() as u32, + 1, // NEXT flag + desc_base + 1, + ); + // Descriptor 1: response buffer (device-writable). + write_descriptor(mem, desc_base + 1, RESP_BASE, 8192, 2, 0); // WRITE flag + + push_avail(mem, avail_idx, desc_base); + + let raised = dev.queue_notify(0, queue, mem); + assert!(raised); + + // Read the response from RESP_BASE. + let resp_data = mem.read_bytes(RESP_BASE, 8192); + // Parse size from response. + let size = u32::from_le_bytes([resp_data[0], resp_data[1], resp_data[2], resp_data[3]]); + resp_data[..size as usize].to_vec() + } + + fn build_tversion() -> Vec { + build_response(P9_TVERSION, P9_NOTAG, |w| { + w.put_u32(8192); + w.put_string("9P2000.L"); + }) + } + + fn build_tattach(fid: u32) -> Vec { + build_response(P9_TATTACH, 1, |w| { + w.put_u32(fid); + w.put_u32(P9_NOFID); + w.put_string(""); + w.put_string(""); + }) + } + + fn build_twalk(fid: u32, newfid: u32, names: &[&str]) -> Vec { + build_response(P9_TWALK, 2, |w| { + w.put_u32(fid); + w.put_u32(newfid); + w.put_u16(names.len() as u16); + for name in names { + w.put_string(name); + } + }) + } + + fn build_tlopen(fid: u32, flags: u32) -> Vec { + build_response(P9_TLOPEN, 3, |w| { + w.put_u32(fid); + w.put_u32(flags); + }) + } + + fn build_tread(fid: u32, offset: u64, count: u32) -> Vec { + build_response(P9_TREAD, 4, |w| { + w.put_u32(fid); + w.put_u64(offset); + w.put_u32(count); + }) + } + + fn build_tclunk(fid: u32) -> Vec { + build_response(P9_TCLUNK, 5, |w| { + w.put_u32(fid); + }) + } + + fn build_treaddir(fid: u32, offset: u64, count: u32) -> Vec { + build_response(P9_TREADDIR, 6, |w| { + w.put_u32(fid); + w.put_u64(offset); + w.put_u32(count); + }) + } + + fn build_tgetattr(fid: u32) -> Vec { + build_response(P9_TGETATTR, 7, |w| { + w.put_u32(fid); + w.put_u64(0x3FFF); // All attributes. + }) + } + + // -- Device identity -- + + #[test] + fn test_device_id() { + let tmp = TempDir::new().unwrap(); + let dev = create_test_device(&tmp); + assert_eq!(dev.device_id(), 9); + } + + #[test] + fn test_num_queues() { + let tmp = TempDir::new().unwrap(); + let dev = create_test_device(&tmp); + assert_eq!(dev.num_queues(), 1); + } + + #[test] + fn test_queue_max_size() { + let tmp = TempDir::new().unwrap(); + let dev = create_test_device(&tmp); + assert_eq!(dev.queue_max_size(0), 128); + } + + #[test] + fn test_features() { + let tmp = TempDir::new().unwrap(); + let dev = create_test_device(&tmp); + assert_eq!(dev.device_features(0), 1); // VIRTIO_9P_MOUNT_TAG. + assert_eq!(dev.device_features(1), 1); // VIRTIO_F_VERSION_1. + assert_eq!(dev.device_features(2), 0); + } + + // -- Config space -- + + #[test] + fn test_config_tag_len() { + let tmp = TempDir::new().unwrap(); + let dev = Virtio9p::new("hostshare", tmp.path().to_path_buf(), false); + let val = dev.read_config(0); + // Low 16 bits = tag_len = 9 ("hostshare") + assert_eq!(val & 0xFFFF, 9); + } + + #[test] + fn test_tag() { + let tmp = TempDir::new().unwrap(); + let dev = create_test_device(&tmp); + assert_eq!(dev.tag(), "hostshare"); + } + + // -- Version negotiation -- + + #[test] + fn test_version_negotiation() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + + let resp = dev.process_request(&build_tversion()); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RVERSION); + let msize = r.get_u32().unwrap(); + assert_eq!(msize, 8192); + let version = r.get_string().unwrap(); + assert_eq!(version, "9P2000.L"); + } + + #[test] + fn test_version_unknown_protocol() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + + let msg = build_response(P9_TVERSION, P9_NOTAG, |w| { + w.put_u32(8192); + w.put_string("9P2000.u"); // Not supported. + }); + let resp = dev.process_request(&msg); + let mut r = ByteReader::new(&resp); + let _hdr = P9Header::read_from(&mut r).unwrap(); + let _msize = r.get_u32().unwrap(); + let version = r.get_string().unwrap(); + assert_eq!(version, "unknown"); + } + + // -- Attach -- + + #[test] + fn test_attach() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + dev.process_request(&build_tversion()); + + let resp = dev.process_request(&build_tattach(0)); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RATTACH); + let qid = Qid::read_from(&mut r).unwrap(); + assert_eq!(qid.qtype, QT_DIR); + } + + // -- Walk + Read file through queue -- + + #[test] + fn test_walk_and_read_via_queue() { + let tmp = TempDir::new().unwrap(); + let mut file = std::fs::File::create(tmp.path().join("hello.txt")).unwrap(); + file.write_all(b"hello world").unwrap(); + drop(file); + + let mut dev = create_test_device(&tmp); + let mem = MockMem::new(0x10000); + let mut queue = setup_queue(128); + + // Version. + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tversion(), 0); + + // Attach. + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tattach(0), 1); + + // Walk to hello.txt. + let _resp = submit_request( + &mut dev, + &mem, + &mut queue, + &build_twalk(0, 1, &["hello.txt"]), + 2, + ); + + // Open. + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tlopen(1, 0), 3); + + // Read. + let resp = submit_request(&mut dev, &mem, &mut queue, &build_tread(1, 0, 4096), 4); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RREAD); + let count = r.get_u32().unwrap(); + assert_eq!(count, 11); + let data = r.get_bytes(count as usize).unwrap(); + assert_eq!(data, b"hello world"); + } + + // -- Readdir via queue -- + + #[test] + fn test_readdir_via_queue() { + let tmp = TempDir::new().unwrap(); + std::fs::File::create(tmp.path().join("a.txt")).unwrap(); + std::fs::File::create(tmp.path().join("b.txt")).unwrap(); + + let mut dev = create_test_device(&tmp); + let mem = MockMem::new(0x10000); + let mut queue = setup_queue(128); + + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tversion(), 0); + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tattach(0), 1); + + // Open root dir. + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tlopen(0, 0), 2); + + // Readdir. + let resp = submit_request(&mut dev, &mem, &mut queue, &build_treaddir(0, 0, 8192), 3); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RREADDIR); + let count = r.get_u32().unwrap(); + assert!(count > 0); // Should contain entries. + } + + // -- Getattr -- + + #[test] + fn test_getattr_via_queue() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + let mem = MockMem::new(0x10000); + let mut queue = setup_queue(128); + + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tversion(), 0); + let _resp = submit_request(&mut dev, &mem, &mut queue, &build_tattach(0), 1); + + let resp = submit_request(&mut dev, &mem, &mut queue, &build_tgetattr(0), 2); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RGETATTR); + } + + // -- Error response for bad fid -- + + #[test] + fn test_error_bad_fid() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + dev.process_request(&build_tversion()); + + // Try to walk with unattached fid. + let resp = dev.process_request(&build_twalk(99, 1, &["foo"])); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RLERROR); + let ecode = r.get_u32().unwrap(); + assert_eq!(ecode, filesystem::EBADF); + } + + // -- Clunk -- + + #[test] + fn test_clunk() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + dev.process_request(&build_tversion()); + dev.process_request(&build_tattach(0)); + + let resp = dev.process_request(&build_tclunk(0)); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RCLUNK); + + // Fid 0 should now be invalid. + let resp = dev.process_request(&build_twalk(0, 1, &[])); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RLERROR); + } + + // -- Multiple requests in sequence -- + + #[test] + fn test_multiple_requests() { + let tmp = TempDir::new().unwrap(); + std::fs::File::create(tmp.path().join("f1.txt")).unwrap(); + std::fs::File::create(tmp.path().join("f2.txt")).unwrap(); + + let mut dev = create_test_device(&tmp); + dev.process_request(&build_tversion()); + dev.process_request(&build_tattach(0)); + + // Walk to two different files. + let resp1 = dev.process_request(&build_twalk(0, 1, &["f1.txt"])); + let resp2 = dev.process_request(&build_twalk(0, 2, &["f2.txt"])); + + let mut r1 = ByteReader::new(&resp1); + assert_eq!(P9Header::read_from(&mut r1).unwrap().msg_type, P9_RWALK); + + let mut r2 = ByteReader::new(&resp2); + assert_eq!(P9Header::read_from(&mut r2).unwrap().msg_type, P9_RWALK); + } + + // -- Short/malformed request -- + + #[test] + fn test_malformed_request() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + + // Too short for a header. + let resp = dev.process_request(&[0, 0, 0]); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RLERROR); + } + + // -- Flush -- + + #[test] + fn test_flush() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + + let msg = build_response(P9_TFLUSH, 10, |w| { + w.put_u16(5); // oldtag + }); + let resp = dev.process_request(&msg); + let mut r = ByteReader::new(&resp); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RFLUSH); + assert_eq!(hdr.tag, 10); + } + + // -- Empty chain handled -- + + #[test] + fn test_empty_chain_skipped() { + let tmp = TempDir::new().unwrap(); + let mut dev = create_test_device(&tmp); + let mem = MockMem::new(0x10000); + let mut queue = setup_queue(128); + + // Descriptor with 0 length. + write_descriptor(&mem, 0, BUF_BASE, 0, 0, 0); + push_avail(&mem, 0, 0); + + let processed = dev.queue_notify(0, &mut queue, &mem); + assert!(processed); + } +} diff --git a/src/vmm/src/windows/devices/virtio/p9/protocol.rs b/src/vmm/src/windows/devices/virtio/p9/protocol.rs new file mode 100644 index 000000000..c9a6b5932 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/p9/protocol.rs @@ -0,0 +1,1316 @@ +//! 9P2000.L wire protocol types and serialization. +//! +//! Implements the 9P2000.L message format used by Linux v9fs. +//! All multi-byte fields are little-endian. Messages have the format: +//! size[4] type[1] tag[2] params... + +// -- 9P message type constants -- + +pub const P9_RLERROR: u8 = 7; +pub const P9_TLOPEN: u8 = 12; +pub const P9_RLOPEN: u8 = 13; +pub const P9_TLCREATE: u8 = 14; +pub const P9_RLCREATE: u8 = 15; +pub const P9_TGETATTR: u8 = 24; +pub const P9_RGETATTR: u8 = 25; +pub const P9_TSETATTR: u8 = 26; +pub const P9_RSETATTR: u8 = 27; +pub const P9_TREADDIR: u8 = 40; +pub const P9_RREADDIR: u8 = 41; +pub const P9_TFSYNC: u8 = 50; +pub const P9_RFSYNC: u8 = 51; +pub const P9_TMKDIR: u8 = 72; +pub const P9_RMKDIR: u8 = 73; +pub const P9_TRENAMEAT: u8 = 74; +pub const P9_RRENAMEAT: u8 = 75; +pub const P9_TUNLINKAT: u8 = 76; +pub const P9_RUNLINKAT: u8 = 77; +pub const P9_TVERSION: u8 = 100; +pub const P9_RVERSION: u8 = 101; +pub const P9_TATTACH: u8 = 104; +pub const P9_RATTACH: u8 = 105; +pub const P9_TFLUSH: u8 = 108; +pub const P9_RFLUSH: u8 = 109; +pub const P9_TWALK: u8 = 110; +pub const P9_RWALK: u8 = 111; +pub const P9_TREAD: u8 = 116; +pub const P9_RREAD: u8 = 117; +pub const P9_TWRITE: u8 = 118; +pub const P9_RWRITE: u8 = 119; +pub const P9_TCLUNK: u8 = 120; +pub const P9_RCLUNK: u8 = 121; + +/// No-fid sentinel. +pub const P9_NOFID: u32 = u32::MAX; + +/// No-tag sentinel (used in Tversion). +pub const P9_NOTAG: u16 = u16::MAX; + +/// 9P message header size (size[4] + type[1] + tag[2]). +pub const P9_HEADER_SIZE: usize = 7; + +/// QID size in bytes (type[1] + version[4] + path[8]). +pub const QID_SIZE: usize = 13; + +/// QID type: directory. +pub const QT_DIR: u8 = 0x80; +/// QID type: regular file. +pub const QT_FILE: u8 = 0x00; +/// QID type: symlink. +pub const QT_SYMLINK: u8 = 0x02; + +/// Default maximum message size. +pub const DEFAULT_MSIZE: u32 = 8192 + P9_HEADER_SIZE as u32; + +// -- QID -- + +/// 13-byte file identifier (type, version, path). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Qid { + pub qtype: u8, + pub version: u32, + pub path: u64, +} + +impl Qid { + pub fn write_to(&self, w: &mut ByteWriter) { + w.put_u8(self.qtype); + w.put_u32(self.version); + w.put_u64(self.path); + } + + pub fn read_from(r: &mut ByteReader) -> Option { + let qtype = r.get_u8()?; + let version = r.get_u32()?; + let path = r.get_u64()?; + Some(Qid { + qtype, + version, + path, + }) + } +} + +// -- P9 message header -- + +/// Parsed 9P message header. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct P9Header { + pub size: u32, + pub msg_type: u8, + pub tag: u16, +} + +impl P9Header { + pub fn read_from(r: &mut ByteReader) -> Option { + let size = r.get_u32()?; + let msg_type = r.get_u8()?; + let tag = r.get_u16()?; + Some(P9Header { + size, + msg_type, + tag, + }) + } + + pub fn write_to(&self, w: &mut ByteWriter) { + w.put_u32(self.size); + w.put_u8(self.msg_type); + w.put_u16(self.tag); + } +} + +// -- Parsed T-message requests -- + +/// Parsed 9P T-message (client request). +#[derive(Debug)] +pub enum P9Request { + Tversion { + msize: u32, + version: String, + }, + Tattach { + fid: u32, + afid: u32, + uname: String, + aname: String, + }, + Twalk { + fid: u32, + newfid: u32, + names: Vec, + }, + Tlopen { + fid: u32, + flags: u32, + }, + Tlcreate { + fid: u32, + name: String, + flags: u32, + mode: u32, + gid: u32, + }, + Tread { + fid: u32, + offset: u64, + count: u32, + }, + Twrite { + fid: u32, + offset: u64, + count: u32, + data: Vec, + }, + Treaddir { + fid: u32, + offset: u64, + count: u32, + }, + Tgetattr { + fid: u32, + request_mask: u64, + }, + Tsetattr { + fid: u32, + valid: u32, + mode: u32, + uid: u32, + gid: u32, + size: u64, + atime_sec: u64, + atime_nsec: u64, + mtime_sec: u64, + mtime_nsec: u64, + }, + Tclunk { + fid: u32, + }, + Tflush { + oldtag: u16, + }, + Tmkdir { + dfid: u32, + name: String, + mode: u32, + gid: u32, + }, + Trenameat { + olddirfid: u32, + oldname: String, + newdirfid: u32, + newname: String, + }, + Tunlinkat { + dirfid: u32, + name: String, + flags: u32, + }, + Tfsync { + fid: u32, + }, +} + +/// Parse a T-message body (after header has been read). +pub fn parse_request(msg_type: u8, body: &[u8]) -> Option { + let mut r = ByteReader::new(body); + match msg_type { + P9_TVERSION => { + let msize = r.get_u32()?; + let version = r.get_string()?; + Some(P9Request::Tversion { msize, version }) + } + P9_TATTACH => { + let fid = r.get_u32()?; + let afid = r.get_u32()?; + let uname = r.get_string()?; + let aname = r.get_string()?; + Some(P9Request::Tattach { + fid, + afid, + uname, + aname, + }) + } + P9_TWALK => { + let fid = r.get_u32()?; + let newfid = r.get_u32()?; + let nwname = r.get_u16()?; + let mut names = Vec::with_capacity(nwname as usize); + for _ in 0..nwname { + names.push(r.get_string()?); + } + Some(P9Request::Twalk { fid, newfid, names }) + } + P9_TLOPEN => { + let fid = r.get_u32()?; + let flags = r.get_u32()?; + Some(P9Request::Tlopen { fid, flags }) + } + P9_TLCREATE => { + let fid = r.get_u32()?; + let name = r.get_string()?; + let flags = r.get_u32()?; + let mode = r.get_u32()?; + let gid = r.get_u32()?; + Some(P9Request::Tlcreate { + fid, + name, + flags, + mode, + gid, + }) + } + P9_TREAD => { + let fid = r.get_u32()?; + let offset = r.get_u64()?; + let count = r.get_u32()?; + Some(P9Request::Tread { fid, offset, count }) + } + P9_TWRITE => { + let fid = r.get_u32()?; + let offset = r.get_u64()?; + let count = r.get_u32()?; + let data = r.get_bytes(count as usize)?; + Some(P9Request::Twrite { + fid, + offset, + count, + data, + }) + } + P9_TREADDIR => { + let fid = r.get_u32()?; + let offset = r.get_u64()?; + let count = r.get_u32()?; + Some(P9Request::Treaddir { fid, offset, count }) + } + P9_TGETATTR => { + let fid = r.get_u32()?; + let request_mask = r.get_u64()?; + Some(P9Request::Tgetattr { fid, request_mask }) + } + P9_TSETATTR => { + let fid = r.get_u32()?; + let valid = r.get_u32()?; + let mode = r.get_u32()?; + let uid = r.get_u32()?; + let gid = r.get_u32()?; + let size = r.get_u64()?; + let atime_sec = r.get_u64()?; + let atime_nsec = r.get_u64()?; + let mtime_sec = r.get_u64()?; + let mtime_nsec = r.get_u64()?; + Some(P9Request::Tsetattr { + fid, + valid, + mode, + uid, + gid, + size, + atime_sec, + atime_nsec, + mtime_sec, + mtime_nsec, + }) + } + P9_TCLUNK => { + let fid = r.get_u32()?; + Some(P9Request::Tclunk { fid }) + } + P9_TFLUSH => { + let oldtag = r.get_u16()?; + Some(P9Request::Tflush { oldtag }) + } + P9_TMKDIR => { + let dfid = r.get_u32()?; + let name = r.get_string()?; + let mode = r.get_u32()?; + let gid = r.get_u32()?; + Some(P9Request::Tmkdir { + dfid, + name, + mode, + gid, + }) + } + P9_TRENAMEAT => { + let olddirfid = r.get_u32()?; + let oldname = r.get_string()?; + let newdirfid = r.get_u32()?; + let newname = r.get_string()?; + Some(P9Request::Trenameat { + olddirfid, + oldname, + newdirfid, + newname, + }) + } + P9_TUNLINKAT => { + let dirfid = r.get_u32()?; + let name = r.get_string()?; + let flags = r.get_u32()?; + Some(P9Request::Tunlinkat { + dirfid, + name, + flags, + }) + } + P9_TFSYNC => { + let fid = r.get_u32()?; + Some(P9Request::Tfsync { fid }) + } + _ => None, + } +} + +// -- P9Attr: Rgetattr response payload -- + +/// File attributes for Rgetattr. +#[derive(Debug, Clone)] +pub struct P9Attr { + pub valid: u64, + pub qid: Qid, + pub mode: u32, + pub uid: u32, + pub gid: u32, + pub nlink: u64, + pub rdev: u64, + pub size: u64, + pub blksize: u64, + pub blocks: u64, + pub atime_sec: u64, + pub atime_nsec: u64, + pub mtime_sec: u64, + pub mtime_nsec: u64, + pub ctime_sec: u64, + pub ctime_nsec: u64, + pub btime_sec: u64, + pub btime_nsec: u64, + pub gen: u64, + pub data_version: u64, +} + +impl P9Attr { + pub fn write_to(&self, w: &mut ByteWriter) { + w.put_u64(self.valid); + self.qid.write_to(w); + w.put_u32(self.mode); + w.put_u32(self.uid); + w.put_u32(self.gid); + w.put_u64(self.nlink); + w.put_u64(self.rdev); + w.put_u64(self.size); + w.put_u64(self.blksize); + w.put_u64(self.blocks); + w.put_u64(self.atime_sec); + w.put_u64(self.atime_nsec); + w.put_u64(self.mtime_sec); + w.put_u64(self.mtime_nsec); + w.put_u64(self.ctime_sec); + w.put_u64(self.ctime_nsec); + w.put_u64(self.btime_sec); + w.put_u64(self.btime_nsec); + w.put_u64(self.gen); + w.put_u64(self.data_version); + } +} + +// -- Response builders -- + +/// Build an Rlerror response body (after header). +pub fn write_rlerror(w: &mut ByteWriter, ecode: u32) { + w.put_u32(ecode); +} + +/// Build an Rversion response body. +pub fn write_rversion(w: &mut ByteWriter, msize: u32, version: &str) { + w.put_u32(msize); + w.put_string(version); +} + +/// Build an Rattach response body. +pub fn write_rattach(w: &mut ByteWriter, qid: &Qid) { + qid.write_to(w); +} + +/// Build an Rwalk response body. +pub fn write_rwalk(w: &mut ByteWriter, qids: &[Qid]) { + w.put_u16(qids.len() as u16); + for qid in qids { + qid.write_to(w); + } +} + +/// Build an Rlopen response body. +pub fn write_rlopen(w: &mut ByteWriter, qid: &Qid, iounit: u32) { + qid.write_to(w); + w.put_u32(iounit); +} + +/// Build an Rlcreate response body. +pub fn write_rlcreate(w: &mut ByteWriter, qid: &Qid, iounit: u32) { + qid.write_to(w); + w.put_u32(iounit); +} + +/// Build an Rread response body. +pub fn write_rread(w: &mut ByteWriter, data: &[u8]) { + w.put_u32(data.len() as u32); + w.put_raw(data); +} + +/// Build an Rwrite response body. +pub fn write_rwrite(w: &mut ByteWriter, count: u32) { + w.put_u32(count); +} + +/// Build an Rreaddir response body. +pub fn write_rreaddir(w: &mut ByteWriter, data: &[u8]) { + w.put_u32(data.len() as u32); + w.put_raw(data); +} + +/// Build an Rgetattr response body. +pub fn write_rgetattr(w: &mut ByteWriter, attr: &P9Attr) { + attr.write_to(w); +} + +/// Build an Rclunk response body (empty). +pub fn write_rclunk(_w: &mut ByteWriter) { + // No body. +} + +/// Build an Rflush response body (empty). +pub fn write_rflush(_w: &mut ByteWriter) { + // No body. +} + +/// Build an Rsetattr response body (empty). +pub fn write_rsetattr(_w: &mut ByteWriter) { + // No body. +} + +/// Build an Rmkdir response body. +pub fn write_rmkdir(w: &mut ByteWriter, qid: &Qid) { + qid.write_to(w); +} + +/// Build an Rrenameat response body (empty). +pub fn write_rrenameat(_w: &mut ByteWriter) { + // No body. +} + +/// Build an Runlinkat response body (empty). +pub fn write_runlinkat(_w: &mut ByteWriter) { + // No body. +} + +/// Build an Rfsync response body (empty). +pub fn write_rfsync(_w: &mut ByteWriter) { + // No body. +} + +// -- ByteReader: sequential reader over a byte slice -- + +/// Cursor for reading fields from a byte buffer. +pub struct ByteReader<'a> { + data: &'a [u8], + pos: usize, +} + +impl<'a> ByteReader<'a> { + pub fn new(data: &'a [u8]) -> Self { + ByteReader { data, pos: 0 } + } + + pub fn remaining(&self) -> usize { + self.data.len().saturating_sub(self.pos) + } + + pub fn get_u8(&mut self) -> Option { + if self.pos + 1 > self.data.len() { + return None; + } + let val = self.data[self.pos]; + self.pos += 1; + Some(val) + } + + pub fn get_u16(&mut self) -> Option { + if self.pos + 2 > self.data.len() { + return None; + } + let val = u16::from_le_bytes([self.data[self.pos], self.data[self.pos + 1]]); + self.pos += 2; + Some(val) + } + + pub fn get_u32(&mut self) -> Option { + if self.pos + 4 > self.data.len() { + return None; + } + let val = u32::from_le_bytes([ + self.data[self.pos], + self.data[self.pos + 1], + self.data[self.pos + 2], + self.data[self.pos + 3], + ]); + self.pos += 4; + Some(val) + } + + pub fn get_u64(&mut self) -> Option { + if self.pos + 8 > self.data.len() { + return None; + } + let val = u64::from_le_bytes([ + self.data[self.pos], + self.data[self.pos + 1], + self.data[self.pos + 2], + self.data[self.pos + 3], + self.data[self.pos + 4], + self.data[self.pos + 5], + self.data[self.pos + 6], + self.data[self.pos + 7], + ]); + self.pos += 8; + Some(val) + } + + /// Read a 9P string: length[2] + data[length]. + pub fn get_string(&mut self) -> Option { + let len = self.get_u16()? as usize; + let bytes = self.get_bytes(len)?; + String::from_utf8(bytes).ok() + } + + pub fn get_bytes(&mut self, count: usize) -> Option> { + if self.pos + count > self.data.len() { + return None; + } + let val = self.data[self.pos..self.pos + count].to_vec(); + self.pos += count; + Some(val) + } +} + +// -- ByteWriter: sequential writer into a byte buffer -- + +/// Cursor for writing fields into a growable byte buffer. +pub struct ByteWriter { + data: Vec, +} + +impl Default for ByteWriter { + fn default() -> Self { + Self::new() + } +} + +impl ByteWriter { + pub fn new() -> Self { + ByteWriter { data: Vec::new() } + } + + pub fn with_capacity(cap: usize) -> Self { + ByteWriter { + data: Vec::with_capacity(cap), + } + } + + pub fn len(&self) -> usize { + self.data.len() + } + + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + pub fn into_bytes(self) -> Vec { + self.data + } + + pub fn as_bytes(&self) -> &[u8] { + &self.data + } + + pub fn put_u8(&mut self, val: u8) { + self.data.push(val); + } + + pub fn put_u16(&mut self, val: u16) { + self.data.extend_from_slice(&val.to_le_bytes()); + } + + pub fn put_u32(&mut self, val: u32) { + self.data.extend_from_slice(&val.to_le_bytes()); + } + + pub fn put_u64(&mut self, val: u64) { + self.data.extend_from_slice(&val.to_le_bytes()); + } + + /// Write a 9P string: length[2] + data[length]. + pub fn put_string(&mut self, s: &str) { + self.put_u16(s.len() as u16); + self.data.extend_from_slice(s.as_bytes()); + } + + pub fn put_raw(&mut self, data: &[u8]) { + self.data.extend_from_slice(data); + } + + /// Patch a u32 at the given byte offset (used for message size fixup). + pub fn patch_u32(&mut self, offset: usize, val: u32) { + let bytes = val.to_le_bytes(); + self.data[offset..offset + 4].copy_from_slice(&bytes); + } +} + +/// Build a complete 9P response message (header + body). +/// +/// `msg_type` is the R-message type constant. +/// `tag` is the request tag to echo back. +/// `body_fn` writes the body fields into the ByteWriter. +pub fn build_response(msg_type: u8, tag: u16, body_fn: impl FnOnce(&mut ByteWriter)) -> Vec { + let mut w = ByteWriter::with_capacity(128); + // Reserve space for the size field. + w.put_u32(0); + w.put_u8(msg_type); + w.put_u16(tag); + body_fn(&mut w); + // Patch the size field with the total message length. + let total = w.len() as u32; + w.patch_u32(0, total); + w.into_bytes() +} + +#[cfg(test)] +mod tests { + use super::*; + + // -- ByteReader tests -- + + #[test] + fn test_reader_u8() { + let data = [0x42]; + let mut r = ByteReader::new(&data); + assert_eq!(r.get_u8(), Some(0x42)); + assert_eq!(r.get_u8(), None); + } + + #[test] + fn test_reader_u16() { + let data = 0x1234u16.to_le_bytes(); + let mut r = ByteReader::new(&data); + assert_eq!(r.get_u16(), Some(0x1234)); + assert_eq!(r.remaining(), 0); + } + + #[test] + fn test_reader_u32() { + let data = 0xDEADBEEFu32.to_le_bytes(); + let mut r = ByteReader::new(&data); + assert_eq!(r.get_u32(), Some(0xDEADBEEF)); + } + + #[test] + fn test_reader_u64() { + let data = 0x0102030405060708u64.to_le_bytes(); + let mut r = ByteReader::new(&data); + assert_eq!(r.get_u64(), Some(0x0102030405060708)); + } + + #[test] + fn test_reader_string() { + let mut buf = Vec::new(); + buf.extend_from_slice(&5u16.to_le_bytes()); + buf.extend_from_slice(b"hello"); + let mut r = ByteReader::new(&buf); + assert_eq!(r.get_string(), Some("hello".to_string())); + } + + #[test] + fn test_reader_empty_string() { + let buf = 0u16.to_le_bytes(); + let mut r = ByteReader::new(&buf); + assert_eq!(r.get_string(), Some(String::new())); + } + + #[test] + fn test_reader_truncated_returns_none() { + let data = [0x01]; // Only 1 byte, but asking for u32. + let mut r = ByteReader::new(&data); + assert_eq!(r.get_u32(), None); + } + + #[test] + fn test_reader_bytes() { + let data = [1, 2, 3, 4, 5]; + let mut r = ByteReader::new(&data); + assert_eq!(r.get_bytes(3), Some(vec![1, 2, 3])); + assert_eq!(r.get_bytes(3), None); // Only 2 remaining. + assert_eq!(r.get_bytes(2), Some(vec![4, 5])); + } + + // -- ByteWriter tests -- + + #[test] + fn test_writer_roundtrip_u32() { + let mut w = ByteWriter::new(); + w.put_u32(0xCAFEBABE); + let mut r = ByteReader::new(w.as_bytes()); + assert_eq!(r.get_u32(), Some(0xCAFEBABE)); + } + + #[test] + fn test_writer_string() { + let mut w = ByteWriter::new(); + w.put_string("test"); + let mut r = ByteReader::new(w.as_bytes()); + assert_eq!(r.get_string(), Some("test".to_string())); + } + + #[test] + fn test_writer_patch_u32() { + let mut w = ByteWriter::new(); + w.put_u32(0); // Placeholder. + w.put_u8(0xFF); + w.patch_u32(0, 42); + assert_eq!(w.as_bytes()[0..4], 42u32.to_le_bytes()); + assert_eq!(w.as_bytes()[4], 0xFF); + } + + #[test] + fn test_writer_len() { + let mut w = ByteWriter::new(); + assert_eq!(w.len(), 0); + w.put_u32(0); + assert_eq!(w.len(), 4); + w.put_string("hi"); + assert_eq!(w.len(), 4 + 2 + 2); // u32 + u16_len + "hi" + } + + // -- Header tests -- + + #[test] + fn test_header_roundtrip() { + let hdr = P9Header { + size: 23, + msg_type: P9_TVERSION, + tag: P9_NOTAG, + }; + let mut w = ByteWriter::new(); + hdr.write_to(&mut w); + assert_eq!(w.len(), P9_HEADER_SIZE); + + let mut r = ByteReader::new(w.as_bytes()); + let parsed = P9Header::read_from(&mut r).unwrap(); + assert_eq!(parsed, hdr); + } + + // -- QID tests -- + + #[test] + fn test_qid_roundtrip() { + let qid = Qid { + qtype: QT_DIR, + version: 12345, + path: 0xDEAD_BEEF_CAFE, + }; + let mut w = ByteWriter::new(); + qid.write_to(&mut w); + assert_eq!(w.len(), QID_SIZE); + + let mut r = ByteReader::new(w.as_bytes()); + let parsed = Qid::read_from(&mut r).unwrap(); + assert_eq!(parsed, qid); + } + + #[test] + fn test_qid_file() { + let qid = Qid { + qtype: QT_FILE, + version: 0, + path: 1, + }; + let mut w = ByteWriter::new(); + qid.write_to(&mut w); + let mut r = ByteReader::new(w.as_bytes()); + let parsed = Qid::read_from(&mut r).unwrap(); + assert_eq!(parsed.qtype, QT_FILE); + } + + // -- Request parsing tests -- + + #[test] + fn test_parse_tversion() { + let mut w = ByteWriter::new(); + w.put_u32(8192); // msize + w.put_string("9P2000.L"); + let req = parse_request(P9_TVERSION, w.as_bytes()).unwrap(); + match req { + P9Request::Tversion { msize, version } => { + assert_eq!(msize, 8192); + assert_eq!(version, "9P2000.L"); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tattach() { + let mut w = ByteWriter::new(); + w.put_u32(0); // fid + w.put_u32(P9_NOFID); // afid + w.put_string("root"); + w.put_string("/share"); + let req = parse_request(P9_TATTACH, w.as_bytes()).unwrap(); + match req { + P9Request::Tattach { + fid, + afid, + uname, + aname, + } => { + assert_eq!(fid, 0); + assert_eq!(afid, P9_NOFID); + assert_eq!(uname, "root"); + assert_eq!(aname, "/share"); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_twalk_empty() { + let mut w = ByteWriter::new(); + w.put_u32(0); // fid + w.put_u32(1); // newfid + w.put_u16(0); // nwname = 0 + let req = parse_request(P9_TWALK, w.as_bytes()).unwrap(); + match req { + P9Request::Twalk { fid, newfid, names } => { + assert_eq!(fid, 0); + assert_eq!(newfid, 1); + assert!(names.is_empty()); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_twalk_multi() { + let mut w = ByteWriter::new(); + w.put_u32(0); + w.put_u32(1); + w.put_u16(3); + w.put_string("usr"); + w.put_string("local"); + w.put_string("bin"); + let req = parse_request(P9_TWALK, w.as_bytes()).unwrap(); + match req { + P9Request::Twalk { names, .. } => { + assert_eq!(names, vec!["usr", "local", "bin"]); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tlopen() { + let mut w = ByteWriter::new(); + w.put_u32(5); // fid + w.put_u32(0); // O_RDONLY + let req = parse_request(P9_TLOPEN, w.as_bytes()).unwrap(); + match req { + P9Request::Tlopen { fid, flags } => { + assert_eq!(fid, 5); + assert_eq!(flags, 0); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tread() { + let mut w = ByteWriter::new(); + w.put_u32(3); // fid + w.put_u64(100); // offset + w.put_u32(4096); // count + let req = parse_request(P9_TREAD, w.as_bytes()).unwrap(); + match req { + P9Request::Tread { fid, offset, count } => { + assert_eq!(fid, 3); + assert_eq!(offset, 100); + assert_eq!(count, 4096); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_twrite() { + let mut w = ByteWriter::new(); + w.put_u32(3); // fid + w.put_u64(0); // offset + w.put_u32(5); // count + w.put_raw(b"hello"); // data + let req = parse_request(P9_TWRITE, w.as_bytes()).unwrap(); + match req { + P9Request::Twrite { + fid, + offset, + count, + data, + } => { + assert_eq!(fid, 3); + assert_eq!(offset, 0); + assert_eq!(count, 5); + assert_eq!(data, b"hello"); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tclunk() { + let mut w = ByteWriter::new(); + w.put_u32(7); + let req = parse_request(P9_TCLUNK, w.as_bytes()).unwrap(); + match req { + P9Request::Tclunk { fid } => assert_eq!(fid, 7), + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tflush() { + let mut w = ByteWriter::new(); + w.put_u16(42); + let req = parse_request(P9_TFLUSH, w.as_bytes()).unwrap(); + match req { + P9Request::Tflush { oldtag } => assert_eq!(oldtag, 42), + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tgetattr() { + let mut w = ByteWriter::new(); + w.put_u32(1); + w.put_u64(0x3FFF); // request_mask: all valid bits + let req = parse_request(P9_TGETATTR, w.as_bytes()).unwrap(); + match req { + P9Request::Tgetattr { fid, request_mask } => { + assert_eq!(fid, 1); + assert_eq!(request_mask, 0x3FFF); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_treaddir() { + let mut w = ByteWriter::new(); + w.put_u32(2); // fid + w.put_u64(0); // offset + w.put_u32(8192); // count + let req = parse_request(P9_TREADDIR, w.as_bytes()).unwrap(); + match req { + P9Request::Treaddir { fid, offset, count } => { + assert_eq!(fid, 2); + assert_eq!(offset, 0); + assert_eq!(count, 8192); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tmkdir() { + let mut w = ByteWriter::new(); + w.put_u32(1); // dfid + w.put_string("newdir"); + w.put_u32(0o755); // mode + w.put_u32(0); // gid + let req = parse_request(P9_TMKDIR, w.as_bytes()).unwrap(); + match req { + P9Request::Tmkdir { + dfid, + name, + mode, + gid, + } => { + assert_eq!(dfid, 1); + assert_eq!(name, "newdir"); + assert_eq!(mode, 0o755); + assert_eq!(gid, 0); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tunlinkat() { + let mut w = ByteWriter::new(); + w.put_u32(1); // dirfid + w.put_string("oldfile"); + w.put_u32(0); // flags + let req = parse_request(P9_TUNLINKAT, w.as_bytes()).unwrap(); + match req { + P9Request::Tunlinkat { + dirfid, + name, + flags, + } => { + assert_eq!(dirfid, 1); + assert_eq!(name, "oldfile"); + assert_eq!(flags, 0); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_trenameat() { + let mut w = ByteWriter::new(); + w.put_u32(1); // olddirfid + w.put_string("old.txt"); + w.put_u32(2); // newdirfid + w.put_string("new.txt"); + let req = parse_request(P9_TRENAMEAT, w.as_bytes()).unwrap(); + match req { + P9Request::Trenameat { + olddirfid, + oldname, + newdirfid, + newname, + } => { + assert_eq!(olddirfid, 1); + assert_eq!(oldname, "old.txt"); + assert_eq!(newdirfid, 2); + assert_eq!(newname, "new.txt"); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tfsync() { + let mut w = ByteWriter::new(); + w.put_u32(5); + let req = parse_request(P9_TFSYNC, w.as_bytes()).unwrap(); + match req { + P9Request::Tfsync { fid } => assert_eq!(fid, 5), + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tlcreate() { + let mut w = ByteWriter::new(); + w.put_u32(1); // fid + w.put_string("newfile.txt"); + w.put_u32(0x42); // flags (O_CREAT|O_RDWR) + w.put_u32(0o644); // mode + w.put_u32(0); // gid + let req = parse_request(P9_TLCREATE, w.as_bytes()).unwrap(); + match req { + P9Request::Tlcreate { + fid, + name, + flags, + mode, + gid, + } => { + assert_eq!(fid, 1); + assert_eq!(name, "newfile.txt"); + assert_eq!(flags, 0x42); + assert_eq!(mode, 0o644); + assert_eq!(gid, 0); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_tsetattr() { + let mut w = ByteWriter::new(); + w.put_u32(3); // fid + w.put_u32(0x01); // valid (mode) + w.put_u32(0o755); // mode + w.put_u32(0); // uid + w.put_u32(0); // gid + w.put_u64(0); // size + w.put_u64(0); // atime_sec + w.put_u64(0); // atime_nsec + w.put_u64(0); // mtime_sec + w.put_u64(0); // mtime_nsec + let req = parse_request(P9_TSETATTR, w.as_bytes()).unwrap(); + match req { + P9Request::Tsetattr { + fid, valid, mode, .. + } => { + assert_eq!(fid, 3); + assert_eq!(valid, 0x01); + assert_eq!(mode, 0o755); + } + _ => panic!("wrong variant"), + } + } + + #[test] + fn test_parse_unknown_type_returns_none() { + assert!(parse_request(0xFF, &[]).is_none()); + } + + // -- Response builder tests -- + + #[test] + fn test_build_rversion() { + let msg = build_response(P9_RVERSION, P9_NOTAG, |w| { + write_rversion(w, 8192, "9P2000.L"); + }); + let mut r = ByteReader::new(&msg); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RVERSION); + assert_eq!(hdr.tag, P9_NOTAG); + assert_eq!(hdr.size as usize, msg.len()); + + let msize = r.get_u32().unwrap(); + let version = r.get_string().unwrap(); + assert_eq!(msize, 8192); + assert_eq!(version, "9P2000.L"); + } + + #[test] + fn test_build_rlerror() { + let msg = build_response(P9_RLERROR, 1, |w| { + write_rlerror(w, 2); // ENOENT + }); + let mut r = ByteReader::new(&msg); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RLERROR); + assert_eq!(hdr.tag, 1); + let ecode = r.get_u32().unwrap(); + assert_eq!(ecode, 2); + } + + #[test] + fn test_build_rwalk() { + let qids = vec![ + Qid { + qtype: QT_DIR, + version: 1, + path: 100, + }, + Qid { + qtype: QT_FILE, + version: 2, + path: 200, + }, + ]; + let msg = build_response(P9_RWALK, 5, |w| { + write_rwalk(w, &qids); + }); + let mut r = ByteReader::new(&msg); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RWALK); + let nwqid = r.get_u16().unwrap(); + assert_eq!(nwqid, 2); + let q1 = Qid::read_from(&mut r).unwrap(); + assert_eq!(q1.path, 100); + let q2 = Qid::read_from(&mut r).unwrap(); + assert_eq!(q2.path, 200); + } + + #[test] + fn test_build_rread() { + let msg = build_response(P9_RREAD, 3, |w| { + write_rread(w, b"file data"); + }); + let mut r = ByteReader::new(&msg); + let _hdr = P9Header::read_from(&mut r).unwrap(); + let count = r.get_u32().unwrap(); + assert_eq!(count, 9); + let data = r.get_bytes(count as usize).unwrap(); + assert_eq!(data, b"file data"); + } + + #[test] + fn test_build_rwrite() { + let msg = build_response(P9_RWRITE, 3, |w| { + write_rwrite(w, 42); + }); + let mut r = ByteReader::new(&msg); + let _hdr = P9Header::read_from(&mut r).unwrap(); + let count = r.get_u32().unwrap(); + assert_eq!(count, 42); + } + + #[test] + fn test_build_rlopen() { + let qid = Qid { + qtype: QT_FILE, + version: 1, + path: 42, + }; + let msg = build_response(P9_RLOPEN, 2, |w| { + write_rlopen(w, &qid, 4096); + }); + let mut r = ByteReader::new(&msg); + let _hdr = P9Header::read_from(&mut r).unwrap(); + let q = Qid::read_from(&mut r).unwrap(); + assert_eq!(q, qid); + let iounit = r.get_u32().unwrap(); + assert_eq!(iounit, 4096); + } + + #[test] + fn test_build_rattach() { + let qid = Qid { + qtype: QT_DIR, + version: 0, + path: 1, + }; + let msg = build_response(P9_RATTACH, 0, |w| { + write_rattach(w, &qid); + }); + let mut r = ByteReader::new(&msg); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RATTACH); + let q = Qid::read_from(&mut r).unwrap(); + assert_eq!(q, qid); + } + + #[test] + fn test_build_response_size_correct() { + // Rclunk is header-only (7 bytes total). + let msg = build_response(P9_RCLUNK, 10, |w| { + write_rclunk(w); + }); + assert_eq!(msg.len(), P9_HEADER_SIZE); + let mut r = ByteReader::new(&msg); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.size as usize, P9_HEADER_SIZE); + } + + #[test] + fn test_build_rmkdir() { + let qid = Qid { + qtype: QT_DIR, + version: 3, + path: 99, + }; + let msg = build_response(P9_RMKDIR, 7, |w| { + write_rmkdir(w, &qid); + }); + let mut r = ByteReader::new(&msg); + let hdr = P9Header::read_from(&mut r).unwrap(); + assert_eq!(hdr.msg_type, P9_RMKDIR); + let q = Qid::read_from(&mut r).unwrap(); + assert_eq!(q, qid); + } +} diff --git a/src/vmm/src/windows/devices/virtio/queue.rs b/src/vmm/src/windows/devices/virtio/queue.rs new file mode 100644 index 000000000..eecd5f642 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/queue.rs @@ -0,0 +1,699 @@ +//! Split virtqueue implementation (virtio spec v1.2 Section 2.7). +//! +//! A split virtqueue consists of three regions in guest memory: +//! - Descriptor table: array of buffer descriptors +//! - Available ring: guest-to-device buffer indices +//! - Used ring: device-to-guest completion notifications + +use super::super::super::error::{Result, WkrunError}; + +/// Abstraction over guest physical memory for cross-platform testing. +pub trait GuestMemoryAccessor { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()>; + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()>; +} + +/// Extension methods for reading typed values from guest memory. +trait GuestMemoryExt: GuestMemoryAccessor { + fn read_u16(&self, addr: u64) -> Result { + let mut buf = [0u8; 2]; + self.read_at(addr, &mut buf)?; + Ok(u16::from_le_bytes(buf)) + } + + fn read_u32(&self, addr: u64) -> Result { + let mut buf = [0u8; 4]; + self.read_at(addr, &mut buf)?; + Ok(u32::from_le_bytes(buf)) + } + + fn read_u64(&self, addr: u64) -> Result { + let mut buf = [0u8; 8]; + self.read_at(addr, &mut buf)?; + Ok(u64::from_le_bytes(buf)) + } + + fn write_u16(&self, addr: u64, val: u16) -> Result<()> { + self.write_at(addr, &val.to_le_bytes()) + } + + fn write_u32(&self, addr: u64, val: u32) -> Result<()> { + self.write_at(addr, &val.to_le_bytes()) + } +} + +impl GuestMemoryExt for T {} + +// Descriptor table entry layout (virtio spec 2.7.5). +const DESC_ADDR_OFFSET: u64 = 0; +const DESC_LEN_OFFSET: u64 = 8; +const DESC_FLAGS_OFFSET: u64 = 12; +const DESC_NEXT_OFFSET: u64 = 14; +const DESC_SIZE: u64 = 16; + +/// Descriptor flag: buffer is device-writable (for reads from device). +const VIRTQ_DESC_F_WRITE: u16 = 2; +/// Descriptor flag: next field is valid (chained descriptor). +const VIRTQ_DESC_F_NEXT: u16 = 1; + +/// A single descriptor from the descriptor table. +#[derive(Debug, Clone, Copy)] +pub struct Descriptor { + /// Guest physical address of the buffer. + pub addr: u64, + /// Length of the buffer in bytes. + pub len: u32, + /// Descriptor flags. + pub flags: u16, + /// Next descriptor index (valid only if VIRTQ_DESC_F_NEXT is set). + pub next: u16, +} + +impl Descriptor { + /// Whether the buffer is device-writable (guest reads from it). + pub fn is_write(&self) -> bool { + self.flags & VIRTQ_DESC_F_WRITE != 0 + } + + /// Whether there is a next descriptor in the chain. + pub fn has_next(&self) -> bool { + self.flags & VIRTQ_DESC_F_NEXT != 0 + } +} + +/// A split virtqueue. +pub struct Virtqueue { + /// Maximum queue size (device sets this). + max_size: u16, + /// Negotiated queue size (driver sets this, must be <= max_size and power of 2). + size: u16, + /// Whether the queue is ready for use. + ready: bool, + /// Guest physical address of the descriptor table. + desc_table_addr: u64, + /// Guest physical address of the available ring. + avail_ring_addr: u64, + /// Guest physical address of the used ring. + used_ring_addr: u64, + /// Last available index consumed by the device. + last_avail_idx: u16, +} + +impl Virtqueue { + /// Create a new virtqueue with the given maximum size. + pub fn new(max_size: u16) -> Self { + Virtqueue { + max_size, + size: 0, + ready: false, + desc_table_addr: 0, + avail_ring_addr: 0, + used_ring_addr: 0, + last_avail_idx: 0, + } + } + + /// Get the maximum queue size. + pub fn max_size(&self) -> u16 { + self.max_size + } + + /// Get the current queue size. + pub fn size(&self) -> u16 { + self.size + } + + /// Set the queue size (called by driver during setup). + pub fn set_size(&mut self, size: u16) { + self.size = size; + } + + /// Whether the queue is ready for I/O. + pub fn is_ready(&self) -> bool { + self.ready + } + + /// Mark the queue as ready. + pub fn set_ready(&mut self, ready: bool) { + self.ready = ready; + } + + /// Set the descriptor table address. + pub fn set_desc_table(&mut self, addr: u64) { + self.desc_table_addr = addr; + } + + /// Set the available ring address. + pub fn set_avail_ring(&mut self, addr: u64) { + self.avail_ring_addr = addr; + } + + /// Set the used ring address. + pub fn set_used_ring(&mut self, addr: u64) { + self.used_ring_addr = addr; + } + + /// Read a descriptor from the descriptor table by index. + fn read_descriptor( + &self, + index: u16, + mem: &(impl GuestMemoryAccessor + ?Sized), + ) -> Result { + if index >= self.size { + return Err(WkrunError::Device(format!( + "descriptor index {} out of bounds (queue size {})", + index, self.size + ))); + } + let addr = self.desc_table_addr + (index as u64) * DESC_SIZE; + Ok(Descriptor { + addr: mem.read_u64(addr + DESC_ADDR_OFFSET)?, + len: mem.read_u32(addr + DESC_LEN_OFFSET)?, + flags: mem.read_u16(addr + DESC_FLAGS_OFFSET)?, + next: mem.read_u16(addr + DESC_NEXT_OFFSET)?, + }) + } + + /// Pop the next available descriptor chain head index, if any. + /// + /// Returns `None` if no new buffers are available. + pub fn pop_avail(&mut self, mem: &(impl GuestMemoryAccessor + ?Sized)) -> Result> { + if !self.ready || self.size == 0 { + return Ok(None); + } + + // Avail ring layout: flags(u16) + idx(u16) + ring[size](u16 each) + let avail_idx = mem.read_u16(self.avail_ring_addr + 2)?; + + if self.last_avail_idx == avail_idx { + return Ok(None); // No new buffers. + } + + let ring_offset = 4 + (self.last_avail_idx % self.size) as u64 * 2; + let head = mem.read_u16(self.avail_ring_addr + ring_offset)?; + + self.last_avail_idx = self.last_avail_idx.wrapping_add(1); + Ok(Some(head)) + } + + /// Read an entire descriptor chain starting from the given head index. + /// + /// Returns the chain of descriptors. Detects cycles by limiting + /// the chain length to the queue size. + pub fn read_desc_chain( + &self, + head: u16, + mem: &(impl GuestMemoryAccessor + ?Sized), + ) -> Result> { + let mut chain = Vec::new(); + let mut index = head; + let max_chain = self.size as usize; + + loop { + if chain.len() >= max_chain { + return Err(WkrunError::Device(format!( + "descriptor chain too long (> {}), possible cycle", + max_chain + ))); + } + + let desc = self.read_descriptor(index, mem)?; + chain.push(desc); + + if !desc.has_next() { + break; + } + index = desc.next; + } + + Ok(chain) + } + + /// Add a used buffer to the used ring. + /// + /// `head` is the descriptor chain head index (from `pop_avail`). + /// `len` is the total bytes written to the descriptor chain. + pub fn add_used( + &mut self, + head: u16, + len: u32, + mem: &(impl GuestMemoryAccessor + ?Sized), + ) -> Result<()> { + if !self.ready || self.size == 0 { + return Err(WkrunError::Device("queue not ready".into())); + } + + // Used ring layout: flags(u16) + idx(u16) + ring[size](id:u32 + len:u32) + let used_idx = mem.read_u16(self.used_ring_addr + 2)?; + let ring_entry_offset = 4 + (used_idx % self.size) as u64 * 8; + let entry_addr = self.used_ring_addr + ring_entry_offset; + + // Write used ring entry: {id: u32, len: u32}. + mem.write_u32(entry_addr, head as u32)?; + mem.write_u32(entry_addr + 4, len)?; + + // Increment used index. + mem.write_u16(self.used_ring_addr + 2, used_idx.wrapping_add(1))?; + + Ok(()) + } + + /// Reset the queue to its initial state. + pub fn reset(&mut self) { + self.size = 0; + self.ready = false; + self.desc_table_addr = 0; + self.avail_ring_addr = 0; + self.used_ring_addr = 0; + self.last_avail_idx = 0; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::cell::RefCell; + + /// Mock guest memory backed by a Vec. + struct MockGuestMemory { + data: RefCell>, + } + + impl MockGuestMemory { + fn new(size: usize) -> Self { + MockGuestMemory { + data: RefCell::new(vec![0u8; size]), + } + } + + fn write_u16_at(&self, addr: u64, val: u16) { + let a = addr as usize; + let bytes = val.to_le_bytes(); + let mut data = self.data.borrow_mut(); + data[a..a + 2].copy_from_slice(&bytes); + } + + fn write_u32_at(&self, addr: u64, val: u32) { + let a = addr as usize; + let bytes = val.to_le_bytes(); + let mut data = self.data.borrow_mut(); + data[a..a + 4].copy_from_slice(&bytes); + } + + fn write_u64_at(&self, addr: u64, val: u64) { + let a = addr as usize; + let bytes = val.to_le_bytes(); + let mut data = self.data.borrow_mut(); + data[a..a + 8].copy_from_slice(&bytes); + } + + fn read_u16_at(&self, addr: u64) -> u16 { + let a = addr as usize; + let data = self.data.borrow(); + u16::from_le_bytes([data[a], data[a + 1]]) + } + + fn read_u32_at(&self, addr: u64) -> u32 { + let a = addr as usize; + let data = self.data.borrow(); + u32::from_le_bytes([data[a], data[a + 1], data[a + 2], data[a + 3]]) + } + } + + impl GuestMemoryAccessor for MockGuestMemory { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { + let a = addr as usize; + let data = self.data.borrow(); + if a + buf.len() > data.len() { + return Err(WkrunError::Memory(format!( + "read out of bounds: 0x{:X} + {}", + addr, + buf.len() + ))); + } + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { + let a = addr as usize; + let mut mem = self.data.borrow_mut(); + if a + data.len() > mem.len() { + return Err(WkrunError::Memory(format!( + "write out of bounds: 0x{:X} + {}", + addr, + data.len() + ))); + } + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + // Memory layout for tests: + // DESC_TABLE at 0x0000 (256 entries * 16 bytes = 4096 bytes) + // AVAIL_RING at 0x1000 (flags:2 + idx:2 + ring[256]:512 + used_event:2 = 518) + // USED_RING at 0x2000 (flags:2 + idx:2 + ring[256]:(4+4)*256=2048 + avail_event:2 = 2054) + const DESC_TABLE: u64 = 0x0000; + const AVAIL_RING: u64 = 0x1000; + const USED_RING: u64 = 0x2000; + + fn setup_queue(max_size: u16) -> Virtqueue { + let mut q = Virtqueue::new(max_size); + q.set_size(max_size); + q.set_desc_table(DESC_TABLE); + q.set_avail_ring(AVAIL_RING); + q.set_used_ring(USED_RING); + q.set_ready(true); + q + } + + /// Write a descriptor into mock memory. + fn write_descriptor( + mem: &MockGuestMemory, + index: u16, + addr: u64, + len: u32, + flags: u16, + next: u16, + ) { + let base = DESC_TABLE + index as u64 * DESC_SIZE; + mem.write_u64_at(base + DESC_ADDR_OFFSET, addr); + mem.write_u32_at(base + DESC_LEN_OFFSET, len); + mem.write_u16_at(base + DESC_FLAGS_OFFSET, flags); + mem.write_u16_at(base + DESC_NEXT_OFFSET, next); + } + + /// Set the avail ring index and add an entry. + fn push_avail(mem: &MockGuestMemory, ring_idx: u16, desc_head: u16) { + // Write ring entry. + let entry_off = AVAIL_RING + 4 + (ring_idx as u64) * 2; + mem.write_u16_at(entry_off, desc_head); + // Update avail idx. + mem.write_u16_at(AVAIL_RING + 2, ring_idx + 1); + } + + // --- Construction tests --- + + #[test] + fn test_new_queue() { + let q = Virtqueue::new(256); + assert_eq!(q.max_size(), 256); + assert_eq!(q.size(), 0); + assert!(!q.is_ready()); + } + + #[test] + fn test_queue_configuration() { + let mut q = Virtqueue::new(256); + q.set_size(128); + q.set_desc_table(0x1000); + q.set_avail_ring(0x2000); + q.set_used_ring(0x3000); + q.set_ready(true); + assert_eq!(q.size(), 128); + assert!(q.is_ready()); + } + + #[test] + fn test_queue_reset() { + let mut q = setup_queue(256); + assert!(q.is_ready()); + q.reset(); + assert!(!q.is_ready()); + assert_eq!(q.size(), 0); + } + + // --- pop_avail tests --- + + #[test] + fn test_pop_avail_empty() { + let mut q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + // Avail idx = 0, last_avail_idx = 0 -> nothing. + assert!(q.pop_avail(&mem).unwrap().is_none()); + } + + #[test] + fn test_pop_avail_not_ready() { + let mut q = Virtqueue::new(256); + let mem = MockGuestMemory::new(0x4000); + assert!(q.pop_avail(&mem).unwrap().is_none()); + } + + #[test] + fn test_pop_avail_single() { + let mut q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + push_avail(&mem, 0, 42); + + let head = q.pop_avail(&mem).unwrap(); + assert_eq!(head, Some(42)); + + // No more available. + assert!(q.pop_avail(&mem).unwrap().is_none()); + } + + #[test] + fn test_pop_avail_multiple() { + let mut q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + push_avail(&mem, 0, 10); + // Push second: ring[1]=20, idx=2 + mem.write_u16_at(AVAIL_RING + 4 + 2, 20); + mem.write_u16_at(AVAIL_RING + 2, 2); + + assert_eq!(q.pop_avail(&mem).unwrap(), Some(10)); + assert_eq!(q.pop_avail(&mem).unwrap(), Some(20)); + assert!(q.pop_avail(&mem).unwrap().is_none()); + } + + // --- read_desc_chain tests --- + + #[test] + fn test_read_single_descriptor() { + let q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + // Descriptor 0: addr=0x5000, len=512, no flags, no next. + write_descriptor(&mem, 0, 0x5000, 512, 0, 0); + + let chain = q.read_desc_chain(0, &mem).unwrap(); + assert_eq!(chain.len(), 1); + assert_eq!(chain[0].addr, 0x5000); + assert_eq!(chain[0].len, 512); + assert!(!chain[0].is_write()); + assert!(!chain[0].has_next()); + } + + #[test] + fn test_read_chained_descriptors() { + let q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + // Descriptor 0 -> 1 -> 2 (virtio-blk: header -> data -> status). + write_descriptor(&mem, 0, 0x5000, 16, VIRTQ_DESC_F_NEXT, 1); + write_descriptor( + &mem, + 1, + 0x6000, + 512, + VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, + 2, + ); + write_descriptor(&mem, 2, 0x7000, 1, VIRTQ_DESC_F_WRITE, 0); + + let chain = q.read_desc_chain(0, &mem).unwrap(); + assert_eq!(chain.len(), 3); + + // Header (device-readable). + assert_eq!(chain[0].addr, 0x5000); + assert_eq!(chain[0].len, 16); + assert!(!chain[0].is_write()); + assert!(chain[0].has_next()); + + // Data buffer (device-writable). + assert_eq!(chain[1].addr, 0x6000); + assert_eq!(chain[1].len, 512); + assert!(chain[1].is_write()); + + // Status (device-writable). + assert_eq!(chain[2].addr, 0x7000); + assert_eq!(chain[2].len, 1); + assert!(chain[2].is_write()); + assert!(!chain[2].has_next()); + } + + #[test] + fn test_chain_cycle_detection() { + let q = setup_queue(4); + let mem = MockGuestMemory::new(0x4000); + + // Descriptor 0 -> 1 -> 0 (cycle). + write_descriptor(&mem, 0, 0x5000, 16, VIRTQ_DESC_F_NEXT, 1); + write_descriptor(&mem, 1, 0x6000, 512, VIRTQ_DESC_F_NEXT, 0); + + let result = q.read_desc_chain(0, &mem); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("cycle"), "error should mention cycle: {}", err); + } + + #[test] + fn test_descriptor_index_out_of_bounds() { + let q = setup_queue(4); + let mem = MockGuestMemory::new(0x4000); + + let result = q.read_desc_chain(5, &mem); + assert!(result.is_err()); + } + + // --- add_used tests --- + + #[test] + fn test_add_used_single() { + let mut q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + q.add_used(42, 512, &mem).unwrap(); + + // Check used ring: idx should be 1. + let used_idx = mem.read_u16_at(USED_RING + 2); + assert_eq!(used_idx, 1); + + // Check used ring entry: {id=42, len=512}. + let entry_id = mem.read_u32_at(USED_RING + 4); + let entry_len = mem.read_u32_at(USED_RING + 4 + 4); + assert_eq!(entry_id, 42); + assert_eq!(entry_len, 512); + } + + #[test] + fn test_add_used_multiple() { + let mut q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + q.add_used(0, 100, &mem).unwrap(); + q.add_used(3, 200, &mem).unwrap(); + + let used_idx = mem.read_u16_at(USED_RING + 2); + assert_eq!(used_idx, 2); + + // First entry. + assert_eq!(mem.read_u32_at(USED_RING + 4), 0); + assert_eq!(mem.read_u32_at(USED_RING + 8), 100); + + // Second entry. + assert_eq!(mem.read_u32_at(USED_RING + 12), 3); + assert_eq!(mem.read_u32_at(USED_RING + 16), 200); + } + + #[test] + fn test_add_used_not_ready() { + let mut q = Virtqueue::new(256); + let mem = MockGuestMemory::new(0x4000); + assert!(q.add_used(0, 0, &mem).is_err()); + } + + // --- Full round-trip: avail -> process -> used --- + + #[test] + fn test_full_roundtrip() { + let mut q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + // Set up a single-descriptor buffer. + write_descriptor(&mem, 5, 0x8000, 1024, VIRTQ_DESC_F_WRITE, 0); + push_avail(&mem, 0, 5); + + // Pop available. + let head = q.pop_avail(&mem).unwrap().expect("should have buffer"); + assert_eq!(head, 5); + + // Read chain. + let chain = q.read_desc_chain(head, &mem).unwrap(); + assert_eq!(chain.len(), 1); + assert_eq!(chain[0].len, 1024); + + // Complete: add to used. + q.add_used(head, 1024, &mem).unwrap(); + + let used_idx = mem.read_u16_at(USED_RING + 2); + assert_eq!(used_idx, 1); + } + + // --- Wrapping behavior --- + + #[test] + fn test_avail_index_wraps() { + let mut q = setup_queue(4); + let mem = MockGuestMemory::new(0x4000); + + // Simulate avail idx at u16::MAX boundary. + q.last_avail_idx = u16::MAX; + // Set avail ring idx to u16::MAX + 1 = 0 (wraps). + mem.write_u16_at(AVAIL_RING + 2, 0); + + // last_avail_idx (65535) == avail_idx (0 after wrap)? + // No: 65535 != 0, so we should get a buffer. + // Ring offset: (65535 % 4) * 2 = 3 * 2 = 6 -> ring[3] + mem.write_u16_at(AVAIL_RING + 4 + 6, 2); + + let head = q.pop_avail(&mem).unwrap(); + assert_eq!(head, Some(2)); + assert_eq!(q.last_avail_idx, 0); // Wrapped. + } + + // --- Virtio-blk style 3-descriptor chain --- + + #[test] + fn test_virtio_blk_chain() { + let q = setup_queue(256); + let mem = MockGuestMemory::new(0x4000); + + // Header (device-readable): type=IN, sector=0 + write_descriptor(&mem, 0, 0xA000, 16, VIRTQ_DESC_F_NEXT, 1); + // Data buffer (device-writable): 512 bytes + write_descriptor( + &mem, + 1, + 0xB000, + 512, + VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, + 2, + ); + // Status (device-writable): 1 byte + write_descriptor(&mem, 2, 0xC000, 1, VIRTQ_DESC_F_WRITE, 0); + + let chain = q.read_desc_chain(0, &mem).unwrap(); + assert_eq!(chain.len(), 3); + assert!(!chain[0].is_write()); // Header is device-readable. + assert!(chain[1].is_write()); // Data is device-writable. + assert!(chain[2].is_write()); // Status is device-writable. + } + + // --- Descriptor flags --- + + #[test] + fn test_descriptor_flags() { + let desc = Descriptor { + addr: 0, + len: 0, + flags: VIRTQ_DESC_F_WRITE | VIRTQ_DESC_F_NEXT, + next: 1, + }; + assert!(desc.is_write()); + assert!(desc.has_next()); + + let desc2 = Descriptor { + addr: 0, + len: 0, + flags: 0, + next: 0, + }; + assert!(!desc2.is_write()); + assert!(!desc2.has_next()); + } +} diff --git a/src/vmm/src/windows/devices/virtio/vsock/connection.rs b/src/vmm/src/windows/devices/virtio/vsock/connection.rs new file mode 100644 index 000000000..4b97eee34 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/vsock/connection.rs @@ -0,0 +1,681 @@ +//! Vsock connection state machine with credit-based flow control. +//! +//! Each vsock connection tracks the state of a bidirectional byte stream +//! between a guest port and a host port. Flow control follows the virtio +//! spec (Section 5.10.6.3): each side advertises buffer space (buf_alloc) +//! and reports bytes consumed (fwd_cnt). The peer computes available +//! send credit as: `peer_buf_alloc - (tx_cnt - peer_fwd_cnt)`. + +use super::packet::{ + VsockHeader, VSOCK_OP_CREDIT_REQUEST, VSOCK_OP_CREDIT_UPDATE, VSOCK_OP_REQUEST, + VSOCK_OP_RESPONSE, VSOCK_OP_RST, VSOCK_OP_RW, VSOCK_OP_SHUTDOWN, VSOCK_SHUTDOWN_RECV, + VSOCK_SHUTDOWN_SEND, +}; + +/// Default buffer space we advertise to the peer (64 KiB). +const DEFAULT_BUF_ALLOC: u32 = 65536; + +/// Connection state. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConnState { + /// No connection established. + Idle, + /// REQUEST sent/received, waiting for RESPONSE. + Connecting, + /// Data transfer active. + Connected, + /// SHUTDOWN sent or received; draining. + Closing, + /// Connection fully closed. + Closed, +} + +/// A single vsock connection between a guest port and a host port. +pub struct VsockConnection { + state: ConnState, + pub local_cid: u64, + pub local_port: u32, + pub peer_cid: u64, + pub peer_port: u32, + // Our credit: how much buffer space we offer to the peer. + buf_alloc: u32, + // Bytes we have consumed (forwarded to host TCP socket). + fwd_cnt: u32, + // Peer's advertised buffer space. + peer_buf_alloc: u32, + // Peer's forwarded count (bytes peer has consumed). + peer_fwd_cnt: u32, + // Total bytes we have sent to the peer (to compute remaining credit). + tx_cnt: u32, + // Host-to-guest transmit buffer. + tx_buf: Vec, + // Whether the peer has requested a credit update. + credit_update_needed: bool, +} + +impl VsockConnection { + /// Create a new connection in the Idle state. + pub fn new(local_cid: u64, local_port: u32, peer_cid: u64, peer_port: u32) -> Self { + VsockConnection { + state: ConnState::Idle, + local_cid, + local_port, + peer_cid, + peer_port, + buf_alloc: DEFAULT_BUF_ALLOC, + fwd_cnt: 0, + peer_buf_alloc: 0, + peer_fwd_cnt: 0, + tx_cnt: 0, + tx_buf: Vec::new(), + credit_update_needed: false, + } + } + + /// Current connection state. + pub fn state(&self) -> ConnState { + self.state + } + + /// Our advertised buffer space. + pub fn buf_alloc(&self) -> u32 { + self.buf_alloc + } + + /// Bytes we have consumed (forwarded to host side). + pub fn fwd_cnt(&self) -> u32 { + self.fwd_cnt + } + + /// Total bytes we have sent to the peer. + pub fn tx_cnt(&self) -> u32 { + self.tx_cnt + } + + /// Bytes buffered for host-to-guest transmission. + pub fn tx_buf_len(&self) -> usize { + self.tx_buf.len() + } + + /// Available credit to send data to the peer. + /// + /// `peer_buf_alloc - (tx_cnt - peer_fwd_cnt)` per spec 5.10.6.3. + pub fn peer_credit(&self) -> u32 { + let in_flight = self.tx_cnt.wrapping_sub(self.peer_fwd_cnt); + self.peer_buf_alloc.saturating_sub(in_flight) + } + + /// Whether we need to send a credit update to the peer. + pub fn needs_credit_update(&self) -> bool { + self.credit_update_needed + } + + /// Clear the credit update flag. + pub fn clear_credit_update(&mut self) { + self.credit_update_needed = false; + } + + /// Handle a REQUEST from the guest. + /// + /// Transitions Idle -> Connected and returns a RESPONSE header. + /// Returns None if the connection is not in Idle state (sends RST instead). + pub fn handle_request(&mut self, hdr: &VsockHeader) -> Option { + if self.state != ConnState::Idle { + return None; + } + + // Record peer's credit info from the REQUEST. + self.peer_buf_alloc = hdr.buf_alloc; + self.peer_fwd_cnt = hdr.fwd_cnt; + self.state = ConnState::Connected; + + Some(VsockHeader::new_response( + self.local_cid, + self.local_port, + self.peer_cid, + self.peer_port, + self.buf_alloc, + self.fwd_cnt, + )) + } + + /// Handle an RW (data) packet from the guest. + /// + /// Returns the payload data to forward to the host TCP socket. + /// Updates fwd_cnt. Returns None if not connected. + pub fn handle_rw(&mut self, payload: &[u8]) -> Option> { + if self.state != ConnState::Connected { + return None; + } + + self.fwd_cnt = self.fwd_cnt.wrapping_add(payload.len() as u32); + + // Check if we should proactively send a credit update. + // If the peer's remaining view of our buffer is below half, signal update. + let peer_view = self.buf_alloc.saturating_sub( + self.fwd_cnt + .wrapping_sub(/* they don't know fwd_cnt yet */ 0), + ); + if peer_view < self.buf_alloc / 2 { + self.credit_update_needed = true; + } + + Some(payload.to_vec()) + } + + /// Handle a SHUTDOWN from the guest. + pub fn handle_shutdown(&mut self, flags: u32) { + match self.state { + ConnState::Connected => { + if flags & (VSOCK_SHUTDOWN_SEND | VSOCK_SHUTDOWN_RECV) + == (VSOCK_SHUTDOWN_SEND | VSOCK_SHUTDOWN_RECV) + { + self.state = ConnState::Closed; + } else { + self.state = ConnState::Closing; + } + } + ConnState::Closing => { + self.state = ConnState::Closed; + } + _ => {} + } + } + + /// Handle a RST from the guest. + pub fn handle_rst(&mut self) { + self.state = ConnState::Closed; + } + + /// Handle a credit update from the guest. + pub fn handle_credit_update(&mut self, hdr: &VsockHeader) { + self.peer_buf_alloc = hdr.buf_alloc; + self.peer_fwd_cnt = hdr.fwd_cnt; + } + + /// Handle a credit request from the guest. + pub fn handle_credit_request(&mut self) { + self.credit_update_needed = true; + } + + /// Enqueue data from the host for transmission to the guest. + /// + /// Returns the number of bytes actually enqueued (limited by peer credit). + pub fn enqueue_tx(&mut self, data: &[u8]) -> usize { + if self.state != ConnState::Connected { + return 0; + } + + let credit = self.peer_credit() as usize; + let to_send = data.len().min(credit); + if to_send > 0 { + self.tx_buf.extend_from_slice(&data[..to_send]); + } + to_send + } + + /// Drain pending host-to-guest data, limited by available credit. + /// + /// Returns data to be placed in an RX virtqueue buffer, along with + /// the header to prepend. + pub fn drain_tx(&mut self, max_payload: usize) -> Option<(VsockHeader, Vec)> { + if self.tx_buf.is_empty() { + return None; + } + + let send_len = self.tx_buf.len().min(max_payload); + let data: Vec = self.tx_buf.drain(..send_len).collect(); + + self.tx_cnt = self.tx_cnt.wrapping_add(data.len() as u32); + + let hdr = VsockHeader::new_rw( + self.local_cid, + self.local_port, + self.peer_cid, + self.peer_port, + data.len() as u32, + self.buf_alloc, + self.fwd_cnt, + ); + + Some((hdr, data)) + } + + /// Build a credit update header for this connection. + pub fn make_credit_update(&self) -> VsockHeader { + VsockHeader::new_credit_update( + self.local_cid, + self.local_port, + self.peer_cid, + self.peer_port, + self.buf_alloc, + self.fwd_cnt, + ) + } + + /// Build a RST header for this connection. + pub fn make_rst(&self) -> VsockHeader { + VsockHeader::new_rst( + self.local_cid, + self.local_port, + self.peer_cid, + self.peer_port, + ) + } + + /// Dispatch a packet by operation code. + /// + /// Returns a response header to send back (if any), and optional + /// payload data to forward to the host side. + pub fn dispatch( + &mut self, + hdr: &VsockHeader, + payload: &[u8], + ) -> (Option, Option>) { + match hdr.op { + VSOCK_OP_REQUEST => { + let resp = self.handle_request(hdr); + if resp.is_none() { + // Already connected or invalid state -> RST. + return (Some(self.make_rst()), None); + } + (resp, None) + } + VSOCK_OP_RW => { + let data = self.handle_rw(payload); + let credit_hdr = if self.credit_update_needed { + self.credit_update_needed = false; + Some(self.make_credit_update()) + } else { + None + }; + (credit_hdr, data) + } + VSOCK_OP_SHUTDOWN => { + self.handle_shutdown(hdr.flags); + (None, None) + } + VSOCK_OP_RST => { + self.handle_rst(); + (None, None) + } + VSOCK_OP_RESPONSE => { + // Guest accepted our connection (host-initiated connect). + if self.state == ConnState::Connecting { + self.peer_buf_alloc = hdr.buf_alloc; + self.peer_fwd_cnt = hdr.fwd_cnt; + self.state = ConnState::Connected; + } + (None, None) + } + VSOCK_OP_CREDIT_UPDATE => { + self.handle_credit_update(hdr); + (None, None) + } + VSOCK_OP_CREDIT_REQUEST => { + self.handle_credit_request(); + let update = self.make_credit_update(); + self.credit_update_needed = false; + (Some(update), None) + } + _ => { + // Unknown op -> RST. + (Some(self.make_rst()), None) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn guest_conn() -> VsockConnection { + // local = host (CID 2), peer = guest (CID 3) + VsockConnection::new(2, 2695, 3, 5000) + } + + fn make_request_hdr() -> VsockHeader { + VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + } + } + + // --- State transitions --- + + #[test] + fn test_new_connection_is_idle() { + let conn = guest_conn(); + assert_eq!(conn.state(), ConnState::Idle); + } + + #[test] + fn test_handle_request_transitions_to_connected() { + let mut conn = guest_conn(); + let hdr = make_request_hdr(); + let resp = conn.handle_request(&hdr); + assert!(resp.is_some()); + assert_eq!(conn.state(), ConnState::Connected); + + let r = resp.unwrap(); + assert_eq!(r.op, VSOCK_OP_RESPONSE); + assert_eq!(r.src_cid, 2); + assert_eq!(r.dst_cid, 3); + assert_eq!(r.buf_alloc, DEFAULT_BUF_ALLOC); + } + + #[test] + fn test_request_on_non_idle_returns_none() { + let mut conn = guest_conn(); + let hdr = make_request_hdr(); + conn.handle_request(&hdr); + assert_eq!(conn.state(), ConnState::Connected); + + // Second request should fail. + let resp = conn.handle_request(&hdr); + assert!(resp.is_none()); + } + + #[test] + fn test_shutdown_both_transitions_to_closed() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.handle_shutdown(VSOCK_SHUTDOWN_SEND | VSOCK_SHUTDOWN_RECV); + assert_eq!(conn.state(), ConnState::Closed); + } + + #[test] + fn test_shutdown_send_only_transitions_to_closing() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.handle_shutdown(VSOCK_SHUTDOWN_SEND); + assert_eq!(conn.state(), ConnState::Closing); + } + + #[test] + fn test_shutdown_closing_to_closed() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.handle_shutdown(VSOCK_SHUTDOWN_SEND); + assert_eq!(conn.state(), ConnState::Closing); + conn.handle_shutdown(VSOCK_SHUTDOWN_RECV); + assert_eq!(conn.state(), ConnState::Closed); + } + + #[test] + fn test_rst_transitions_to_closed() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.handle_rst(); + assert_eq!(conn.state(), ConnState::Closed); + } + + // --- Data transfer --- + + #[test] + fn test_handle_rw_returns_data() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + let data = conn.handle_rw(b"hello"); + assert_eq!(data.as_deref(), Some(b"hello".as_slice())); + } + + #[test] + fn test_handle_rw_updates_fwd_cnt() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.handle_rw(b"hello"); // 5 bytes + assert_eq!(conn.fwd_cnt(), 5); + conn.handle_rw(b"world!"); // 6 bytes + assert_eq!(conn.fwd_cnt(), 11); + } + + #[test] + fn test_handle_rw_when_not_connected_returns_none() { + let mut conn = guest_conn(); + let data = conn.handle_rw(b"hello"); + assert!(data.is_none()); + } + + // --- Credit flow control --- + + #[test] + fn test_peer_credit_initial() { + let mut conn = guest_conn(); + let hdr = make_request_hdr(); // peer_buf_alloc = 32768 + conn.handle_request(&hdr); + assert_eq!(conn.peer_credit(), 32768); + } + + #[test] + fn test_peer_credit_decreases_with_tx() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.enqueue_tx(&[0u8; 1000]); + conn.drain_tx(1000); + // tx_cnt = 1000, peer_fwd_cnt = 0 -> credit = 32768 - 1000 = 31768 + assert_eq!(conn.peer_credit(), 31768); + } + + #[test] + fn test_peer_credit_recovers_with_update() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.enqueue_tx(&[0u8; 1000]); + conn.drain_tx(1000); + + // Simulate peer consumed 1000 bytes. + let mut update = make_request_hdr(); + update.op = VSOCK_OP_CREDIT_UPDATE; + update.fwd_cnt = 1000; + update.buf_alloc = 32768; + conn.handle_credit_update(&update); + + assert_eq!(conn.peer_credit(), 32768); + } + + #[test] + fn test_enqueue_tx_respects_credit() { + let mut conn = guest_conn(); + let mut hdr = make_request_hdr(); + hdr.buf_alloc = 100; // Only 100 bytes of credit. + conn.handle_request(&hdr); + + let queued = conn.enqueue_tx(&[0xAA; 200]); + assert_eq!(queued, 100); // Limited by credit. + assert_eq!(conn.tx_buf_len(), 100); + } + + #[test] + fn test_enqueue_tx_when_not_connected() { + let mut conn = guest_conn(); + let queued = conn.enqueue_tx(b"hello"); + assert_eq!(queued, 0); + } + + // --- Drain TX --- + + #[test] + fn test_drain_tx_returns_data_and_header() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.enqueue_tx(b"hello"); + + let (hdr, data) = conn.drain_tx(1024).unwrap(); + assert_eq!(data, b"hello"); + assert_eq!(hdr.op, VSOCK_OP_RW); + assert_eq!(hdr.len, 5); + assert_eq!(hdr.src_cid, 2); + assert_eq!(hdr.dst_cid, 3); + } + + #[test] + fn test_drain_tx_respects_max_payload() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.enqueue_tx(&[0xBB; 1000]); + + let (hdr, data) = conn.drain_tx(500).unwrap(); + assert_eq!(data.len(), 500); + assert_eq!(hdr.len, 500); + + // Remaining data still in buffer. + assert_eq!(conn.tx_buf_len(), 500); + } + + #[test] + fn test_drain_tx_empty_returns_none() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + assert!(conn.drain_tx(1024).is_none()); + } + + #[test] + fn test_drain_tx_updates_tx_cnt() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.enqueue_tx(b"12345"); + conn.drain_tx(1024); + assert_eq!(conn.tx_cnt(), 5); + } + + // --- Credit request --- + + #[test] + fn test_credit_request_sets_flag() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + assert!(!conn.needs_credit_update()); + conn.handle_credit_request(); + assert!(conn.needs_credit_update()); + } + + #[test] + fn test_clear_credit_update() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.handle_credit_request(); + assert!(conn.needs_credit_update()); + conn.clear_credit_update(); + assert!(!conn.needs_credit_update()); + } + + // --- Dispatch --- + + #[test] + fn test_dispatch_request() { + let mut conn = guest_conn(); + let hdr = make_request_hdr(); + let (resp, data) = conn.dispatch(&hdr, &[]); + assert!(resp.is_some()); + assert_eq!(resp.unwrap().op, VSOCK_OP_RESPONSE); + assert!(data.is_none()); + assert_eq!(conn.state(), ConnState::Connected); + } + + #[test] + fn test_dispatch_rw() { + let mut conn = guest_conn(); + conn.dispatch(&make_request_hdr(), &[]); + + let rw_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 5, + type_: 1, + op: VSOCK_OP_RW, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + let (_, data) = conn.dispatch(&rw_hdr, b"hello"); + assert_eq!(data.as_deref(), Some(b"hello".as_slice())); + } + + #[test] + fn test_dispatch_credit_request_sends_update() { + let mut conn = guest_conn(); + conn.dispatch(&make_request_hdr(), &[]); + + let cr_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_CREDIT_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + let (resp, _) = conn.dispatch(&cr_hdr, &[]); + assert!(resp.is_some()); + assert_eq!(resp.unwrap().op, VSOCK_OP_CREDIT_UPDATE); + } + + #[test] + fn test_dispatch_unknown_op_sends_rst() { + let mut conn = guest_conn(); + conn.dispatch(&make_request_hdr(), &[]); + + let bad_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: 99, + flags: 0, + buf_alloc: 0, + fwd_cnt: 0, + }; + let (resp, _) = conn.dispatch(&bad_hdr, &[]); + assert!(resp.is_some()); + assert_eq!(resp.unwrap().op, VSOCK_OP_RST); + } + + #[test] + fn test_dispatch_request_on_connected_sends_rst() { + let mut conn = guest_conn(); + conn.dispatch(&make_request_hdr(), &[]); + // Second REQUEST while connected. + let (resp, _) = conn.dispatch(&make_request_hdr(), &[]); + assert!(resp.is_some()); + assert_eq!(resp.unwrap().op, VSOCK_OP_RST); + } + + // --- Make helpers --- + + #[test] + fn test_make_credit_update() { + let mut conn = guest_conn(); + conn.handle_request(&make_request_hdr()); + conn.handle_rw(b"hello"); // fwd_cnt = 5 + let hdr = conn.make_credit_update(); + assert_eq!(hdr.op, VSOCK_OP_CREDIT_UPDATE); + assert_eq!(hdr.fwd_cnt, 5); + assert_eq!(hdr.buf_alloc, DEFAULT_BUF_ALLOC); + } + + #[test] + fn test_make_rst() { + let conn = guest_conn(); + let hdr = conn.make_rst(); + assert_eq!(hdr.op, VSOCK_OP_RST); + assert_eq!(hdr.src_cid, 2); + assert_eq!(hdr.dst_cid, 3); + } +} diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs new file mode 100644 index 000000000..57ec13dc7 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -0,0 +1,1238 @@ +//! Virtio-vsock device backend (virtio spec v1.2 Section 5.10). +//! +//! Provides a socket transport between guest (AF_VSOCK) and host (TCP). +//! The host side uses non-blocking TCP listeners on localhost for +//! cross-platform compatibility (Windows + macOS + Linux). +//! +//! Queue layout: +//! Queue 0 (RX): host -> guest (device writes, guest reads) +//! Queue 1 (TX): guest -> host (guest writes, device reads) +//! Queue 2 (Event): device events (not used currently) + +pub mod connection; +pub mod packet; + +use std::collections::HashMap; +use std::io::{self, Read, Write}; +use std::net::{TcpListener, TcpStream}; + +use super::super::mmio::VirtioDeviceBackend; +use super::super::queue::{GuestMemoryAccessor, Virtqueue}; +use super::connection::{ConnState, VsockConnection}; +use super::packet::{ + VsockHeader, VSOCK_CID_HOST, VSOCK_HEADER_SIZE, VSOCK_OP_REQUEST, +}; + +/// Virtio device ID for vsock (spec Section 5.10). +const VIRTIO_VSOCK_ID: u32 = 19; + +/// VIRTIO_F_VERSION_1 — bit 32 (page 1, bit 0). +const VIRTIO_F_VERSION_1_BIT: u32 = 0; + +/// Number of queues: RX, TX, Event. +const NUM_QUEUES: usize = 3; + +/// Queue index constants. +const RX_QUEUE: usize = 0; +const TX_QUEUE: usize = 1; +// const EVENT_QUEUE: usize = 2; // Not used yet. + +/// Maximum queue size. +const QUEUE_MAX_SIZE: u16 = 128; + +/// Connection key: (guest_port, host_port). +type ConnKey = (u32, u32); + +/// Virtio-vsock device with TCP host-side bridge. +pub struct VirtioVsock { + /// Guest CID (typically 3 for the first guest). + guest_cid: u64, + /// Active connections keyed by (guest_port, host_port). + connections: HashMap, + /// TCP listeners on the host side, keyed by host port. + listeners: HashMap, + /// Accepted TCP streams, keyed by (guest_port, host_port). + streams: HashMap, + /// Pending response/control packets to inject into the RX queue. + rx_pending: Vec<(VsockHeader, Vec)>, +} + +impl VirtioVsock { + /// Create a new vsock device with the given guest CID. + pub fn new(guest_cid: u64) -> Self { + VirtioVsock { + guest_cid, + connections: HashMap::new(), + listeners: HashMap::new(), + streams: HashMap::new(), + rx_pending: Vec::new(), + } + } + + /// Register a TCP listener on `127.0.0.1:port` for the given host port. + /// + /// When a guest connects to this port via AF_VSOCK, the connection + /// is bridged to an accepted TCP client on this listener. + pub fn listen(&mut self, port: u32) -> io::Result<()> { + self.listen_on(port, port as u16) + } + + /// Register a TCP listener on `127.0.0.1:host_port` for the given vsock port. + /// + /// The guest connects to `vsock_port` via AF_VSOCK, and the bridge listens + /// on `host_port` on the host side. This allows multiple VMs to use + /// distinct host ports for the same guest vsock port number. + pub fn listen_on(&mut self, vsock_port: u32, host_port: u16) -> io::Result<()> { + let listener = TcpListener::bind(format!("127.0.0.1:{}", host_port))?; + listener.set_nonblocking(true)?; + self.listeners.insert(vsock_port, listener); + Ok(()) + } + + /// Get the guest CID. + pub fn guest_cid(&self) -> u64 { + self.guest_cid + } + + /// Number of active connections. + pub fn connection_count(&self) -> usize { + self.connections.len() + } + + /// Process the TX queue: read packets from guest, dispatch them. + fn process_tx(&mut self, queue: &mut Virtqueue, mem: &dyn GuestMemoryAccessor) -> bool { + let mut processed = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + }; + + if chain.is_empty() { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + + // First descriptor: vsock header (device-readable). + let hdr_desc = &chain[0]; + if (hdr_desc.len as usize) < VSOCK_HEADER_SIZE { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + + let hdr = match VsockHeader::read_from(mem, hdr_desc.addr) { + Ok(h) => h, + Err(_) => { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + }; + + // Read payload from subsequent descriptors. + let mut payload = Vec::new(); + for desc in &chain[1..] { + if !desc.is_write() { + // Device-readable = payload data from guest. + let mut buf = vec![0u8; desc.len as usize]; + if mem.read_at(desc.addr, &mut buf).is_ok() { + payload.extend_from_slice(&buf); + } + } + } + + self.handle_guest_packet(&hdr, &payload); + + let _ = queue.add_used(head, 0, mem); + processed = true; + } + + processed + } + + /// Handle a packet from the guest. + fn handle_guest_packet(&mut self, hdr: &VsockHeader, payload: &[u8]) { + let key = (hdr.src_port, hdr.dst_port); + + if hdr.op == VSOCK_OP_REQUEST { + self.handle_connect_request(hdr); + return; + } + + if let Some(conn) = self.connections.get_mut(&key) { + let (resp_hdr, fwd_data) = conn.dispatch(hdr, payload); + + // Forward data to host TCP socket. + if let Some(data) = fwd_data { + if let Some(stream) = self.streams.get_mut(&key) { + let _ = stream.write_all(&data); + } + } + + // Queue response packet (if any) for RX injection. + if let Some(r) = resp_hdr { + self.rx_pending.push((r, Vec::new())); + } + + // Clean up closed connections. + if conn.state() == ConnState::Closed { + self.connections.remove(&key); + self.streams.remove(&key); + } + } else { + // No connection for this port pair -> RST. + let rst = + VsockHeader::new_rst(VSOCK_CID_HOST, hdr.dst_port, self.guest_cid, hdr.src_port); + self.rx_pending.push((rst, Vec::new())); + } + } + + /// Handle a guest CONNECTION REQUEST. + fn handle_connect_request(&mut self, hdr: &VsockHeader) { + let key = (hdr.src_port, hdr.dst_port); + + // Check if we have a listener on the requested host port. + let has_listener = self.listeners.contains_key(&hdr.dst_port); + + if !has_listener { + // No listener -> RST. + let rst = + VsockHeader::new_rst(VSOCK_CID_HOST, hdr.dst_port, self.guest_cid, hdr.src_port); + self.rx_pending.push((rst, Vec::new())); + return; + } + + // Try to accept a pending TCP connection on this listener. + let stream = if let Some(listener) = self.listeners.get(&hdr.dst_port) { + match listener.accept() { + Ok((stream, _addr)) => { + let _ = stream.set_nonblocking(true); + Some(stream) + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { + // No pending TCP connection yet — still accept the vsock connection. + // Data will buffer until a TCP client connects. + None + } + Err(_) => { + let rst = VsockHeader::new_rst( + VSOCK_CID_HOST, + hdr.dst_port, + self.guest_cid, + hdr.src_port, + ); + self.rx_pending.push((rst, Vec::new())); + return; + } + } + } else { + None + }; + + // Create and register the connection. + let mut conn = + VsockConnection::new(VSOCK_CID_HOST, hdr.dst_port, self.guest_cid, hdr.src_port); + + if let Some(resp) = conn.handle_request(hdr) { + self.rx_pending.push((resp, Vec::new())); + self.connections.insert(key, conn); + if let Some(s) = stream { + self.streams.insert(key, s); + } + } else { + let rst = + VsockHeader::new_rst(VSOCK_CID_HOST, hdr.dst_port, self.guest_cid, hdr.src_port); + self.rx_pending.push((rst, Vec::new())); + } + } + + /// Poll TCP streams for incoming data and queue it for RX injection. + fn poll_tcp_streams(&mut self) { + // Collect keys first to avoid borrow issues. + let keys: Vec = self.streams.keys().copied().collect(); + + for key in keys { + let mut buf = [0u8; 4096]; + let data = if let Some(stream) = self.streams.get_mut(&key) { + match stream.read(&mut buf) { + Ok(0) => { + // TCP connection closed. Send SHUTDOWN to guest. + if let Some(conn) = self.connections.get(&key) { + let hdr = VsockHeader::new_shutdown( + conn.local_cid, + conn.local_port, + conn.peer_cid, + conn.peer_port, + packet::VSOCK_SHUTDOWN_SEND | packet::VSOCK_SHUTDOWN_RECV, + ); + self.rx_pending.push((hdr, Vec::new())); + } + self.streams.remove(&key); + if let Some(conn) = self.connections.get_mut(&key) { + conn.handle_shutdown( + packet::VSOCK_SHUTDOWN_SEND | packet::VSOCK_SHUTDOWN_RECV, + ); + } + continue; + } + Ok(n) => Some(buf[..n].to_vec()), + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => None, + Err(_) => { + // I/O error on TCP stream. RST the vsock connection. + if let Some(conn) = self.connections.get(&key) { + let rst = conn.make_rst(); + self.rx_pending.push((rst, Vec::new())); + } + self.streams.remove(&key); + self.connections.remove(&key); + continue; + } + } + } else { + continue; + }; + + // Enqueue data from TCP into the connection's TX buffer. + if let Some(data) = data { + if let Some(conn) = self.connections.get_mut(&key) { + conn.enqueue_tx(&data); + } + } + } + } + + /// Inject pending packets into the RX queue. + fn inject_rx(&mut self, rx_queue: &mut Virtqueue, mem: &dyn GuestMemoryAccessor) -> bool { + let mut injected = false; + + // First: drain connection TX buffers into rx_pending. + let keys: Vec = self.connections.keys().copied().collect(); + for key in keys { + if let Some(conn) = self.connections.get_mut(&key) { + // Also check for credit updates. + if conn.needs_credit_update() { + let hdr = conn.make_credit_update(); + conn.clear_credit_update(); + self.rx_pending.push((hdr, Vec::new())); + } + + // Drain TX data. + while let Some((hdr, data)) = conn.drain_tx(4096) { + self.rx_pending.push((hdr, data)); + } + } + } + + // Inject all pending packets. + while !self.rx_pending.is_empty() { + let head = match rx_queue.pop_avail(mem) { + Ok(Some(h)) => h, + _ => break, // No available RX buffers. + }; + + let chain = match rx_queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + let _ = rx_queue.add_used(head, 0, mem); + injected = true; + continue; + } + }; + + let (hdr, payload) = self.rx_pending.remove(0); + + // Write header + payload to device-writable descriptors. + let total_data = hdr + .to_bytes() + .to_vec() + .into_iter() + .chain(payload.into_iter()) + .collect::>(); + + let mut offset = 0; + let mut total_written = 0u32; + for desc in &chain { + if !desc.is_write() { + continue; + } + let remaining = total_data.len().saturating_sub(offset); + let to_write = remaining.min(desc.len as usize); + if to_write > 0 { + let _ = mem.write_at(desc.addr, &total_data[offset..offset + to_write]); + offset += to_write; + total_written += to_write as u32; + } + } + + let _ = rx_queue.add_used(head, total_written, mem); + injected = true; + } + + injected + } +} + +impl VirtioDeviceBackend for VirtioVsock { + fn device_id(&self) -> u32 { + VIRTIO_VSOCK_ID + } + + fn device_features(&self, page: u32) -> u32 { + match page { + 1 => 1 << VIRTIO_F_VERSION_1_BIT, + _ => 0, + } + } + + fn read_config(&self, offset: u64) -> u32 { + // Config space: guest_cid (u64 at offset 0). + match offset { + 0 => self.guest_cid as u32, + 4 => (self.guest_cid >> 32) as u32, + _ => 0, + } + } + + fn queue_notify( + &mut self, + queue_idx: u32, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + match queue_idx as usize { + TX_QUEUE => self.process_tx(queue, mem), + _ => false, + } + } + + fn num_queues(&self) -> usize { + NUM_QUEUES + } + + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + QUEUE_MAX_SIZE + } + + fn poll(&mut self, queues: &mut [Virtqueue], mem: &dyn GuestMemoryAccessor) -> bool { + // Poll TCP streams for incoming data. + self.poll_tcp_streams(); + + // Inject any pending data into the RX queue. + if queues.len() > RX_QUEUE { + self.inject_rx(&mut queues[RX_QUEUE], mem) + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::super::queue::Virtqueue; + use super::packet::VSOCK_OP_RST; + use super::super::super::super::error::Result; + use std::cell::RefCell; + + struct MockMem { + data: RefCell>, + } + + impl MockMem { + fn new(size: usize) -> Self { + MockMem { + data: RefCell::new(vec![0u8; size]), + } + } + + fn write_bytes(&self, addr: u64, bytes: &[u8]) { + let a = addr as usize; + let mut data = self.data.borrow_mut(); + data[a..a + bytes.len()].copy_from_slice(bytes); + } + + fn read_bytes(&self, addr: u64, len: usize) -> Vec { + let a = addr as usize; + let data = self.data.borrow(); + data[a..a + len].to_vec() + } + + fn write_u16_at(&self, addr: u64, val: u16) { + self.write_bytes(addr, &val.to_le_bytes()); + } + + fn write_u32_at(&self, addr: u64, val: u32) { + self.write_bytes(addr, &val.to_le_bytes()); + } + + fn write_u64_at(&self, addr: u64, val: u64) { + self.write_bytes(addr, &val.to_le_bytes()); + } + } + + impl GuestMemoryAccessor for MockMem { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { + let a = addr as usize; + let data = self.data.borrow(); + if a + buf.len() > data.len() { + return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + } + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { + let a = addr as usize; + let mut mem = self.data.borrow_mut(); + if a + data.len() > mem.len() { + return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + } + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + // Memory layout for tests: + // DESC_TABLE at 0x0000 (128 entries * 16 bytes = 2048) + // AVAIL_RING at 0x0800 + // USED_RING at 0x1000 + // BUFFERS at 0x2000+ + const DESC_TABLE: u64 = 0x0000; + const DESC_SIZE: u64 = 16; + const AVAIL_RING: u64 = 0x0800; + const USED_RING: u64 = 0x1000; + const BUF_BASE: u64 = 0x2000; + + fn setup_queue(max_size: u16) -> Virtqueue { + let mut q = Virtqueue::new(max_size); + q.set_size(max_size); + q.set_desc_table(DESC_TABLE); + q.set_avail_ring(AVAIL_RING); + q.set_used_ring(USED_RING); + q.set_ready(true); + q + } + + fn write_descriptor(mem: &MockMem, index: u16, addr: u64, len: u32, flags: u16, next: u16) { + let base = DESC_TABLE + index as u64 * DESC_SIZE; + mem.write_u64_at(base, addr); + mem.write_u32_at(base + 8, len); + mem.write_u16_at(base + 12, flags); + mem.write_u16_at(base + 14, next); + } + + fn push_avail(mem: &MockMem, ring_idx: u16, desc_head: u16) { + let entry_off = AVAIL_RING + 4 + (ring_idx as u64) * 2; + mem.write_u16_at(entry_off, desc_head); + mem.write_u16_at(AVAIL_RING + 2, ring_idx + 1); + } + + // --- Device identity --- + + #[test] + fn test_device_id() { + let dev = VirtioVsock::new(3); + assert_eq!(dev.device_id(), 19); + } + + #[test] + fn test_num_queues() { + let dev = VirtioVsock::new(3); + assert_eq!(dev.num_queues(), 3); + } + + #[test] + fn test_queue_max_size() { + let dev = VirtioVsock::new(3); + assert_eq!(dev.queue_max_size(0), 128); + assert_eq!(dev.queue_max_size(1), 128); + assert_eq!(dev.queue_max_size(2), 128); + } + + #[test] + fn test_version_1_feature() { + let dev = VirtioVsock::new(3); + assert_eq!(dev.device_features(0), 0); + assert_eq!(dev.device_features(1), 1); // VIRTIO_F_VERSION_1 + } + + // --- Config space --- + + #[test] + fn test_config_guest_cid() { + let dev = VirtioVsock::new(3); + assert_eq!(dev.read_config(0), 3); // Low 32 bits. + assert_eq!(dev.read_config(4), 0); // High 32 bits. + } + + #[test] + fn test_config_large_cid() { + let dev = VirtioVsock::new(0x1_0000_0003); + assert_eq!(dev.read_config(0), 3); + assert_eq!(dev.read_config(4), 1); + } + + // --- TX queue: REQUEST handling --- + + #[test] + fn test_tx_request_no_listener_sends_rst() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + // Write a REQUEST header to guest memory. + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &hdr.to_bytes()); + + // Set up descriptor: header only. + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); + + // Should have a RST pending in rx_pending. + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, VSOCK_OP_RST); + } + + #[test] + fn test_tx_request_with_listener_sends_response() { + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); // Port 0 = OS-assigned. + // Get the actual port. + let port = dev + .listeners + .values() + .next() + .unwrap() + .local_addr() + .unwrap() + .port() as u32; + // Re-register with correct port. + let listener = dev.listeners.remove(&0).unwrap(); + dev.listeners.insert(port, listener); + + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: port, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + + // Should have a RESPONSE pending. + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, packet::VSOCK_OP_RESPONSE); + assert_eq!(dev.connection_count(), 1); + } + + // --- TX queue: RW handling --- + + #[test] + fn test_tx_rw_forwards_data() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + // Establish connection directly. + let req_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + dev.handle_guest_packet(&req_hdr, &[]); + // The REQUEST without a listener sends RST, so let's set up directly. + dev.rx_pending.clear(); + + // Manually create a connected state. + let mut conn = VsockConnection::new(VSOCK_CID_HOST, 2695, 3, 5000); + conn.handle_request(&req_hdr); + dev.connections.insert((5000, 2695), conn); + + // Now send an RW packet. + let rw_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 5, + type_: 1, + op: packet::VSOCK_OP_RW, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &rw_hdr.to_bytes()); + mem.write_bytes(BUF_BASE + VSOCK_HEADER_SIZE as u64, b"hello"); + + // Two descriptors: header (readable) + payload (readable). + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 1, 1); // NEXT + write_descriptor(&mem, 1, BUF_BASE + VSOCK_HEADER_SIZE as u64, 5, 0, 0); + push_avail(&mem, 0, 0); + + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); + + // Data was forwarded (no stream connected, so just the connection absorbed it). + let conn = dev.connections.get(&(5000, 2695)).unwrap(); + assert_eq!(conn.fwd_cnt(), 5); + } + + // --- TX queue: SHUTDOWN handling --- + + #[test] + fn test_tx_shutdown_closes_connection() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + // Set up a connected connection. + let req_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + let mut conn = VsockConnection::new(VSOCK_CID_HOST, 2695, 3, 5000); + conn.handle_request(&req_hdr); + dev.connections.insert((5000, 2695), conn); + + // Send SHUTDOWN with both flags. + let shut_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: packet::VSOCK_OP_SHUTDOWN, + flags: packet::VSOCK_SHUTDOWN_SEND | packet::VSOCK_SHUTDOWN_RECV, + buf_alloc: 0, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &shut_hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + + // Connection should be removed. + assert_eq!(dev.connection_count(), 0); + } + + // --- TX queue: RST for unknown connection --- + + #[test] + fn test_tx_rw_to_unknown_port_sends_rst() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + let rw_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 9999, + dst_port: 8888, + len: 0, + type_: 1, + op: packet::VSOCK_OP_RW, + flags: 0, + buf_alloc: 0, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &rw_hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, VSOCK_OP_RST); + } + + // --- RX queue: inject pending --- + + #[test] + fn test_inject_rx_writes_header_to_queue() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut rx_queue = setup_queue(128); + + // Set up an RX buffer (device-writable). + write_descriptor(&mem, 0, BUF_BASE, 256, 2, 0); // WRITE flag = 2 + push_avail(&mem, 0, 0); + + // Queue a RESPONSE packet. + let resp = VsockHeader::new_response(2, 2695, 3, 5000, 65536, 0); + dev.rx_pending.push((resp, Vec::new())); + + let injected = dev.inject_rx(&mut rx_queue, &mem); + assert!(injected); + + // Read back the header from guest memory. + let written = mem.read_bytes(BUF_BASE, VSOCK_HEADER_SIZE); + let read_hdr = VsockHeader::from_bytes(&written.try_into().unwrap()); + assert_eq!(read_hdr.op, packet::VSOCK_OP_RESPONSE); + assert_eq!(read_hdr.src_cid, 2); + assert_eq!(read_hdr.dst_cid, 3); + } + + #[test] + fn test_inject_rx_with_payload() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut rx_queue = setup_queue(128); + + // RX buffer: 256 bytes device-writable. + write_descriptor(&mem, 0, BUF_BASE, 256, 2, 0); + push_avail(&mem, 0, 0); + + let rw = VsockHeader::new_rw(2, 2695, 3, 5000, 5, 65536, 0); + dev.rx_pending.push((rw, b"hello".to_vec())); + + dev.inject_rx(&mut rx_queue, &mem); + + // Check header. + let hdr_bytes = mem.read_bytes(BUF_BASE, VSOCK_HEADER_SIZE); + let hdr = VsockHeader::from_bytes(&hdr_bytes.try_into().unwrap()); + assert_eq!(hdr.op, packet::VSOCK_OP_RW); + assert_eq!(hdr.len, 5); + + // Check payload follows header. + let payload = mem.read_bytes(BUF_BASE + VSOCK_HEADER_SIZE as u64, 5); + assert_eq!(payload, b"hello"); + } + + #[test] + fn test_inject_rx_no_available_buffers() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut rx_queue = setup_queue(128); + // Don't push any available buffers. + + let resp = VsockHeader::new_response(2, 2695, 3, 5000, 65536, 0); + dev.rx_pending.push((resp, Vec::new())); + + let injected = dev.inject_rx(&mut rx_queue, &mem); + assert!(!injected); + + // Packet should still be pending. + assert_eq!(dev.rx_pending.len(), 1); + } + + // --- Poll default --- + + #[test] + fn test_poll_no_streams_no_pending() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut queues = vec![ + setup_queue(128), // RX + setup_queue(128), // TX + setup_queue(128), // Event + ]; + + let raised = dev.poll(&mut queues, &mem); + assert!(!raised); + } + + // --- Connection lifecycle through TX + RX --- + + #[test] + fn test_connection_lifecycle() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + + // Manually create connection to test data flow without TCP. + let req_hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + let mut conn = VsockConnection::new(VSOCK_CID_HOST, 2695, 3, 5000); + conn.handle_request(&req_hdr); + dev.connections.insert((5000, 2695), conn); + + // Enqueue some host->guest data. + dev.connections + .get_mut(&(5000, 2695)) + .unwrap() + .enqueue_tx(b"response data"); + + // Set up RX buffer. + let rx_buf = BUF_BASE + 0x2000; + write_descriptor(&mem, 0, rx_buf, 256, 2, 0); // WRITE + push_avail(&mem, 0, 0); + + let mut rx_queue = setup_queue(128); + let injected = dev.inject_rx(&mut rx_queue, &mem); + assert!(injected); + + // Verify the injected RW packet. + let hdr_bytes = mem.read_bytes(rx_buf, VSOCK_HEADER_SIZE); + let hdr = VsockHeader::from_bytes(&hdr_bytes.try_into().unwrap()); + assert_eq!(hdr.op, packet::VSOCK_OP_RW); + assert_eq!(hdr.len, 13); + + let payload = mem.read_bytes(rx_buf + VSOCK_HEADER_SIZE as u64, 13); + assert_eq!(payload, b"response data"); + } + + // --- Multiple connections --- + + #[test] + fn test_multiple_connections() { + let mut dev = VirtioVsock::new(3); + + let req1 = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + let req2 = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5001, + dst_port: 2696, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + + let mut c1 = VsockConnection::new(VSOCK_CID_HOST, 2695, 3, 5000); + c1.handle_request(&req1); + let mut c2 = VsockConnection::new(VSOCK_CID_HOST, 2696, 3, 5001); + c2.handle_request(&req2); + + dev.connections.insert((5000, 2695), c1); + dev.connections.insert((5001, 2696), c2); + + assert_eq!(dev.connection_count(), 2); + } + + // --- Short descriptor chain --- + + #[test] + fn test_tx_short_header_skipped() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + // Descriptor with only 10 bytes (< 44 byte header). + write_descriptor(&mem, 0, BUF_BASE, 10, 0, 0); + push_avail(&mem, 0, 0); + + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); // Processed (skipped) the entry. + assert!(dev.rx_pending.is_empty()); // No response generated. + } + + // --- Empty chain --- + + #[test] + fn test_tx_empty_chain_skipped() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + // Descriptor with 0 length. + write_descriptor(&mem, 0, BUF_BASE, 0, 0, 0); + push_avail(&mem, 0, 0); + + let processed = dev.process_tx(&mut tx_queue, &mem); + assert!(processed); + } + + // --- Credit update flow --- + + #[test] + fn test_credit_update_injected() { + let mut dev = VirtioVsock::new(3); + let mem = MockMem::new(0x10000); + + let req = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + let mut conn = VsockConnection::new(VSOCK_CID_HOST, 2695, 3, 5000); + conn.handle_request(&req); + conn.handle_credit_request(); + dev.connections.insert((5000, 2695), conn); + + // RX buffer. + let rx_buf = BUF_BASE + 0x2000; + write_descriptor(&mem, 0, rx_buf, 256, 2, 0); + push_avail(&mem, 0, 0); + + let mut rx_queue = setup_queue(128); + let injected = dev.inject_rx(&mut rx_queue, &mem); + assert!(injected); + + let hdr_bytes = mem.read_bytes(rx_buf, VSOCK_HEADER_SIZE); + let hdr = VsockHeader::from_bytes(&hdr_bytes.try_into().unwrap()); + assert_eq!(hdr.op, packet::VSOCK_OP_CREDIT_UPDATE); + } + + // --- Listen and connect with TCP --- + + #[test] + fn test_listen_creates_listener() { + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); // Port 0 = OS-assigned. + assert_eq!(dev.listeners.len(), 1); + } + + #[test] + fn test_listen_on_different_host_port() { + let mut dev = VirtioVsock::new(3); + // vsock port 2695, host TCP port 0 (OS-assigned) + dev.listen_on(2695, 0).unwrap(); + assert_eq!(dev.listeners.len(), 1); + // Listener is keyed by vsock port, not host port + assert!(dev.listeners.contains_key(&2695)); + // The actual TCP port may differ from the vsock port + let actual_port = dev + .listeners + .get(&2695) + .unwrap() + .local_addr() + .unwrap() + .port(); + assert_ne!(actual_port, 2695); // OS assigned a different port + } + + #[test] + fn test_listen_on_two_vsock_ports_different_host_ports() { + let mut dev = VirtioVsock::new(3); + dev.listen_on(2695, 0).unwrap(); + dev.listen_on(2696, 0).unwrap(); + assert_eq!(dev.listeners.len(), 2); + // Each vsock port has its own listener + let port1 = dev + .listeners + .get(&2695) + .unwrap() + .local_addr() + .unwrap() + .port(); + let port2 = dev + .listeners + .get(&2696) + .unwrap() + .local_addr() + .unwrap() + .port(); + assert_ne!(port1, port2); + } + + #[test] + fn test_listen_with_tcp_connect() { + use std::net::TcpStream; + + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); + let port = dev + .listeners + .values() + .next() + .unwrap() + .local_addr() + .unwrap() + .port() as u32; + // Re-register with actual port. + let listener = dev.listeners.remove(&0).unwrap(); + dev.listeners.insert(port, listener); + + // Connect a TCP client before the guest sends REQUEST. + let _client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + // Brief delay for the accept backlog to propagate. + std::thread::sleep(std::time::Duration::from_millis(50)); + + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: port, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + + // Should have RESPONSE and a TCP stream. + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, packet::VSOCK_OP_RESPONSE); + assert_eq!(dev.connection_count(), 1); + assert_eq!(dev.streams.len(), 1); + } + + // --- Poll with TCP data --- + + #[test] + fn test_poll_reads_tcp_data() { + use std::io::Write as IoWrite; + use std::net::TcpStream; + + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); + let port = dev + .listeners + .values() + .next() + .unwrap() + .local_addr() + .unwrap() + .port() as u32; + let listener = dev.listeners.remove(&0).unwrap(); + dev.listeners.insert(port, listener); + + // Connect TCP client. + let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + + // Establish vsock connection. + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: port, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + dev.process_tx(&mut tx_queue, &mem); + dev.rx_pending.clear(); + + // Send data from TCP client to be picked up by poll. + client.write_all(b"tcp data").unwrap(); + client.flush().unwrap(); + + // Small delay to allow TCP data to arrive. + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Poll should read TCP data and queue it. + let mut queues = vec![ + setup_queue(128), // RX + setup_queue(128), // TX + setup_queue(128), // Event + ]; + + // Set up an RX buffer. + let rx_buf = BUF_BASE + 0x4000; + // Use separate addresses for the RX queue to avoid overlap. + let rx_desc = 0x8000u64; + let rx_avail = 0x8800u64; + let rx_used = 0x9000u64; + queues[0].set_desc_table(rx_desc); + queues[0].set_avail_ring(rx_avail); + queues[0].set_used_ring(rx_used); + + // Write descriptor for RX. + mem.write_u64_at(rx_desc, rx_buf); + mem.write_u32_at(rx_desc + 8, 256); + mem.write_u16_at(rx_desc + 12, 2); // WRITE + mem.write_u16_at(rx_desc + 14, 0); + // Push to avail ring. + mem.write_u16_at(rx_avail + 4, 0); // ring[0] = desc 0 + mem.write_u16_at(rx_avail + 2, 1); // avail idx = 1 + + let raised = dev.poll(&mut queues, &mem); + assert!(raised); + + // Check that data was injected. + let hdr_bytes = mem.read_bytes(rx_buf, VSOCK_HEADER_SIZE); + let rx_hdr = VsockHeader::from_bytes(&hdr_bytes.try_into().unwrap()); + assert_eq!(rx_hdr.op, packet::VSOCK_OP_RW); + assert_eq!(rx_hdr.len, 8); + + let payload = mem.read_bytes(rx_buf + VSOCK_HEADER_SIZE as u64, 8); + assert_eq!(payload, b"tcp data"); + } +} diff --git a/src/vmm/src/windows/devices/virtio/vsock/packet.rs b/src/vmm/src/windows/devices/virtio/vsock/packet.rs new file mode 100644 index 000000000..87717036a --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/vsock/packet.rs @@ -0,0 +1,449 @@ +//! Virtio-vsock packet header (virtio spec v1.2 Section 5.10.6). +//! +//! The 44-byte header is prepended to every vsock packet in the +//! TX and RX virtqueues. It carries addressing, flow control credits, +//! and operation codes for the vsock connection protocol. + +use super::super::queue::GuestMemoryAccessor; +use super::super::super::super::error::{Result, WkrunError}; + +// --- CID constants --- + +/// Well-known CID for the host (hypervisor). +pub const VSOCK_CID_HOST: u64 = 2; + +// --- Vsock type --- + +/// Stream transport (SOCK_STREAM equivalent). +pub const VIRTIO_VSOCK_TYPE_STREAM: u16 = 1; + +// --- Vsock operations (spec 5.10.6.6) --- + +/// Invalid operation. +pub const VSOCK_OP_INVALID: u16 = 0; +/// Connection request (guest -> host). +pub const VSOCK_OP_REQUEST: u16 = 1; +/// Connection accepted (host -> guest). +pub const VSOCK_OP_RESPONSE: u16 = 2; +/// Connection reset / refused. +pub const VSOCK_OP_RST: u16 = 3; +/// Graceful shutdown. +pub const VSOCK_OP_SHUTDOWN: u16 = 4; +/// Data transfer. +pub const VSOCK_OP_RW: u16 = 5; +/// Credit update (no payload). +pub const VSOCK_OP_CREDIT_UPDATE: u16 = 6; +/// Credit request (ask peer to send credit update). +pub const VSOCK_OP_CREDIT_REQUEST: u16 = 7; + +// --- Shutdown flags --- + +/// Shutdown flag: no more data to send. +pub const VSOCK_SHUTDOWN_SEND: u32 = 1; +/// Shutdown flag: no more data to receive. +pub const VSOCK_SHUTDOWN_RECV: u32 = 2; + +/// Size of the vsock packet header in bytes. +pub const VSOCK_HEADER_SIZE: usize = 44; + +/// Virtio-vsock packet header (44 bytes, little-endian). +/// +/// Layout (spec 5.10.6): +/// offset 0: src_cid (u64) +/// offset 8: dst_cid (u64) +/// offset 16: src_port (u32) +/// offset 20: dst_port (u32) +/// offset 24: len (u32) - payload length +/// offset 28: type_ (u16) - VIRTIO_VSOCK_TYPE_STREAM +/// offset 30: op (u16) - operation code +/// offset 32: flags (u32) - operation-specific flags +/// offset 36: buf_alloc (u32) - credit: total buffer space +/// offset 40: fwd_cnt (u32) - credit: bytes consumed so far +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct VsockHeader { + pub src_cid: u64, + pub dst_cid: u64, + pub src_port: u32, + pub dst_port: u32, + pub len: u32, + pub type_: u16, + pub op: u16, + pub flags: u32, + pub buf_alloc: u32, + pub fwd_cnt: u32, +} + +impl VsockHeader { + /// Read a vsock header from guest memory at the given address. + pub fn read_from(mem: &dyn GuestMemoryAccessor, addr: u64) -> Result { + let mut buf = [0u8; VSOCK_HEADER_SIZE]; + mem.read_at(addr, &mut buf)?; + Ok(Self::from_bytes(&buf)) + } + + /// Write this vsock header to guest memory at the given address. + pub fn write_to(&self, mem: &dyn GuestMemoryAccessor, addr: u64) -> Result<()> { + let buf = self.to_bytes(); + mem.write_at(addr, &buf) + } + + /// Parse a vsock header from a 44-byte buffer. + pub fn from_bytes(buf: &[u8; VSOCK_HEADER_SIZE]) -> Self { + VsockHeader { + src_cid: u64::from_le_bytes(buf[0..8].try_into().unwrap()), + dst_cid: u64::from_le_bytes(buf[8..16].try_into().unwrap()), + src_port: u32::from_le_bytes(buf[16..20].try_into().unwrap()), + dst_port: u32::from_le_bytes(buf[20..24].try_into().unwrap()), + len: u32::from_le_bytes(buf[24..28].try_into().unwrap()), + type_: u16::from_le_bytes(buf[28..30].try_into().unwrap()), + op: u16::from_le_bytes(buf[30..32].try_into().unwrap()), + flags: u32::from_le_bytes(buf[32..36].try_into().unwrap()), + buf_alloc: u32::from_le_bytes(buf[36..40].try_into().unwrap()), + fwd_cnt: u32::from_le_bytes(buf[40..44].try_into().unwrap()), + } + } + + /// Serialize this header to a 44-byte buffer. + pub fn to_bytes(&self) -> [u8; VSOCK_HEADER_SIZE] { + let mut buf = [0u8; VSOCK_HEADER_SIZE]; + buf[0..8].copy_from_slice(&self.src_cid.to_le_bytes()); + buf[8..16].copy_from_slice(&self.dst_cid.to_le_bytes()); + buf[16..20].copy_from_slice(&self.src_port.to_le_bytes()); + buf[20..24].copy_from_slice(&self.dst_port.to_le_bytes()); + buf[24..28].copy_from_slice(&self.len.to_le_bytes()); + buf[28..30].copy_from_slice(&self.type_.to_le_bytes()); + buf[30..32].copy_from_slice(&self.op.to_le_bytes()); + buf[32..36].copy_from_slice(&self.flags.to_le_bytes()); + buf[36..40].copy_from_slice(&self.buf_alloc.to_le_bytes()); + buf[40..44].copy_from_slice(&self.fwd_cnt.to_le_bytes()); + buf + } + + /// Create a RESPONSE header (host -> guest) for a given REQUEST. + pub fn new_response( + src_cid: u64, + src_port: u32, + dst_cid: u64, + dst_port: u32, + buf_alloc: u32, + fwd_cnt: u32, + ) -> Self { + VsockHeader { + src_cid, + dst_cid, + src_port, + dst_port, + len: 0, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_RESPONSE, + flags: 0, + buf_alloc, + fwd_cnt, + } + } + + /// Create an RW (data) header. + pub fn new_rw( + src_cid: u64, + src_port: u32, + dst_cid: u64, + dst_port: u32, + payload_len: u32, + buf_alloc: u32, + fwd_cnt: u32, + ) -> Self { + VsockHeader { + src_cid, + dst_cid, + src_port, + dst_port, + len: payload_len, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_RW, + flags: 0, + buf_alloc, + fwd_cnt, + } + } + + /// Create a RST header. + pub fn new_rst(src_cid: u64, src_port: u32, dst_cid: u64, dst_port: u32) -> Self { + VsockHeader { + src_cid, + dst_cid, + src_port, + dst_port, + len: 0, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_RST, + flags: 0, + buf_alloc: 0, + fwd_cnt: 0, + } + } + + /// Create a SHUTDOWN header. + pub fn new_shutdown( + src_cid: u64, + src_port: u32, + dst_cid: u64, + dst_port: u32, + flags: u32, + ) -> Self { + VsockHeader { + src_cid, + dst_cid, + src_port, + dst_port, + len: 0, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_SHUTDOWN, + flags, + buf_alloc: 0, + fwd_cnt: 0, + } + } + + /// Create a CREDIT_UPDATE header. + pub fn new_credit_update( + src_cid: u64, + src_port: u32, + dst_cid: u64, + dst_port: u32, + buf_alloc: u32, + fwd_cnt: u32, + ) -> Self { + VsockHeader { + src_cid, + dst_cid, + src_port, + dst_port, + len: 0, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_CREDIT_UPDATE, + flags: 0, + buf_alloc, + fwd_cnt, + } + } + + /// Validate that this header has a known operation and stream type. + pub fn validate(&self) -> Result<()> { + if self.type_ != VIRTIO_VSOCK_TYPE_STREAM { + return Err(WkrunError::Device(format!( + "unsupported vsock type: {} (expected stream=1)", + self.type_ + ))); + } + if self.op > VSOCK_OP_CREDIT_REQUEST { + return Err(WkrunError::Device(format!( + "unknown vsock operation: {}", + self.op + ))); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use super::super::super::super::error::Result; + use std::cell::RefCell; + + struct MockMem(RefCell>); + impl MockMem { + fn new(size: usize) -> Self { + MockMem(RefCell::new(vec![0u8; size])) + } + } + impl GuestMemoryAccessor for MockMem { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { + let a = addr as usize; + let data = self.0.borrow(); + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { + let a = addr as usize; + let mut mem = self.0.borrow_mut(); + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + #[test] + fn test_header_size() { + assert_eq!(VSOCK_HEADER_SIZE, 44); + } + + #[test] + fn test_roundtrip_bytes() { + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 1234, + dst_port: 2695, + len: 100, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_RW, + flags: 0, + buf_alloc: 65536, + fwd_cnt: 512, + }; + let bytes = hdr.to_bytes(); + assert_eq!(bytes.len(), VSOCK_HEADER_SIZE); + let parsed = VsockHeader::from_bytes(&bytes); + assert_eq!(parsed, hdr); + } + + #[test] + fn test_field_offsets() { + let hdr = VsockHeader { + src_cid: 0x0102_0304_0506_0708, + dst_cid: 0x090A_0B0C_0D0E_0F10, + src_port: 0x11121314, + dst_port: 0x15161718, + len: 0x191A1B1C, + type_: 0x1D1E, + op: 0x1F20, + flags: 0x21222324, + buf_alloc: 0x25262728, + fwd_cnt: 0x292A2B2C, + }; + let buf = hdr.to_bytes(); + + // Verify each field starts at the correct offset. + assert_eq!( + u64::from_le_bytes(buf[0..8].try_into().unwrap()), + hdr.src_cid + ); + assert_eq!( + u64::from_le_bytes(buf[8..16].try_into().unwrap()), + hdr.dst_cid + ); + assert_eq!( + u32::from_le_bytes(buf[16..20].try_into().unwrap()), + hdr.src_port + ); + assert_eq!( + u32::from_le_bytes(buf[20..24].try_into().unwrap()), + hdr.dst_port + ); + assert_eq!(u32::from_le_bytes(buf[24..28].try_into().unwrap()), hdr.len); + assert_eq!( + u16::from_le_bytes(buf[28..30].try_into().unwrap()), + hdr.type_ + ); + assert_eq!(u16::from_le_bytes(buf[30..32].try_into().unwrap()), hdr.op); + assert_eq!( + u32::from_le_bytes(buf[32..36].try_into().unwrap()), + hdr.flags + ); + assert_eq!( + u32::from_le_bytes(buf[36..40].try_into().unwrap()), + hdr.buf_alloc + ); + assert_eq!( + u32::from_le_bytes(buf[40..44].try_into().unwrap()), + hdr.fwd_cnt + ); + } + + #[test] + fn test_read_write_guest_memory() { + let mem = MockMem::new(256); + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 4096, + fwd_cnt: 0, + }; + hdr.write_to(&mem, 0).unwrap(); + let read_back = VsockHeader::read_from(&mem, 0).unwrap(); + assert_eq!(read_back, hdr); + } + + #[test] + fn test_new_response() { + let hdr = VsockHeader::new_response(2, 2695, 3, 5000, 65536, 0); + assert_eq!(hdr.src_cid, 2); + assert_eq!(hdr.dst_cid, 3); + assert_eq!(hdr.src_port, 2695); + assert_eq!(hdr.dst_port, 5000); + assert_eq!(hdr.len, 0); + assert_eq!(hdr.type_, VIRTIO_VSOCK_TYPE_STREAM); + assert_eq!(hdr.op, VSOCK_OP_RESPONSE); + assert_eq!(hdr.buf_alloc, 65536); + assert_eq!(hdr.fwd_cnt, 0); + } + + #[test] + fn test_new_rw() { + let hdr = VsockHeader::new_rw(2, 2695, 3, 5000, 128, 65536, 64); + assert_eq!(hdr.op, VSOCK_OP_RW); + assert_eq!(hdr.len, 128); + assert_eq!(hdr.buf_alloc, 65536); + assert_eq!(hdr.fwd_cnt, 64); + } + + #[test] + fn test_new_rst() { + let hdr = VsockHeader::new_rst(2, 2695, 3, 5000); + assert_eq!(hdr.op, VSOCK_OP_RST); + assert_eq!(hdr.len, 0); + assert_eq!(hdr.buf_alloc, 0); + assert_eq!(hdr.fwd_cnt, 0); + } + + #[test] + fn test_new_shutdown() { + let hdr = + VsockHeader::new_shutdown(3, 5000, 2, 2695, VSOCK_SHUTDOWN_SEND | VSOCK_SHUTDOWN_RECV); + assert_eq!(hdr.op, VSOCK_OP_SHUTDOWN); + assert_eq!(hdr.flags, 3); + } + + #[test] + fn test_new_credit_update() { + let hdr = VsockHeader::new_credit_update(2, 2695, 3, 5000, 32768, 1024); + assert_eq!(hdr.op, VSOCK_OP_CREDIT_UPDATE); + assert_eq!(hdr.buf_alloc, 32768); + assert_eq!(hdr.fwd_cnt, 1024); + } + + #[test] + fn test_validate_valid() { + let hdr = VsockHeader::new_response(2, 2695, 3, 5000, 65536, 0); + assert!(hdr.validate().is_ok()); + } + + #[test] + fn test_validate_bad_type() { + let mut hdr = VsockHeader::new_response(2, 2695, 3, 5000, 65536, 0); + hdr.type_ = 99; + assert!(hdr.validate().is_err()); + } + + #[test] + fn test_validate_bad_op() { + let mut hdr = VsockHeader::new_response(2, 2695, 3, 5000, 65536, 0); + hdr.op = 99; + assert!(hdr.validate().is_err()); + } + + #[test] + fn test_zero_header() { + let buf = [0u8; VSOCK_HEADER_SIZE]; + let hdr = VsockHeader::from_bytes(&buf); + assert_eq!(hdr.src_cid, 0); + assert_eq!(hdr.dst_cid, 0); + assert_eq!(hdr.op, VSOCK_OP_INVALID); + } +} diff --git a/src/vmm/src/windows/error.rs b/src/vmm/src/windows/error.rs new file mode 100644 index 000000000..ac363f95e --- /dev/null +++ b/src/vmm/src/windows/error.rs @@ -0,0 +1,116 @@ +//! Error types for the Windows WHPX backend. + +/// Result type for WHPX operations. +pub type Result = std::result::Result; + +/// Errors that can occur in the WHPX backend. +#[derive(Debug, thiserror::Error)] +pub enum WkrunError { + /// WHPX API call failed with an HRESULT. + #[error("WHPX API call failed: {function} returned 0x{hresult:08X}")] + WhpxApi { + function: &'static str, + hresult: u32, + }, + + /// WHPX/Hyper-V is not available on this system. + #[error("WHPX not available: {0}")] + WhpxUnavailable(String), + + /// Invalid VM context ID. + #[error("invalid context ID: {0}")] + InvalidContext(u32), + + /// Context ID already in use. + #[error("context ID {0} already exists")] + ContextExists(u32), + + /// VM configuration error. + #[error("VM configuration error: {0}")] + Config(String), + + /// Guest memory error. + #[error("guest memory error: {0}")] + Memory(String), + + /// vCPU error. + #[error("vCPU error: {0}")] + Vcpu(String), + + /// I/O error. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + /// Boot/kernel loading error. + #[error("boot error: {0}")] + Boot(String), + + /// Device emulation error. + #[error("device error: {0}")] + Device(String), + + /// VM is not in the expected state for this operation. + #[error("invalid VM state: expected {expected}, got {actual}")] + InvalidState { + expected: &'static str, + actual: String, + }, +} + +impl WkrunError { + /// Create a WHPX API error from a function name and HRESULT. + pub fn whpx(function: &'static str, hresult: u32) -> Self { + WkrunError::WhpxApi { function, hresult } + } +} + +/// Checks an HRESULT and returns an error if it indicates failure. +/// HRESULT values with the high bit set indicate failure. +#[cfg(target_os = "windows")] +pub fn check_hresult(function: &'static str, hr: i32) -> Result<()> { + if hr < 0 { + Err(WkrunError::whpx(function, hr as u32)) + } else { + Ok(()) + } +} + +/// Return code for the C API: 0 = success, negative = error. +#[repr(i32)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CApiResult { + Success = 0, + InvalidContext = -1, + InvalidArgument = -2, + WhpxError = -3, + MemoryError = -4, + BootError = -5, + DeviceError = -6, + StateError = -7, + IoError = -8, + Unknown = -99, +} + +impl From<&WkrunError> for CApiResult { + fn from(err: &WkrunError) -> Self { + match err { + WkrunError::InvalidContext(_) => CApiResult::InvalidContext, + WkrunError::ContextExists(_) => CApiResult::InvalidContext, + WkrunError::Config(_) => CApiResult::InvalidArgument, + WkrunError::WhpxApi { .. } => CApiResult::WhpxError, + WkrunError::WhpxUnavailable(_) => CApiResult::WhpxError, + WkrunError::Memory(_) => CApiResult::MemoryError, + WkrunError::Boot(_) => CApiResult::BootError, + WkrunError::Device(_) => CApiResult::DeviceError, + WkrunError::InvalidState { .. } => CApiResult::StateError, + WkrunError::Vcpu(_) => CApiResult::DeviceError, + WkrunError::Io(_) => CApiResult::IoError, + } + } +} + +impl From<&WkrunError> for i32 { + fn from(err: &WkrunError) -> Self { + CApiResult::from(err) as i32 + } +} diff --git a/src/vmm/src/windows/insn.rs b/src/vmm/src/windows/insn.rs new file mode 100644 index 000000000..110ae1e19 --- /dev/null +++ b/src/vmm/src/windows/insn.rs @@ -0,0 +1,662 @@ +//! Minimal x86_64 instruction decoder for MMIO emulation. +//! +//! Decodes the instruction bytes provided by WHPX memory access exits +//! to extract write data, access size, and destination register for reads. +//! +//! Only handles the instruction patterns Linux generates for MMIO: +//! - MOV r/m, reg (0x88/0x89) — writeb/writel/writeq +//! - MOV reg, r/m (0x8A/0x8B) — readb/readl/readq +//! - MOV r/m, imm (0xC6/0xC7) — writeb/writel with immediate +//! - MOVZX reg, r/m (0x0F 0xB6/0xB7) — readb/readw with zero-extend + +use super::error::{Result, WkrunError}; +use super::types::StandardRegisters; + +/// Decoded MMIO instruction information. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct MmioInsn { + /// Number of bytes accessed (1, 2, 4, or 8). + pub access_size: u8, + /// For writes: the value being written. + pub data: u64, + /// Total instruction length in bytes. + pub len: u8, + /// Whether this is a write (true) or read (false). + pub is_write: bool, + /// For reads: which general-purpose register receives the value (0=RAX..15=R15). + pub gpr_index: Option, +} + +/// REX prefix bit fields. +struct Rex { + /// REX.W — 64-bit operand size. + w: bool, + /// REX.R — extends ModRM reg field. + r: bool, +} + +impl Rex { + fn none() -> Self { + Rex { w: false, r: false } + } + + fn from_byte(byte: u8) -> Self { + Rex { + w: byte & 0x08 != 0, + r: byte & 0x04 != 0, + } + } +} + +/// Read a general-purpose register value by index (0=RAX, 1=RCX, ..., 15=R15). +/// +/// The index matches x86_64 ModRM/SIB encoding: +/// 0=RAX, 1=RCX, 2=RDX, 3=RBX, 4=RSP, 5=RBP, 6=RSI, 7=RDI, +/// 8=R8, 9=R9, 10=R10, 11=R11, 12=R12, 13=R13, 14=R14, 15=R15 +pub fn read_gpr(regs: &StandardRegisters, index: u8) -> u64 { + match index { + 0 => regs.rax, + 1 => regs.rcx, + 2 => regs.rdx, + 3 => regs.rbx, + 4 => regs.rsp, + 5 => regs.rbp, + 6 => regs.rsi, + 7 => regs.rdi, + 8 => regs.r8, + 9 => regs.r9, + 10 => regs.r10, + 11 => regs.r11, + 12 => regs.r12, + 13 => regs.r13, + 14 => regs.r14, + 15 => regs.r15, + _ => 0, + } +} + +/// Calculate the length of the ModRM addressing mode (displacement bytes). +/// +/// For MMIO, the ModRM byte encodes a memory operand. We need to know +/// how many bytes the addressing mode consumes to find the instruction length. +fn modrm_disp_len(modrm: u8, has_sib: bool) -> usize { + let mod_field = modrm >> 6; + let rm = modrm & 0x07; + + match mod_field { + 0b00 => { + if rm == 0b101 { + // [RIP+disp32] or [disp32] — 4-byte displacement + 4 + } else if rm == 0b100 && has_sib { + // SIB byte present, check SIB base + // For simplicity, return 0 (base case) — SIB with mod=00 and base=101 has disp32 + 0 // Will be handled by caller checking SIB + } else { + 0 + } + } + 0b01 => 1, // [reg+disp8] + 0b10 => 4, // [reg+disp32] + _ => 0, // mod=11 is register-to-register (shouldn't happen for MMIO) + } +} + +/// Calculate total bytes consumed by ModRM + SIB + displacement. +fn addressing_mode_len(bytes: &[u8], offset: usize) -> usize { + if offset >= bytes.len() { + return 0; + } + let modrm = bytes[offset]; + let mod_field = modrm >> 6; + let rm = modrm & 0x07; + + // Start with 1 byte for ModRM itself. + let mut len = 1; + + // Check for SIB byte (rm=100 with mod != 11). + let has_sib = rm == 0b100 && mod_field != 0b11; + if has_sib { + len += 1; // SIB byte + + // Check SIB base for special disp32 case. + if offset + 1 < bytes.len() { + let sib = bytes[offset + 1]; + let base = sib & 0x07; + if mod_field == 0b00 && base == 0b101 { + len += 4; // disp32 with SIB + return len; + } + } + } + + // Add displacement bytes. + len += modrm_disp_len(modrm, has_sib); + + len +} + +/// Decode an MMIO instruction from raw instruction bytes. +/// +/// `bytes` contains the instruction bytes from the WHPX exit context. +/// `regs` contains the current vCPU register state (needed to extract +/// write values from source registers). +/// +/// Returns the decoded instruction information, or an error if the +/// instruction pattern is not recognized. +pub fn decode_mmio_insn(bytes: &[u8], regs: &StandardRegisters) -> Result { + if bytes.is_empty() { + return Err(WkrunError::Device("empty instruction bytes".into())); + } + + let mut pos = 0; + let mut rex = Rex::none(); + let mut has_operand_size_prefix = false; + + // Parse prefixes. + loop { + if pos >= bytes.len() { + return Err(WkrunError::Device("instruction too short".into())); + } + match bytes[pos] { + 0x66 => { + has_operand_size_prefix = true; + pos += 1; + } + 0x67 => { + // Address-size prefix — skip but don't change operand size. + pos += 1; + } + 0xF2 | 0xF3 => { + // REP/REPNE prefix — skip. + pos += 1; + } + b @ 0x40..=0x4F => { + rex = Rex::from_byte(b); + pos += 1; + break; // REX must be last prefix. + } + _ => break, + } + } + + if pos >= bytes.len() { + return Err(WkrunError::Device( + "instruction too short after prefixes".into(), + )); + } + + let opcode = bytes[pos]; + pos += 1; + + match opcode { + // MOV r/m8, reg8 (write, 8-bit) + 0x88 => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOV r/m8,r8: missing ModRM".into())); + } + let modrm = bytes[pos]; + let reg = ((modrm >> 3) & 0x07) | if rex.r { 8 } else { 0 }; + let addr_len = addressing_mode_len(bytes, pos); + let value = read_gpr(regs, reg) & 0xFF; + Ok(MmioInsn { + access_size: 1, + data: value, + len: (pos + addr_len) as u8, + is_write: true, + gpr_index: None, + }) + } + + // MOV r/m16/32/64, reg16/32/64 (write) + 0x89 => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOV r/m,r: missing ModRM".into())); + } + let modrm = bytes[pos]; + let reg = ((modrm >> 3) & 0x07) | if rex.r { 8 } else { 0 }; + let addr_len = addressing_mode_len(bytes, pos); + let access_size = if rex.w { + 8 + } else if has_operand_size_prefix { + 2 + } else { + 4 + }; + let mask = match access_size { + 2 => 0xFFFF, + 4 => 0xFFFF_FFFF, + 8 => u64::MAX, + _ => 0xFF, + }; + let value = read_gpr(regs, reg) & mask; + Ok(MmioInsn { + access_size, + data: value, + len: (pos + addr_len) as u8, + is_write: true, + gpr_index: None, + }) + } + + // MOV reg8, r/m8 (read, 8-bit) + 0x8A => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOV r8,r/m8: missing ModRM".into())); + } + let modrm = bytes[pos]; + let reg = ((modrm >> 3) & 0x07) | if rex.r { 8 } else { 0 }; + let addr_len = addressing_mode_len(bytes, pos); + Ok(MmioInsn { + access_size: 1, + data: 0, + len: (pos + addr_len) as u8, + is_write: false, + gpr_index: Some(reg), + }) + } + + // MOV reg16/32/64, r/m16/32/64 (read) + 0x8B => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOV r,r/m: missing ModRM".into())); + } + let modrm = bytes[pos]; + let reg = ((modrm >> 3) & 0x07) | if rex.r { 8 } else { 0 }; + let addr_len = addressing_mode_len(bytes, pos); + let access_size = if rex.w { + 8 + } else if has_operand_size_prefix { + 2 + } else { + 4 + }; + Ok(MmioInsn { + access_size, + data: 0, + len: (pos + addr_len) as u8, + is_write: false, + gpr_index: Some(reg), + }) + } + + // MOV r/m8, imm8 (write, 8-bit immediate) + 0xC6 => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOV r/m8,imm8: missing ModRM".into())); + } + let addr_len = addressing_mode_len(bytes, pos); + let imm_pos = pos + addr_len; + if imm_pos >= bytes.len() { + return Err(WkrunError::Device( + "MOV r/m8,imm8: missing immediate".into(), + )); + } + let value = bytes[imm_pos] as u64; + Ok(MmioInsn { + access_size: 1, + data: value, + len: (imm_pos + 1) as u8, + is_write: true, + gpr_index: None, + }) + } + + // MOV r/m16/32, imm16/32 (write, immediate) + 0xC7 => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOV r/m,imm: missing ModRM".into())); + } + let addr_len = addressing_mode_len(bytes, pos); + let imm_pos = pos + addr_len; + let (access_size, imm_len) = if has_operand_size_prefix { + (2u8, 2usize) + } else { + (4u8, 4usize) + }; + if imm_pos + imm_len > bytes.len() { + return Err(WkrunError::Device("MOV r/m,imm: missing immediate".into())); + } + let value = match imm_len { + 2 => u16::from_le_bytes([bytes[imm_pos], bytes[imm_pos + 1]]) as u64, + 4 => u32::from_le_bytes([ + bytes[imm_pos], + bytes[imm_pos + 1], + bytes[imm_pos + 2], + bytes[imm_pos + 3], + ]) as u64, + _ => unreachable!(), + }; + Ok(MmioInsn { + access_size, + data: value, + len: (imm_pos + imm_len) as u8, + is_write: true, + gpr_index: None, + }) + } + + // Two-byte opcodes (0x0F prefix). + 0x0F => { + if pos >= bytes.len() { + return Err(WkrunError::Device( + "0x0F: missing second opcode byte".into(), + )); + } + let opcode2 = bytes[pos]; + pos += 1; + + match opcode2 { + // MOVZX reg, r/m8 (read, 8-bit zero-extended to 32/64) + 0xB6 => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOVZX r,r/m8: missing ModRM".into())); + } + let modrm = bytes[pos]; + let reg = ((modrm >> 3) & 0x07) | if rex.r { 8 } else { 0 }; + let addr_len = addressing_mode_len(bytes, pos); + Ok(MmioInsn { + access_size: 1, + data: 0, + len: (pos + addr_len) as u8, + is_write: false, + gpr_index: Some(reg), + }) + } + + // MOVZX reg, r/m16 (read, 16-bit zero-extended to 32/64) + 0xB7 => { + if pos >= bytes.len() { + return Err(WkrunError::Device("MOVZX r,r/m16: missing ModRM".into())); + } + let modrm = bytes[pos]; + let reg = ((modrm >> 3) & 0x07) | if rex.r { 8 } else { 0 }; + let addr_len = addressing_mode_len(bytes, pos); + Ok(MmioInsn { + access_size: 2, + data: 0, + len: (pos + addr_len) as u8, + is_write: false, + gpr_index: Some(reg), + }) + } + + _ => Err(WkrunError::Device(format!( + "unrecognized 0x0F opcode: 0x{:02X} (bytes: {:02X?})", + opcode2, bytes + ))), + } + } + + _ => Err(WkrunError::Device(format!( + "unrecognized MMIO opcode: 0x{:02X} (bytes: {:02X?})", + opcode, bytes + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_regs() -> StandardRegisters { + StandardRegisters { + rax: 0xDEAD_BEEF_CAFE_BABE, + rcx: 0x1111_1111_1111_1111, + rdx: 0x2222_2222_2222_2222, + rbx: 0x3333_3333_3333_3333, + rsp: 0x4444_4444_4444_4444, + rbp: 0x5555_5555_5555_5555, + rsi: 0x6666_6666_6666_6666, + rdi: 0x7777_7777_7777_7777, + r8: 0x8888_8888_8888_8888, + r9: 0x9999_9999_9999_9999, + r10: 0xAAAA_AAAA_AAAA_AAAA, + r11: 0xBBBB_BBBB_BBBB_BBBB, + r12: 0xCCCC_CCCC_CCCC_CCCC, + r13: 0xDDDD_DDDD_DDDD_DDDD, + r14: 0xEEEE_EEEE_EEEE_EEEE, + r15: 0xFFFF_FFFF_FFFF_FFFF, + rip: 0, + rflags: 0, + } + } + + // --- MOV r/m32, reg (0x89) — writel --- + + #[test] + fn test_mov_dword_ptr_eax() { + // mov dword [rdi], eax → 89 07 + // ModRM: mod=00, reg=000(eax), r/m=111(rdi) + let bytes = [0x89, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 4); + assert_eq!(insn.data, regs.rax & 0xFFFF_FFFF); + assert_eq!(insn.len, 2); + assert_eq!(insn.gpr_index, None); + } + + #[test] + fn test_mov_dword_ptr_ecx_disp8() { + // mov dword [rdi+0x10], ecx → 89 4F 10 + // ModRM: mod=01, reg=001(ecx), r/m=111(rdi) + let bytes = [0x89, 0x4F, 0x10]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 4); + assert_eq!(insn.data, regs.rcx & 0xFFFF_FFFF); + assert_eq!(insn.len, 3); + } + + // --- MOV r/m64, reg (REX.W 0x89) — writeq --- + + #[test] + fn test_mov_qword_ptr_rax() { + // mov qword [rdi], rax → 48 89 07 + // REX.W=1, ModRM: mod=00, reg=000(rax), r/m=111(rdi) + let bytes = [0x48, 0x89, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 8); + assert_eq!(insn.data, regs.rax); + assert_eq!(insn.len, 3); + } + + // --- MOV r/m8, reg8 (0x88) — writeb --- + + #[test] + fn test_mov_byte_ptr_al() { + // mov byte [rdi], al → 88 07 + let bytes = [0x88, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 1); + assert_eq!(insn.data, regs.rax & 0xFF); + assert_eq!(insn.len, 2); + } + + // --- MOV r/m16, reg16 (0x66 0x89) — writew --- + + #[test] + fn test_mov_word_ptr_ax() { + // mov word [rdi], ax → 66 89 07 + let bytes = [0x66, 0x89, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 2); + assert_eq!(insn.data, regs.rax & 0xFFFF); + assert_eq!(insn.len, 3); + } + + // --- MOV reg32, r/m32 (0x8B) — readl --- + + #[test] + fn test_mov_eax_dword_ptr() { + // mov eax, dword [rdi] → 8B 07 + let bytes = [0x8B, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(!insn.is_write); + assert_eq!(insn.access_size, 4); + assert_eq!(insn.gpr_index, Some(0)); // RAX + assert_eq!(insn.len, 2); + } + + // --- MOV reg64, r/m64 (REX.W 0x8B) — readq --- + + #[test] + fn test_mov_rax_qword_ptr() { + // mov rax, qword [rdi] → 48 8B 07 + let bytes = [0x48, 0x8B, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(!insn.is_write); + assert_eq!(insn.access_size, 8); + assert_eq!(insn.gpr_index, Some(0)); // RAX + assert_eq!(insn.len, 3); + } + + // --- MOV r/m32, imm32 (0xC7) — writel with immediate --- + + #[test] + fn test_mov_dword_ptr_imm32() { + // mov dword [rdi], 0x12345678 → C7 07 78 56 34 12 + let bytes = [0xC7, 0x07, 0x78, 0x56, 0x34, 0x12]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 4); + assert_eq!(insn.data, 0x12345678); + assert_eq!(insn.len, 6); + } + + // --- MOV r/m8, imm8 (0xC6) — writeb with immediate --- + + #[test] + fn test_mov_byte_ptr_imm8() { + // mov byte [rdi], 0xAB → C6 07 AB + let bytes = [0xC6, 0x07, 0xAB]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 1); + assert_eq!(insn.data, 0xAB); + assert_eq!(insn.len, 3); + } + + // --- MOVZX reg, r/m8 (0x0F 0xB6) — readb --- + + #[test] + fn test_movzx_eax_byte_ptr() { + // movzx eax, byte [rdi] → 0F B6 07 + let bytes = [0x0F, 0xB6, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(!insn.is_write); + assert_eq!(insn.access_size, 1); + assert_eq!(insn.gpr_index, Some(0)); // EAX + assert_eq!(insn.len, 3); + } + + // --- MOVZX reg, r/m16 (0x0F 0xB7) — readw --- + + #[test] + fn test_movzx_eax_word_ptr() { + // movzx eax, word [rdi] → 0F B7 07 + let bytes = [0x0F, 0xB7, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(!insn.is_write); + assert_eq!(insn.access_size, 2); + assert_eq!(insn.gpr_index, Some(0)); // EAX + assert_eq!(insn.len, 3); + } + + // --- REX.R extended registers --- + + #[test] + fn test_mov_dword_ptr_r8d() { + // mov dword [rdi], r8d → 44 89 07 + // REX.R=1, reg=000 → reg=8 (R8) + let bytes = [0x44, 0x89, 0x07]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 4); + assert_eq!(insn.data, regs.r8 & 0xFFFF_FFFF); + assert_eq!(insn.len, 3); + } + + #[test] + fn test_mov_r10d_dword_ptr() { + // mov r10d, dword [rdi] → 44 8B 17 + // REX.R=1, reg=010 → reg=10 (R10) + let bytes = [0x44, 0x8B, 0x17]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(!insn.is_write); + assert_eq!(insn.access_size, 4); + assert_eq!(insn.gpr_index, Some(10)); // R10 + assert_eq!(insn.len, 3); + } + + // --- Error cases --- + + #[test] + fn test_empty_bytes_error() { + let regs = make_regs(); + assert!(decode_mmio_insn(&[], ®s).is_err()); + } + + #[test] + fn test_unrecognized_opcode_error() { + let regs = make_regs(); + let bytes = [0xFF, 0x07]; // Not a MOV + assert!(decode_mmio_insn(&bytes, ®s).is_err()); + } + + // --- disp32 addressing --- + + #[test] + fn test_mov_dword_ptr_disp32() { + // mov dword [rdi+0x100], eax → 89 87 00 01 00 00 + // ModRM: mod=10, reg=000(eax), r/m=111(rdi) → disp32 + let bytes = [0x89, 0x87, 0x00, 0x01, 0x00, 0x00]; + let regs = make_regs(); + let insn = decode_mmio_insn(&bytes, ®s).unwrap(); + assert!(insn.is_write); + assert_eq!(insn.access_size, 4); + assert_eq!(insn.data, regs.rax & 0xFFFF_FFFF); + assert_eq!(insn.len, 6); + } + + // --- read_gpr coverage --- + + #[test] + fn test_read_gpr_all_registers() { + let regs = make_regs(); + assert_eq!(read_gpr(®s, 0), regs.rax); + assert_eq!(read_gpr(®s, 1), regs.rcx); + assert_eq!(read_gpr(®s, 2), regs.rdx); + assert_eq!(read_gpr(®s, 3), regs.rbx); + assert_eq!(read_gpr(®s, 4), regs.rsp); + assert_eq!(read_gpr(®s, 5), regs.rbp); + assert_eq!(read_gpr(®s, 6), regs.rsi); + assert_eq!(read_gpr(®s, 7), regs.rdi); + assert_eq!(read_gpr(®s, 8), regs.r8); + assert_eq!(read_gpr(®s, 9), regs.r9); + assert_eq!(read_gpr(®s, 10), regs.r10); + assert_eq!(read_gpr(®s, 11), regs.r11); + assert_eq!(read_gpr(®s, 12), regs.r12); + assert_eq!(read_gpr(®s, 13), regs.r13); + assert_eq!(read_gpr(®s, 14), regs.r14); + assert_eq!(read_gpr(®s, 15), regs.r15); + assert_eq!(read_gpr(®s, 16), 0); // Out of range + } +} diff --git a/src/vmm/src/windows/memory.rs b/src/vmm/src/windows/memory.rs new file mode 100644 index 000000000..b8ddf27eb --- /dev/null +++ b/src/vmm/src/windows/memory.rs @@ -0,0 +1,325 @@ +//! Guest memory management for WHPX VMs. +//! +//! Handles allocation and mapping of guest physical memory. +//! On Windows, we use VirtualAlloc for host-side memory allocation +//! since the rust-vmm vm-memory crate doesn't support Windows. +//! +//! Memory layout constants are available on all platforms for cross-platform +//! testing of boot setup logic. + +// Guest physical memory layout constants for x86_64 Linux boot. +// These match the conventional Linux boot protocol addresses. + +/// Start of the zero page (boot_params structure). +pub const ZERO_PAGE_START: u64 = 0x7000; + +/// Start of the PML4 page table. +pub const PML4_START: u64 = 0x9000; + +/// Start of the PDPT page table. +pub const PDPT_START: u64 = 0xA000; + +/// Start of the PD page tables (4 entries for identity-mapping 4GB). +pub const PD_START: u64 = 0xB000; + +/// Kernel command line address. +pub const CMDLINE_START: u64 = 0x20000; + +/// Maximum kernel command line length. +pub const CMDLINE_MAX_SIZE: u64 = 0x10000; + +/// Kernel load address (1MB — standard bzImage load address). +pub const KERNEL_START: u64 = 0x100000; + +/// Offset of the 64-bit entry point (`startup_64`) from KERNEL_START. +pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200; + +/// ACPI tables region. +pub const ACPI_START: u64 = 0xE0000; + +/// Initial stack pointer (below 1MB, above page tables). +pub const BOOT_STACK_POINTER: u64 = 0x8FF0; + +/// Virtio-MMIO base address (above guest RAM, below 4GB identity map). +pub const VIRTIO_MMIO_BASE: u64 = 0xD000_0000; + +// Windows-specific guest memory allocation and mapping. +#[cfg(target_os = "windows")] +mod imp { + use std::ptr; + + use windows_sys::Win32::System::Hypervisor::WHV_MAP_GPA_RANGE_FLAGS; + use windows_sys::Win32::System::Memory::{ + VirtualAlloc, VirtualFree, MEM_COMMIT, MEM_RELEASE, MEM_RESERVE, PAGE_READWRITE, + }; + + use super::super::error::{Result, WkrunError}; + use super::super::whpx::WhpxPartition; + + /// A contiguous region of guest physical memory. + pub struct GuestMemoryRegion { + /// Host virtual address of the allocated memory. + host_addr: *mut u8, + /// Guest physical address this region maps to. + guest_addr: u64, + /// Size of the region in bytes. + size: u64, + } + + // SAFETY: The memory region is a simple allocation that can be sent between threads. + unsafe impl Send for GuestMemoryRegion {} + unsafe impl Sync for GuestMemoryRegion {} + + impl GuestMemoryRegion { + /// Allocate a new memory region using VirtualAlloc. + pub fn new(guest_addr: u64, size: u64) -> Result { + let host_addr = unsafe { + VirtualAlloc( + ptr::null(), + size as usize, + MEM_COMMIT | MEM_RESERVE, + PAGE_READWRITE, + ) + }; + + if host_addr.is_null() { + return Err(WkrunError::Memory(format!( + "VirtualAlloc failed for {} bytes at GPA 0x{:X}", + size, guest_addr + ))); + } + + Ok(GuestMemoryRegion { + host_addr: host_addr as *mut u8, + guest_addr, + size, + }) + } + + /// Get the host virtual address. + pub fn host_addr(&self) -> *mut u8 { + self.host_addr + } + + /// Get the guest physical address. + pub fn guest_addr(&self) -> u64 { + self.guest_addr + } + + /// Get the size of this region. + pub fn size(&self) -> u64 { + self.size + } + + /// Write data into guest memory at a guest physical address offset. + pub fn write_at(&self, offset: u64, data: &[u8]) -> Result<()> { + if offset + data.len() as u64 > self.size { + return Err(WkrunError::Memory(format!( + "write out of bounds: offset 0x{:X} + {} > region size 0x{:X}", + offset, + data.len(), + self.size + ))); + } + + // SAFETY: We verified the offset + len is within bounds. + unsafe { + let dst = self.host_addr.add(offset as usize); + ptr::copy_nonoverlapping(data.as_ptr(), dst, data.len()); + } + Ok(()) + } + + /// Read data from guest memory at a guest physical address offset. + pub fn read_at(&self, offset: u64, buf: &mut [u8]) -> Result<()> { + if offset + buf.len() as u64 > self.size { + return Err(WkrunError::Memory(format!( + "read out of bounds: offset 0x{:X} + {} > region size 0x{:X}", + offset, + buf.len(), + self.size + ))); + } + + // SAFETY: We verified the offset + len is within bounds. + unsafe { + let src = self.host_addr.add(offset as usize); + ptr::copy_nonoverlapping(src, buf.as_mut_ptr(), buf.len()); + } + Ok(()) + } + + /// Write a value at a specific offset. + pub fn write_obj(&self, offset: u64, val: &T) -> Result<()> { + let size = std::mem::size_of::() as u64; + if offset + size > self.size { + return Err(WkrunError::Memory(format!( + "write_obj out of bounds: offset 0x{:X} + {} > region size 0x{:X}", + offset, size, self.size + ))); + } + + // SAFETY: We verified bounds, and T is Copy (no drop needed). + unsafe { + let dst = self.host_addr.add(offset as usize) as *mut T; + ptr::write_unaligned(dst, *val); + } + Ok(()) + } + + /// Map this region into a WHPX partition's guest physical address space. + pub fn map_to_partition(&self, partition: &WhpxPartition) -> Result<()> { + // SAFETY: host_addr points to our VirtualAlloc'd memory which is valid + // for the lifetime of this GuestMemoryRegion. + unsafe { + partition.map_gpa_range( + self.host_addr, + self.guest_addr, + self.size, + // WHvMapGpaRangeFlagRead | WHvMapGpaRangeFlagWrite | WHvMapGpaRangeFlagExecute + 0x7 as WHV_MAP_GPA_RANGE_FLAGS, + ) + } + } + } + + impl Drop for GuestMemoryRegion { + fn drop(&mut self) { + if !self.host_addr.is_null() { + // SAFETY: We allocated this memory with VirtualAlloc. + unsafe { + VirtualFree(self.host_addr as *mut std::ffi::c_void, 0, MEM_RELEASE); + } + } + } + } + + /// Guest memory manager — holds all guest memory regions. + pub struct GuestMemory { + regions: Vec, + total_size: u64, + } + + impl GuestMemory { + /// Create guest memory with a single contiguous region starting at GPA 0. + pub fn new(size_mib: u32) -> Result { + let size = (size_mib as u64) * 1024 * 1024; + let region = GuestMemoryRegion::new(0, size)?; + Ok(GuestMemory { + regions: vec![region], + total_size: size, + }) + } + + /// Map all guest memory regions into a WHPX partition. + pub fn map_to_partition(&self, partition: &WhpxPartition) -> Result<()> { + for region in &self.regions { + region.map_to_partition(partition)?; + } + Ok(()) + } + + /// Write data at a guest physical address. + pub fn write_at_addr(&self, guest_addr: u64, data: &[u8]) -> Result<()> { + for region in &self.regions { + let region_end = region.guest_addr() + region.size(); + if guest_addr >= region.guest_addr() && guest_addr < region_end { + let offset = guest_addr - region.guest_addr(); + return region.write_at(offset, data); + } + } + Err(WkrunError::Memory(format!( + "no region contains GPA 0x{:X}", + guest_addr + ))) + } + + /// Read data from a guest physical address. + pub fn read_at_addr(&self, guest_addr: u64, buf: &mut [u8]) -> Result<()> { + for region in &self.regions { + let region_end = region.guest_addr() + region.size(); + if guest_addr >= region.guest_addr() && guest_addr < region_end { + let offset = guest_addr - region.guest_addr(); + return region.read_at(offset, buf); + } + } + Err(WkrunError::Memory(format!( + "no region contains GPA 0x{:X}", + guest_addr + ))) + } + + /// Write a typed value at a guest physical address. + pub fn write_obj_at_addr(&self, guest_addr: u64, val: &T) -> Result<()> { + for region in &self.regions { + let region_end = region.guest_addr() + region.size(); + if guest_addr >= region.guest_addr() && guest_addr < region_end { + let offset = guest_addr - region.guest_addr(); + return region.write_obj(offset, val); + } + } + Err(WkrunError::Memory(format!( + "no region contains GPA 0x{:X}", + guest_addr + ))) + } + + /// Get total guest memory size in bytes. + pub fn total_size(&self) -> u64 { + self.total_size + } + } +} + +#[cfg(target_os = "windows")] +pub use imp::*; + +#[cfg(test)] +mod tests { + use super::*; + + // Compile-time assertions for memory layout ordering. + const _: () = { + assert!(ZERO_PAGE_START < PML4_START); + assert!(PML4_START < PDPT_START); + assert!(PDPT_START < PD_START); + assert!(PD_START < CMDLINE_START); + assert!(CMDLINE_START < KERNEL_START); + assert!(ZERO_PAGE_START < BOOT_STACK_POINTER); + assert!(BOOT_STACK_POINTER < PML4_START); + }; + + #[test] + fn test_kernel_start_at_1mb() { + assert_eq!(KERNEL_START, 0x100000); + } + + #[test] + fn test_memory_layout_no_overlap() { + let regions = [ + ("zero_page", ZERO_PAGE_START, ZERO_PAGE_START + 0x1000), + ("pml4", PML4_START, PML4_START + 0x1000), + ("pdpt", PDPT_START, PDPT_START + 0x1000), + ("pd", PD_START, PD_START + 0x4000), + ("cmdline", CMDLINE_START, CMDLINE_START + CMDLINE_MAX_SIZE), + ("kernel", KERNEL_START, KERNEL_START + 0x1000), + ]; + + for i in 0..regions.len() { + for j in (i + 1)..regions.len() { + let (name_a, start_a, end_a) = regions[i]; + let (name_b, start_b, end_b) = regions[j]; + assert!( + end_a <= start_b || end_b <= start_a, + "regions {} and {} overlap: [{:#X}..{:#X}) vs [{:#X}..{:#X})", + name_a, + name_b, + start_a, + end_a, + start_b, + end_b + ); + } + } + } +} diff --git a/src/vmm/src/windows/mod.rs b/src/vmm/src/windows/mod.rs new file mode 100644 index 000000000..e8ef0abe3 --- /dev/null +++ b/src/vmm/src/windows/mod.rs @@ -0,0 +1,16 @@ +//! Virtual Machine Manager — hypervisor abstraction and WHPX backend. + +pub mod types; + +#[cfg(target_os = "windows")] +pub mod whpx; + +pub mod boot; +pub mod cmdline; +pub mod context; +pub mod devices; +pub mod error; +pub mod insn; +pub mod memory; +pub mod runner; +pub mod vcpu; diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs new file mode 100644 index 000000000..0e7f3c370 --- /dev/null +++ b/src/vmm/src/windows/runner.rs @@ -0,0 +1,531 @@ +//! VmRunner — full VM boot orchestration. +//! +//! Takes a configured VmContext, creates the WHPX partition and devices, +//! loads the kernel, and runs the vCPU loop until exit. +//! +//! Supports two modes: +//! - **Blocking**: `run()` — runs vCPU loop on the calling thread (used by `wkrun_start_enter`) +//! - **Async**: `start()` / `wait()` / `stop()` — spawns a background VM thread (used by BoxLite's Tokio runtime) + +#[cfg(target_os = "windows")] +mod imp { + use std::collections::HashMap; + use std::sync::atomic::Ordering; + use std::sync::{Arc, Mutex}; + use std::time::Duration; + + use super::super::boot::loader::load_kernel_with_initrd; + use super::super::devices::virtio::queue::GuestMemoryAccessor; + use super::super::error::{Result, WkrunError}; + use super::super::cmdline::build_kernel_cmdline; + use super::super::context::VmContext; + use super::super::devices::manager::{self as devices, DeviceManager}; + use super::super::memory::GuestMemory; + use super::super::types::VcpuExit; + use super::super::vcpu::VcpuRunConfig; + use super::super::whpx::{VcpuCanceller, WhpxPartition, WhpxVcpu}; + + /// Adapter to implement GuestMemoryAccessor for GuestMemory. + struct GuestMemoryAdapter<'a>(&'a GuestMemory); + + impl GuestMemoryAccessor for GuestMemoryAdapter<'_> { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { + self.0.read_at_addr(addr, buf) + } + fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { + self.0.write_at_addr(addr, data) + } + } + + /// Maximum vCPU exits before giving up. + const MAX_EXITS: u64 = 500_000_000; + + /// Maximum consecutive HLT instructions before giving up. + /// + /// When the guest executes `poweroff -f` on WHPX (no ACPI), the kernel + /// enters an HLT loop. With the 1ms timer tick, this translates to + /// ~5 seconds of wall-clock time before we detect the shutdown. + const MAX_HALTS: u64 = 5_000; + + /// Handle for a running VM, stored in `RUNNING_VMS`. + struct VmHandle { + thread: Option>>, + run_config: VcpuRunConfig, + canceller: Arc>>, + } + + /// Registry of running VMs. A ctx_id appears here after `start()` and is + /// removed by `wait()`. + static RUNNING_VMS: std::sync::LazyLock>> = + std::sync::LazyLock::new(|| Mutex::new(HashMap::new())); + + /// Core vCPU loop shared by `run()` and `start()`. + /// + /// Sets up the WHPX partition, loads the kernel, creates devices and vCPU, + /// then runs the vCPU loop. The `run_config` controls when the loop stops, + /// and the vCPU's canceller is stored in `canceller_slot` so that `stop()` + /// can wake the vCPU. + fn run_vcpu_loop( + ctx: VmContext, + run_config: VcpuRunConfig, + canceller_slot: Arc>>, + ) -> Result { + // Validate required fields. + let kernel_path = ctx + .kernel_path + .as_ref() + .ok_or_else(|| WkrunError::Config("kernel_path is required for VM start".into()))?; + + // Read kernel image. + let kernel_image = std::fs::read(kernel_path).map_err(|e| { + WkrunError::Boot(format!( + "failed to read kernel '{}': {}", + kernel_path.display(), + e + )) + })?; + + // Read initrd if provided. + let initrd_data = match ctx.initramfs_path { + Some(ref path) => Some(std::fs::read(path).map_err(|e| { + WkrunError::Boot(format!("failed to read initrd '{}': {}", path.display(), e)) + })?), + None => None, + }; + + // Check WHPX availability. + if !WhpxPartition::is_available()? { + return Err(WkrunError::WhpxUnavailable( + "WHPX is not available on this system".into(), + )); + } + + // Create partition (no APIC emulation — avoids crash on some Win10 hardware). + let partition = WhpxPartition::new()?; + partition.set_processor_count(ctx.num_vcpus as u32)?; + partition.setup()?; + + // Allocate and map guest memory. + let guest_mem = GuestMemory::new(ctx.ram_mib)?; + guest_mem.map_to_partition(&partition)?; + + // Create devices from context. + let setup = DeviceManager::from_context(&ctx)?; + devices::store_console_buffer(ctx.id, setup.console_buffer); + let mut devices = setup.devices; + + // Build kernel command line. + let cmdline = build_kernel_cmdline( + ctx.kernel_cmdline.as_deref(), + setup.has_root_disk, + &setup.mmio_slots, + ); + + // Load kernel. + let initrd_ref = initrd_data.as_deref(); + let (regs, sregs) = + load_kernel_with_initrd(&guest_mem, &kernel_image, &cmdline, ctx.ram_mib, initrd_ref)?; + + log::info!( + "Kernel loaded at 0x100000, RIP=0x{:X}, cmdline: {}", + regs.rip, + cmdline + ); + + // Create vCPU and set registers. + let vcpu = WhpxVcpu::new(&partition, 0)?; + vcpu.set_registers(®s)?; + vcpu.set_special_registers(&sregs)?; + + // Store canceller so stop() can wake the vCPU. + *canceller_slot.lock().unwrap() = Some(vcpu.canceller()); + + // Spawn timer thread for PIT interrupt delivery. + // Uses run_config.running so that request_stop() stops both the timer + // and the vCPU loop. + let timer_flag = run_config.running.clone(); + let canceller = vcpu.canceller(); + let timer_thread = std::thread::spawn(move || { + while timer_flag.load(Ordering::Relaxed) { + std::thread::sleep(Duration::from_millis(1)); + let _ = canceller.cancel(); + } + }); + + // vCPU run loop. + let mem_adapter = GuestMemoryAdapter(&guest_mem); + let mut exit_count: u64 = 0; + let mut halt_count: u64 = 0; + let exit_code; + + loop { + // Tick PIT and poll devices. + devices.tick_and_poll(&mem_adapter); + + // Try to inject pending interrupt. + if devices.pic.has_pending() { + match vcpu.interrupts_enabled() { + Ok(true) => { + if let Some(vector) = devices.pic.acknowledge() { + vcpu.inject_interrupt(vector)?; + devices.set_window_requested(false); + } + } + Ok(false) => { + if !devices.window_requested() { + vcpu.request_interrupt_window()?; + devices.set_window_requested(true); + } + } + Err(_) => {} + } + } + + let exit = vcpu.run()?; + exit_count += 1; + + match exit { + VcpuExit::IoOut { port, size, data } => { + halt_count = 0; + devices.handle_io_out(port, size, data); + vcpu.skip_instruction()?; + } + VcpuExit::IoIn { port, size } => { + halt_count = 0; + let data = devices.handle_io_in(port, size); + vcpu.complete_io_in(data, size)?; + } + VcpuExit::MmioRead { address, size } => { + halt_count = 0; + let data = devices.handle_mmio_read(address, size); + vcpu.complete_mmio_read(data)?; + } + VcpuExit::MmioWrite { + address, + size, + data, + } => { + halt_count = 0; + devices.handle_mmio_write(address, size, data, &mem_adapter); + vcpu.skip_instruction()?; + } + VcpuExit::InterruptWindow => { + devices.set_window_requested(false); + } + VcpuExit::Halt => { + if !run_config.should_run() { + log::info!("VM stop requested, exiting on Halt"); + exit_code = 0; + break; + } + halt_count += 1; + if halt_count > MAX_HALTS { + log::warn!( + "vCPU halted {} times consecutively after {} exits", + halt_count, + exit_count + ); + exit_code = 0; + break; + } + std::thread::sleep(Duration::from_millis(1)); + } + VcpuExit::Shutdown => { + log::info!("VM shutdown after {} exits", exit_count); + exit_code = 0; + break; + } + VcpuExit::Cancelled => { + // Timer thread or stop() cancelled vCPU. Check if we should exit. + if !run_config.should_run() { + log::info!("VM stop requested, exiting on Cancelled"); + exit_code = 0; + break; + } + } + VcpuExit::Unknown(reason) => { + log::error!( + "Unknown vCPU exit reason {} after {} exits", + reason, + exit_count + ); + exit_code = -1; + break; + } + } + + if exit_count >= MAX_EXITS { + log::warn!("Reached {} exit limit", MAX_EXITS); + exit_code = -1; + break; + } + } + + // Stop the timer thread. + run_config.request_stop(); + let _ = timer_thread.join(); + + log::info!("VM exited with code {} ({} exits)", exit_code, exit_count); + Ok(exit_code) + } + + /// Run a VM synchronously on the calling thread (blocking). + /// + /// Used by `wkrun_start_enter()`. Creates a default `VcpuRunConfig` and + /// runs the vCPU loop until the guest shuts down or an error occurs. + pub fn run(ctx: VmContext) -> Result { + let ctx_id = ctx.id; + let run_config = VcpuRunConfig::new(); + let canceller_slot = Arc::new(Mutex::new(None)); + let result = run_vcpu_loop(ctx, run_config, canceller_slot); + devices::remove_console_buffer(ctx_id); + result + } + + /// Start a VM on a background thread (non-blocking). + /// + /// Takes ownership of the context and spawns a thread running the vCPU loop. + /// Use `wait()` to block until the VM exits, or `stop()` to request shutdown. + pub fn start(ctx_id: u32, ctx: VmContext) -> Result<()> { + let run_config = VcpuRunConfig::new(); + let canceller_slot: Arc>> = Arc::new(Mutex::new(None)); + + let rc = run_config.clone(); + let cs = canceller_slot.clone(); + let thread = std::thread::spawn(move || run_vcpu_loop(ctx, rc, cs)); + + let handle = VmHandle { + thread: Some(thread), + run_config, + canceller: canceller_slot, + }; + + let mut map = RUNNING_VMS + .lock() + .map_err(|_| WkrunError::Config("running VMs lock poisoned".into()))?; + if map.contains_key(&ctx_id) { + return Err(WkrunError::Config(format!( + "VM {} is already running", + ctx_id + ))); + } + map.insert(ctx_id, handle); + Ok(()) + } + + /// Block until a running VM exits. Returns the guest exit code. + /// + /// Removes the VM from the running registry. After `wait()` returns, + /// the ctx_id is no longer valid. + pub fn wait(ctx_id: u32) -> Result { + let mut map = RUNNING_VMS + .lock() + .map_err(|_| WkrunError::Config("running VMs lock poisoned".into()))?; + let mut handle = map + .remove(&ctx_id) + .ok_or(WkrunError::InvalidContext(ctx_id))?; + drop(map); // Release lock before blocking join. + + let thread = handle + .thread + .take() + .ok_or_else(|| WkrunError::Config("VM thread already joined".into()))?; + let result = thread + .join() + .map_err(|_| WkrunError::Config("VM thread panicked".into()))?; + devices::remove_console_buffer(ctx_id); + result + } + + /// Request a running VM to stop (non-blocking). + /// + /// Sets the stop flag and wakes the vCPU so it exits promptly. + /// The VM thread will exit on its next Halt or Cancelled check. + /// Call `wait()` afterwards to collect the exit code. + pub fn stop(ctx_id: u32) -> Result<()> { + let map = RUNNING_VMS + .lock() + .map_err(|_| WkrunError::Config("running VMs lock poisoned".into()))?; + let handle = map.get(&ctx_id).ok_or(WkrunError::InvalidContext(ctx_id))?; + handle.run_config.request_stop(); + if let Some(ref canceller) = *handle.canceller.lock().unwrap() { + let _ = canceller.cancel(); + } + Ok(()) + } +} + +#[cfg(target_os = "windows")] +pub use imp::{run, start, stop, wait}; + +/// Stub for non-Windows platforms (compile only, never called). +#[cfg(not(target_os = "windows"))] +pub fn run(_ctx: super::context::VmContext) -> super::error::Result { + Err(super::error::WkrunError::Config( + "VM runner is only available on Windows".into(), + )) +} + +/// Stub for non-Windows platforms. +#[cfg(not(target_os = "windows"))] +pub fn start(_ctx_id: u32, _ctx: super::context::VmContext) -> super::error::Result<()> { + Err(super::error::WkrunError::Config( + "VM runner is only available on Windows".into(), + )) +} + +/// Stub for non-Windows platforms. +#[cfg(not(target_os = "windows"))] +pub fn wait(_ctx_id: u32) -> super::error::Result { + Err(super::error::WkrunError::Config( + "VM runner is only available on Windows".into(), + )) +} + +/// Stub for non-Windows platforms. +#[cfg(not(target_os = "windows"))] +pub fn stop(_ctx_id: u32) -> super::error::Result<()> { + Err(super::error::WkrunError::Config( + "VM runner is only available on Windows".into(), + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use super::super::context::VmContext; + use super::super::vcpu::VcpuRunConfig; + use std::sync::{Arc, Mutex}; + + #[test] + fn test_run_without_kernel_returns_error() { + // VmContext with no kernel path should fail. + let ctx = VmContext::default_for_test(); + let result = run(ctx); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + // On non-Windows: "only available on Windows" + // On Windows without kernel: "kernel_path is required" + assert!( + err.contains("kernel_path") || err.contains("Windows"), + "unexpected error: {}", + err + ); + } + + #[test] + fn test_start_without_kernel_returns_error() { + // start() should fail the same way as run() for missing kernel. + let ctx = VmContext::default_for_test(); + let result = start(99900, ctx); + + #[cfg(not(target_os = "windows"))] + { + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("Windows")); + } + + #[cfg(target_os = "windows")] + { + // start() spawns a thread — the error surfaces in wait(). + // But on Windows, if WHPX isn't available or kernel is missing, + // we still get Ok(()) from start() since the thread handles it. + if result.is_ok() { + let wait_result = wait(99900); + assert!(wait_result.is_err()); + } + } + } + + #[test] + fn test_wait_invalid_id_returns_error() { + let result = wait(99901); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("invalid context") || err.contains("Windows"), + "unexpected error: {}", + err + ); + } + + #[test] + fn test_stop_invalid_id_returns_error() { + let result = stop(99902); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("invalid context") || err.contains("Windows"), + "unexpected error: {}", + err + ); + } + + #[test] + fn test_run_config_used_for_stop() { + // Verify VcpuRunConfig flag propagation (cross-platform). + let config = VcpuRunConfig::new(); + let cloned = config.clone(); + assert!(config.should_run()); + assert!(cloned.should_run()); + + cloned.request_stop(); + assert!(!config.should_run()); + } + + #[test] + fn test_canceller_slot_starts_none() { + // The canceller slot should start as None (cross-platform). + let slot: Arc>> = Arc::new(Mutex::new(None)); + assert!(slot.lock().unwrap().is_none()); + } + + #[cfg(target_os = "windows")] + #[test] + fn test_start_rejects_duplicate_ctx_id() { + // Use a unique ctx_id unlikely to collide with other tests. + let ctx_id = 99903; + let ctx = VmContext::default_for_test(); + // First start might succeed or fail (depending on WHPX availability). + let _ = start(ctx_id, ctx); + + let ctx2 = VmContext::default_for_test(); + let result = start(ctx_id, ctx2); + // If first succeeded, second should fail with "already running". + // Clean up. + let _ = stop(ctx_id); + let _ = wait(ctx_id); + + if result.is_err() { + assert!(result.unwrap_err().to_string().contains("already running")); + } + } + + #[cfg(target_os = "windows")] + #[test] + fn test_double_wait_returns_error() { + let ctx_id = 99904; + let ctx = VmContext::default_for_test(); + if start(ctx_id, ctx).is_ok() { + // First wait should succeed (thread exits with error due to no kernel). + let _ = wait(ctx_id); + // Second wait should fail — already removed. + let result = wait(ctx_id); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("invalid context")); + } + } + + #[cfg(target_os = "windows")] + #[test] + fn test_stop_after_wait_returns_error() { + let ctx_id = 99905; + let ctx = VmContext::default_for_test(); + if start(ctx_id, ctx).is_ok() { + let _ = wait(ctx_id); + // stop() after wait() should fail — already removed from registry. + let result = stop(ctx_id); + assert!(result.is_err()); + } + } +} diff --git a/src/vmm/src/windows/types.rs b/src/vmm/src/windows/types.rs new file mode 100644 index 000000000..e60a33a39 --- /dev/null +++ b/src/vmm/src/windows/types.rs @@ -0,0 +1,112 @@ +//! Common types for the Windows WHPX VMM layer. + +/// x86_64 standard registers (general-purpose + instruction pointer + flags). +#[derive(Debug, Default, Clone, Copy)] +#[repr(C)] +pub struct StandardRegisters { + pub rax: u64, + pub rbx: u64, + pub rcx: u64, + pub rdx: u64, + pub rsi: u64, + pub rdi: u64, + pub rsp: u64, + pub rbp: u64, + pub r8: u64, + pub r9: u64, + pub r10: u64, + pub r11: u64, + pub r12: u64, + pub r13: u64, + pub r14: u64, + pub r15: u64, + pub rip: u64, + pub rflags: u64, +} + +/// x86_64 segment register. +#[derive(Debug, Default, Clone, Copy)] +#[repr(C)] +pub struct SegmentRegister { + pub base: u64, + pub limit: u32, + pub selector: u16, + /// Access rights (type + S + DPL + P + AVL + L + D/B + G). + pub access_rights: u16, +} + +/// x86_64 descriptor table register (GDTR, IDTR). +#[derive(Debug, Default, Clone, Copy)] +#[repr(C)] +pub struct DescriptorTable { + pub base: u64, + pub limit: u16, +} + +/// x86_64 special/system registers. +#[derive(Debug, Default, Clone, Copy)] +#[repr(C)] +pub struct SpecialRegisters { + pub cs: SegmentRegister, + pub ds: SegmentRegister, + pub es: SegmentRegister, + pub fs: SegmentRegister, + pub gs: SegmentRegister, + pub ss: SegmentRegister, + pub tr: SegmentRegister, + pub ldt: SegmentRegister, + pub gdt: DescriptorTable, + pub idt: DescriptorTable, + pub cr0: u64, + pub cr2: u64, + pub cr3: u64, + pub cr4: u64, + pub efer: u64, +} + +/// Reason the vCPU exited back to the VMM. +#[derive(Debug)] +pub enum VcpuExit { + /// Guest performed an I/O port read. + IoIn { port: u16, size: u8 }, + /// Guest performed an I/O port write. + IoOut { port: u16, size: u8, data: u32 }, + /// Guest performed an MMIO read. + MmioRead { address: u64, size: u8 }, + /// Guest performed an MMIO write. + MmioWrite { address: u64, size: u8, data: u64 }, + /// Guest executed HLT instruction. + Halt, + /// VM shutdown requested. + Shutdown, + /// Hypervisor cancelled the run (e.g., stop requested). + Cancelled, + /// Interrupt window available (guest RFLAGS.IF became 1). + InterruptWindow, + /// Exit reason not handled. + Unknown(u32), +} + +/// VM lifecycle state. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VmState { + /// VM context created, accepting configuration. + Created, + /// VM is configured and ready to start. + Configured, + /// VM is running. + Running, + /// VM has stopped. + Stopped, +} + +impl std::fmt::Display for VmState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + VmState::Created => write!(f, "Created"), + VmState::Configured => write!(f, "Configured"), + VmState::Running => write!(f, "Running"), + VmState::Stopped => write!(f, "Stopped"), + } + } +} diff --git a/src/vmm/src/windows/vcpu.rs b/src/vmm/src/windows/vcpu.rs new file mode 100644 index 000000000..4f679d707 --- /dev/null +++ b/src/vmm/src/windows/vcpu.rs @@ -0,0 +1,112 @@ +//! vCPU thread management for the Windows WHPX backend. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +/// Callback for handling I/O port accesses from the guest. +pub trait IoHandler: Send + Sync { + /// Handle an I/O port read. Returns the data to inject into the guest. + fn io_read(&self, port: u16, size: u8) -> u32; + + /// Handle an I/O port write from the guest. + fn io_write(&self, port: u16, size: u8, data: u32); +} + +/// Callback for handling MMIO accesses from the guest. +pub trait MmioHandler: Send + Sync { + /// Handle an MMIO read. Returns the data to inject into the guest. + fn mmio_read(&self, address: u64, size: u8) -> u64; + + /// Handle an MMIO write from the guest. + fn mmio_write(&self, address: u64, size: u8, data: u64); +} + +/// Shared state for a vCPU run loop. +pub struct VcpuRunConfig { + /// Whether the VM should keep running (set to false to request stop). + pub running: Arc, +} + +impl Clone for VcpuRunConfig { + fn clone(&self) -> Self { + VcpuRunConfig { + running: self.running.clone(), + } + } +} + +impl Default for VcpuRunConfig { + fn default() -> Self { + Self::new() + } +} + +impl VcpuRunConfig { + /// Create a new vCPU run configuration. + pub fn new() -> Self { + VcpuRunConfig { + running: Arc::new(AtomicBool::new(true)), + } + } + + /// Request the vCPU to stop running. + pub fn request_stop(&self) { + self.running.store(false, Ordering::Release); + } + + /// Check if the vCPU should continue running. + pub fn should_run(&self) -> bool { + self.running.load(Ordering::Acquire) + } +} + +/// Result of a vCPU run loop iteration. +#[derive(Debug, PartialEq, Eq)] +pub enum VcpuAction { + /// Continue running the vCPU. + Continue, + /// vCPU should halt (HLT instruction). + Halt, + /// VM should shut down. + Shutdown, + /// Run was cancelled externally. + Cancelled, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_vcpu_run_config_lifecycle() { + let config = VcpuRunConfig::new(); + assert!(config.should_run()); + + config.request_stop(); + assert!(!config.should_run()); + } + + #[test] + fn test_vcpu_run_config_shared() { + let config = VcpuRunConfig::new(); + let running = config.running.clone(); + + assert!(running.load(Ordering::Acquire)); + config.request_stop(); + assert!(!running.load(Ordering::Acquire)); + } + + #[test] + fn test_vcpu_run_config_clone_shares_state() { + let config = VcpuRunConfig::new(); + let cloned = config.clone(); + + assert!(config.should_run()); + assert!(cloned.should_run()); + + // Stopping the clone stops the original (shared Arc). + cloned.request_stop(); + assert!(!config.should_run()); + assert!(!cloned.should_run()); + } +} diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs new file mode 100644 index 000000000..d76435da2 --- /dev/null +++ b/src/vmm/src/windows/whpx.rs @@ -0,0 +1,872 @@ +//! WHPX (Windows Hypervisor Platform) backend. +//! +//! Safe Rust wrappers around the WHPX C API for creating and managing +//! VM partitions and virtual processors. + +#[cfg(not(target_os = "windows"))] +compile_error!("WHPX module requires Windows"); + +#[cfg(target_os = "windows")] +mod imp { + use std::cell::Cell; + use std::ptr; + + use windows_sys::Win32::System::Hypervisor::*; + + use super::super::error::{check_hresult, Result}; + use super::super::types::{SpecialRegisters, StandardRegisters, VcpuExit}; + + // Helper: create a zeroed WHV_REGISTER_VALUE (for arrays). + fn zeroed_reg_value() -> WHV_REGISTER_VALUE { + // SAFETY: WHV_REGISTER_VALUE is a union of integer/struct types; all-zeros is valid. + unsafe { std::mem::zeroed() } + } + + // Helper: create a WHV_REGISTER_VALUE from a u64 (for Reg64 field). + fn reg64(val: u64) -> WHV_REGISTER_VALUE { + WHV_REGISTER_VALUE { Reg64: val } + } + + // Helper: extract u64 from a WHV_REGISTER_VALUE Reg64 field. + // SAFETY: Caller must ensure the register contains a 64-bit value. + unsafe fn read_reg64(val: &WHV_REGISTER_VALUE) -> u64 { + val.Reg64 + } + + /// Bitfield accessors for WHV_X64_IO_PORT_ACCESS_INFO. + /// The _bitfield layout (from windows-sys): + /// bits [0..0] = IsWrite + /// bits [1..3] = AccessSize + /// bits [4..4] = StringOp + /// bits [5..5] = RepPrefix + /// bits [6..31] = Reserved + fn io_access_is_write(info: &WHV_X64_IO_PORT_ACCESS_INFO) -> bool { + let bits = unsafe { info.Anonymous._bitfield }; + (bits & 1) != 0 + } + + fn io_access_size(info: &WHV_X64_IO_PORT_ACCESS_INFO) -> u8 { + let bits = unsafe { info.Anonymous._bitfield }; + ((bits >> 1) & 0x7) as u8 + } + + /// Bitfield accessors for WHV_MEMORY_ACCESS_INFO. + /// The _bitfield layout: + /// bits [0..1] = AccessType (0=read, 1=write, 2=execute) + /// bits [2..2] = GpaUnmapped + /// bits [3..3] = GvaValid + /// bits [4..31] = Reserved + fn mem_access_type(info: &WHV_MEMORY_ACCESS_INFO) -> u32 { + let bits = unsafe { info.Anonymous._bitfield }; + bits & 0x3 + } + + /// A WHPX partition (VM container). + /// + /// Wraps `WHV_PARTITION_HANDLE` and manages its lifecycle. + /// When dropped, the partition and all its resources are freed. + pub struct WhpxPartition { + handle: WHV_PARTITION_HANDLE, + } + + // SAFETY: WHPX partition handles can be shared across threads. + // The WHPX API is thread-safe for operations on different objects + // (e.g., different vCPUs within the same partition). + unsafe impl Send for WhpxPartition {} + unsafe impl Sync for WhpxPartition {} + + impl WhpxPartition { + /// Check if WHPX is available on this system. + pub fn is_available() -> Result { + let mut capability = WHV_CAPABILITY { + HypervisorPresent: 0, + }; + let hr = unsafe { + WHvGetCapability( + WHvCapabilityCodeHypervisorPresent, + &mut capability as *mut _ as *mut std::ffi::c_void, + std::mem::size_of::() as u32, + ptr::null_mut(), + ) + }; + check_hresult("WHvGetCapability", hr)?; + + // SAFETY: We requested WHvCapabilityCodeHypervisorPresent, + // so the union field HypervisorPresent is valid. + let present = unsafe { capability.HypervisorPresent }; + Ok(present != 0) + } + + /// Create a new WHPX partition. + pub fn new() -> Result { + // WHV_PARTITION_HANDLE is isize; 0 means invalid. + let mut handle: WHV_PARTITION_HANDLE = 0; + let hr = unsafe { WHvCreatePartition(&mut handle) }; + check_hresult("WHvCreatePartition", hr)?; + + Ok(WhpxPartition { handle }) + } + + /// Set the number of virtual processors for this partition. + pub fn set_processor_count(&self, count: u32) -> Result<()> { + let property = WHV_PARTITION_PROPERTY { + ProcessorCount: count, + }; + let hr = unsafe { + WHvSetPartitionProperty( + self.handle, + WHvPartitionPropertyCodeProcessorCount, + &property as *const _ as *const std::ffi::c_void, + std::mem::size_of::() as u32, + ) + }; + check_hresult("WHvSetPartitionProperty(ProcessorCount)", hr) + } + + /// Enable APIC emulation mode (XApic). + pub fn set_local_apic_emulation(&self, enable: bool) -> Result<()> { + let mode = if enable { + WHvX64LocalApicEmulationModeXApic + } else { + WHvX64LocalApicEmulationModeNone + }; + let property = WHV_PARTITION_PROPERTY { + LocalApicEmulationMode: mode, + }; + let hr = unsafe { + WHvSetPartitionProperty( + self.handle, + WHvPartitionPropertyCodeLocalApicEmulationMode, + &property as *const _ as *const std::ffi::c_void, + std::mem::size_of::() as u32, + ) + }; + check_hresult("WHvSetPartitionProperty(LocalApicEmulationMode)", hr) + } + + /// Finalize the partition configuration. Must be called before creating + /// virtual processors or mapping memory. + pub fn setup(&self) -> Result<()> { + let hr = unsafe { WHvSetupPartition(self.handle) }; + check_hresult("WHvSetupPartition", hr) + } + + /// Map a host memory region into the guest physical address space. + /// + /// # Safety + /// + /// `host_va` must point to a valid memory region of at least `size` bytes + /// that will remain valid for the lifetime of this mapping. + pub unsafe fn map_gpa_range( + &self, + host_va: *mut u8, + guest_pa: u64, + size: u64, + flags: WHV_MAP_GPA_RANGE_FLAGS, + ) -> Result<()> { + let hr = WHvMapGpaRange( + self.handle, + host_va as *const std::ffi::c_void, + guest_pa, + size, + flags, + ); + check_hresult("WHvMapGpaRange", hr) + } + + /// Unmap a guest physical address range. + pub fn unmap_gpa_range(&self, guest_pa: u64, size: u64) -> Result<()> { + let hr = unsafe { WHvUnmapGpaRange(self.handle, guest_pa, size) }; + check_hresult("WHvUnmapGpaRange", hr) + } + + /// Get the raw partition handle (for creating vCPUs etc). + pub fn handle(&self) -> WHV_PARTITION_HANDLE { + self.handle + } + } + + impl Drop for WhpxPartition { + fn drop(&mut self) { + // WHV_PARTITION_HANDLE is isize; 0 means invalid. + if self.handle != 0 { + // SAFETY: We own this partition handle and it's valid. + unsafe { + WHvDeletePartition(self.handle); + } + } + } + } + + /// A WHPX virtual processor (vCPU). + pub struct WhpxVcpu { + partition_handle: WHV_PARTITION_HANDLE, + index: u32, + // Exit context cache — populated by run(), used by skip_instruction()/complete_io_in(). + exit_rip: Cell, + exit_instruction_len: Cell, + exit_rax: Cell, + // MMIO read completion cache — populated by run() on MMIO read exits. + exit_mmio_gpr_index: Cell>, + exit_mmio_access_size: Cell, + } + + // SAFETY: Each vCPU is operated on by a single thread at a time. + // The WHPX API permits calling WHvRunVirtualProcessor from a dedicated thread. + unsafe impl Send for WhpxVcpu {} + + impl WhpxVcpu { + /// Create a new virtual processor in the given partition. + pub fn new(partition: &WhpxPartition, index: u32) -> Result { + let hr = unsafe { WHvCreateVirtualProcessor(partition.handle(), index, 0) }; + check_hresult("WHvCreateVirtualProcessor", hr)?; + + Ok(WhpxVcpu { + partition_handle: partition.handle(), + index, + exit_rip: Cell::new(0), + exit_instruction_len: Cell::new(0), + exit_rax: Cell::new(0), + exit_mmio_gpr_index: Cell::new(None), + exit_mmio_access_size: Cell::new(0), + }) + } + + /// Get standard (general-purpose) registers. + pub fn get_registers(&self) -> Result { + let register_names = [ + WHvX64RegisterRax, + WHvX64RegisterRbx, + WHvX64RegisterRcx, + WHvX64RegisterRdx, + WHvX64RegisterRsi, + WHvX64RegisterRdi, + WHvX64RegisterRsp, + WHvX64RegisterRbp, + WHvX64RegisterR8, + WHvX64RegisterR9, + WHvX64RegisterR10, + WHvX64RegisterR11, + WHvX64RegisterR12, + WHvX64RegisterR13, + WHvX64RegisterR14, + WHvX64RegisterR15, + WHvX64RegisterRip, + WHvX64RegisterRflags, + ]; + + // Use heap allocation (Vec) instead of stack arrays — WHPX on some + // Win10 builds crashes with stack-allocated WHV_REGISTER_VALUE arrays + // (likely a 16-byte alignment issue on the stack). + let mut values: Vec = + vec![zeroed_reg_value(); register_names.len()]; + + let hr = unsafe { + WHvGetVirtualProcessorRegisters( + self.partition_handle, + self.index, + register_names.as_ptr(), + register_names.len() as u32, + values.as_mut_ptr(), + ) + }; + check_hresult("WHvGetVirtualProcessorRegisters", hr)?; + + // SAFETY: We requested 64-bit register values, so Reg64 is the valid union field. + unsafe { + Ok(StandardRegisters { + rax: read_reg64(&values[0]), + rbx: read_reg64(&values[1]), + rcx: read_reg64(&values[2]), + rdx: read_reg64(&values[3]), + rsi: read_reg64(&values[4]), + rdi: read_reg64(&values[5]), + rsp: read_reg64(&values[6]), + rbp: read_reg64(&values[7]), + r8: read_reg64(&values[8]), + r9: read_reg64(&values[9]), + r10: read_reg64(&values[10]), + r11: read_reg64(&values[11]), + r12: read_reg64(&values[12]), + r13: read_reg64(&values[13]), + r14: read_reg64(&values[14]), + r15: read_reg64(&values[15]), + rip: read_reg64(&values[16]), + rflags: read_reg64(&values[17]), + }) + } + } + + /// Set standard (general-purpose) registers. + pub fn set_registers(&self, regs: &StandardRegisters) -> Result<()> { + let register_names = [ + WHvX64RegisterRax, + WHvX64RegisterRbx, + WHvX64RegisterRcx, + WHvX64RegisterRdx, + WHvX64RegisterRsi, + WHvX64RegisterRdi, + WHvX64RegisterRsp, + WHvX64RegisterRbp, + WHvX64RegisterR8, + WHvX64RegisterR9, + WHvX64RegisterR10, + WHvX64RegisterR11, + WHvX64RegisterR12, + WHvX64RegisterR13, + WHvX64RegisterR14, + WHvX64RegisterR15, + WHvX64RegisterRip, + WHvX64RegisterRflags, + ]; + + // Use heap allocation — see get_registers() comment on alignment. + let values: Vec = vec![ + reg64(regs.rax), + reg64(regs.rbx), + reg64(regs.rcx), + reg64(regs.rdx), + reg64(regs.rsi), + reg64(regs.rdi), + reg64(regs.rsp), + reg64(regs.rbp), + reg64(regs.r8), + reg64(regs.r9), + reg64(regs.r10), + reg64(regs.r11), + reg64(regs.r12), + reg64(regs.r13), + reg64(regs.r14), + reg64(regs.r15), + reg64(regs.rip), + reg64(regs.rflags), + ]; + + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + register_names.as_ptr(), + register_names.len() as u32, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters", hr) + } + + /// Get special/system registers (segments, control registers, EFER). + pub fn get_special_registers(&self) -> Result { + let register_names = [ + // Segment registers + WHvX64RegisterCs, + WHvX64RegisterDs, + WHvX64RegisterEs, + WHvX64RegisterFs, + WHvX64RegisterGs, + WHvX64RegisterSs, + WHvX64RegisterTr, + WHvX64RegisterLdtr, + // Descriptor table registers + WHvX64RegisterGdtr, + WHvX64RegisterIdtr, + // Control registers + WHvX64RegisterCr0, + WHvX64RegisterCr2, + WHvX64RegisterCr3, + WHvX64RegisterCr4, + WHvX64RegisterEfer, + ]; + + // Use heap allocation — see get_registers() comment on alignment. + let mut values: Vec = + vec![zeroed_reg_value(); register_names.len()]; + + let hr = unsafe { + WHvGetVirtualProcessorRegisters( + self.partition_handle, + self.index, + register_names.as_ptr(), + register_names.len() as u32, + values.as_mut_ptr(), + ) + }; + check_hresult("WHvGetVirtualProcessorRegisters(special)", hr)?; + + // Helper to extract segment register from WHV_REGISTER_VALUE. + // SAFETY: Segment register values are stored in the Segment field of the union. + let seg = |v: &WHV_REGISTER_VALUE| { + let s = unsafe { &v.Segment }; + super::super::types::SegmentRegister { + base: s.Base, + limit: s.Limit, + selector: s.Selector, + // WHV_X64_SEGMENT_REGISTER_0 is a union with an Attributes field. + access_rights: unsafe { s.Anonymous.Attributes }, + } + }; + + // SAFETY: Table register values are stored in the Table field of the union. + let table = |v: &WHV_REGISTER_VALUE| { + let t = unsafe { &v.Table }; + super::super::types::DescriptorTable { + base: t.Base, + limit: t.Limit, + } + }; + + Ok(SpecialRegisters { + cs: seg(&values[0]), + ds: seg(&values[1]), + es: seg(&values[2]), + fs: seg(&values[3]), + gs: seg(&values[4]), + ss: seg(&values[5]), + tr: seg(&values[6]), + ldt: seg(&values[7]), + gdt: table(&values[8]), + idt: table(&values[9]), + cr0: unsafe { read_reg64(&values[10]) }, + cr2: unsafe { read_reg64(&values[11]) }, + cr3: unsafe { read_reg64(&values[12]) }, + cr4: unsafe { read_reg64(&values[13]) }, + efer: unsafe { read_reg64(&values[14]) }, + }) + } + + /// Set special/system registers. + pub fn set_special_registers(&self, sregs: &SpecialRegisters) -> Result<()> { + let register_names = [ + WHvX64RegisterCs, + WHvX64RegisterDs, + WHvX64RegisterEs, + WHvX64RegisterFs, + WHvX64RegisterGs, + WHvX64RegisterSs, + WHvX64RegisterTr, + WHvX64RegisterLdtr, + WHvX64RegisterGdtr, + WHvX64RegisterIdtr, + WHvX64RegisterCr0, + WHvX64RegisterCr2, + WHvX64RegisterCr3, + WHvX64RegisterCr4, + WHvX64RegisterEfer, + ]; + + // Helper to build WHV_REGISTER_VALUE for a segment register. + let seg_val = |s: &super::super::types::SegmentRegister| WHV_REGISTER_VALUE { + Segment: WHV_X64_SEGMENT_REGISTER { + Base: s.base, + Limit: s.limit, + Selector: s.selector, + Anonymous: WHV_X64_SEGMENT_REGISTER_0 { + Attributes: s.access_rights, + }, + }, + }; + + // Helper to build WHV_REGISTER_VALUE for a table register. + let table_val = |t: &super::super::types::DescriptorTable| WHV_REGISTER_VALUE { + Table: WHV_X64_TABLE_REGISTER { + Pad: [0u16; 3], + Base: t.base, + Limit: t.limit, + }, + }; + + // Use heap allocation — see get_registers() comment on alignment. + let values: Vec = vec![ + seg_val(&sregs.cs), + seg_val(&sregs.ds), + seg_val(&sregs.es), + seg_val(&sregs.fs), + seg_val(&sregs.gs), + seg_val(&sregs.ss), + seg_val(&sregs.tr), + seg_val(&sregs.ldt), + table_val(&sregs.gdt), + table_val(&sregs.idt), + reg64(sregs.cr0), + reg64(sregs.cr2), + reg64(sregs.cr3), + reg64(sregs.cr4), + reg64(sregs.efer), + ]; + + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + register_names.as_ptr(), + register_names.len() as u32, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(special)", hr) + } + + /// Run the virtual processor until a VM exit occurs. + /// + /// After an I/O exit, call [`skip_instruction`] (for writes) or + /// [`complete_io_in`] (for reads) to resume execution. + pub fn run(&self) -> Result { + let mut exit_context: WHV_RUN_VP_EXIT_CONTEXT = unsafe { std::mem::zeroed() }; + let hr = unsafe { + WHvRunVirtualProcessor( + self.partition_handle, + self.index, + &mut exit_context as *mut _ as *mut std::ffi::c_void, + std::mem::size_of::() as u32, + ) + }; + check_hresult("WHvRunVirtualProcessor", hr)?; + + // Cache RIP from the VP context for skip_instruction/complete_io_in. + self.exit_rip.set(exit_context.VpContext.Rip); + + // Extract instruction length from VpContext. + // WHV_VP_EXIT_CONTEXT layout: [ExecutionState:2][InstructionLength(4bits)|Cr8(4bits):1]... + // InstructionLength is at byte offset 2, lower 4 bits. + // SAFETY: VpContext is a repr(C) struct; byte access at offset 2 is within bounds. + let vp_instruction_len = unsafe { + let vp_bytes = &exit_context.VpContext as *const _ as *const u8; + *vp_bytes.add(2) & 0xF + }; + self.exit_instruction_len.set(vp_instruction_len); + + // WHV_RUN_VP_EXIT_REASON is i32; use if/else chain to avoid + // warnings about lowercase constant names in match patterns. + let reason = exit_context.ExitReason; + if reason == WHvRunVpExitReasonX64IoPortAccess { + // SAFETY: ExitReason is IoPortAccess, so the IoPortAccess union field is valid. + let io = unsafe { &exit_context.Anonymous.IoPortAccess }; + let port = io.PortNumber; + let size = io_access_size(&io.AccessInfo); + let is_write = io_access_is_write(&io.AccessInfo); + + self.exit_rax.set(io.Rax); + + if is_write { + let data = io.Rax as u32; + Ok(VcpuExit::IoOut { port, size, data }) + } else { + Ok(VcpuExit::IoIn { port, size }) + } + } else if reason == WHvRunVpExitReasonMemoryAccess { + // SAFETY: ExitReason is MemoryAccess, so the MemoryAccess union field is valid. + let mem_ctx = unsafe { &exit_context.Anonymous.MemoryAccess }; + let address = mem_ctx.Gpa; + let access_type = mem_access_type(&mem_ctx.AccessInfo); + let is_write = access_type == 1; + + // Decode the faulting instruction to get access size and write data. + let byte_count = mem_ctx.InstructionByteCount as usize; + let insn_bytes = &mem_ctx.InstructionBytes[..byte_count.min(16)]; + let regs = self.get_registers()?; + let insn = super::super::insn::decode_mmio_insn(insn_bytes, ®s)?; + + self.exit_instruction_len.set(insn.len); + self.exit_mmio_gpr_index.set(insn.gpr_index); + self.exit_mmio_access_size.set(insn.access_size); + + if is_write { + Ok(VcpuExit::MmioWrite { + address, + size: insn.access_size, + data: insn.data, + }) + } else { + Ok(VcpuExit::MmioRead { + address, + size: insn.access_size, + }) + } + } else if reason == WHvRunVpExitReasonX64InterruptWindow { + Ok(VcpuExit::InterruptWindow) + } else if reason == WHvRunVpExitReasonX64Halt { + Ok(VcpuExit::Halt) + } else if reason == WHvRunVpExitReasonCanceled { + Ok(VcpuExit::Cancelled) + } else if reason == WHvRunVpExitReasonNone { + Ok(VcpuExit::Shutdown) + } else { + Ok(VcpuExit::Unknown(reason as u32)) + } + } + + /// Get cached exit context info (for diagnostics and testing). + /// + /// Returns `(rip, instruction_len, rax)` from the last VM exit. + pub fn exit_info(&self) -> (u64, u8, u64) { + ( + self.exit_rip.get(), + self.exit_instruction_len.get(), + self.exit_rax.get(), + ) + } + + /// Advance RIP past the last intercepted instruction. + /// + /// Call after handling [`VcpuExit::IoOut`] or [`VcpuExit::MmioWrite`] + /// to resume execution at the next instruction. + pub fn skip_instruction(&self) -> Result<()> { + let instruction_len = self.exit_instruction_len.get(); + // Read current RIP from registers (guaranteed correct). + let regs = self.get_registers()?; + let new_rip = regs.rip + instruction_len as u64; + let names = [WHvX64RegisterRip]; + let values: Vec = vec![reg64(new_rip)]; + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 1, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(skip)", hr) + } + + /// Complete an I/O IN operation: inject data into RAX and advance RIP. + /// + /// Preserves upper RAX bits based on the I/O access size: + /// - size 1: modifies AL only (bits 0-7) + /// - size 2: modifies AX only (bits 0-15) + /// - size 4: modifies EAX (bits 0-31) + /// + /// Call after handling [`VcpuExit::IoIn`]. + pub fn complete_io_in(&self, data: u32, size: u8) -> Result<()> { + let instruction_len = self.exit_instruction_len.get(); + // Read current registers (RIP and RAX guaranteed correct). + let regs = self.get_registers()?; + let new_rip = regs.rip + instruction_len as u64; + let mask: u64 = match size { + 1 => 0xFF, + 2 => 0xFFFF, + 4 => 0xFFFF_FFFF, + _ => 0xFF, + }; + let new_rax = (regs.rax & !mask) | (data as u64 & mask); + + let names = [WHvX64RegisterRip, WHvX64RegisterRax]; + let values: Vec = vec![reg64(new_rip), reg64(new_rax)]; + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 2, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(io_in)", hr) + } + + /// Complete an MMIO read: inject data into the destination GPR and advance RIP. + /// + /// The destination register and access size were cached during [`run()`]. + /// Data is zero-extended into the register per x86 semantics: + /// - 1-byte: zero-extends to 64 bits (MOVZX) or writes AL (MOV) + /// - 2-byte: zero-extends to 64 bits (MOVZX) or writes AX (MOV) + /// - 4-byte: zero-extends to 64 bits (x86-64 implicit) + /// - 8-byte: writes full 64-bit register + /// + /// Call after handling [`VcpuExit::MmioRead`]. + pub fn complete_mmio_read(&self, data: u64) -> Result<()> { + let gpr_index = match self.exit_mmio_gpr_index.get() { + Some(idx) => idx, + None => { + return Err(super::super::error::WkrunError::Vcpu( + "complete_mmio_read: no cached GPR index".into(), + )) + } + }; + let access_size = self.exit_mmio_access_size.get(); + let insn_len = self.exit_instruction_len.get(); + + let mut regs = self.get_registers()?; + let new_rip = regs.rip + insn_len as u64; + + // Mask data to access size. For 4-byte writes, x86-64 zero-extends + // the 32-bit result into the full 64-bit register. + let masked = match access_size { + 1 => data & 0xFF, + 2 => data & 0xFFFF, + 4 => data & 0xFFFF_FFFF, + _ => data, + }; + + // Write into the destination GPR. + match gpr_index { + 0 => regs.rax = masked, + 1 => regs.rcx = masked, + 2 => regs.rdx = masked, + 3 => regs.rbx = masked, + 4 => regs.rsp = masked, + 5 => regs.rbp = masked, + 6 => regs.rsi = masked, + 7 => regs.rdi = masked, + 8 => regs.r8 = masked, + 9 => regs.r9 = masked, + 10 => regs.r10 = masked, + 11 => regs.r11 = masked, + 12 => regs.r12 = masked, + 13 => regs.r13 = masked, + 14 => regs.r14 = masked, + 15 => regs.r15 = masked, + _ => {} + } + + regs.rip = new_rip; + self.set_registers(®s) + } + + /// Inject an external hardware interrupt into the vCPU. + /// + /// The interrupt is delivered on the next `run()` call. The caller + /// must ensure `RFLAGS.IF = 1` before calling this (use + /// [`interrupts_enabled`] to check, and [`request_interrupt_window`] + /// if interrupts are currently disabled). + pub fn inject_interrupt(&self, vector: u8) -> Result<()> { + // Build WHV_X64_PENDING_INTERRUPTION_REGISTER as u64: + // Bit 0: InterruptionPending = 1 + // Bits 1-3: InterruptionType = 0 (external interrupt) + // Bit 4: DeliverErrorCode = 0 + // Bits 16-31: InterruptionVector = vector + let pending: u64 = 1 | ((vector as u64) << 16); + + let names = [WHvRegisterPendingInterruption]; + let values: Vec = vec![WHV_REGISTER_VALUE { Reg64: pending }]; + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 1, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(inject_interrupt)", hr) + } + + /// Check if the guest has interrupts enabled (RFLAGS.IF = 1). + pub fn interrupts_enabled(&self) -> Result { + let regs = self.get_registers()?; + Ok(regs.rflags & (1 << 9) != 0) + } + + /// Request an interrupt window exit. + /// + /// The next `run()` call will exit with [`VcpuExit::InterruptWindow`] + /// as soon as the guest enables interrupts (RFLAGS.IF = 1). + pub fn request_interrupt_window(&self) -> Result<()> { + // WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER: + // Bit 1: InterruptNotification = 1 + let notifications: u64 = 1 << 1; + + let names = [WHvX64RegisterDeliverabilityNotifications]; + let values: Vec = vec![WHV_REGISTER_VALUE { + Reg64: notifications, + }]; + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 1, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(interrupt_window)", hr) + } + + /// Cancel a running vCPU (causes it to exit with Cancelled). + pub fn cancel(&self) -> Result<()> { + let hr = unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, self.index, 0) }; + check_hresult("WHvCancelRunVirtualProcessor", hr) + } + + /// Get the vCPU index. + pub fn index(&self) -> u32 { + self.index + } + + /// Create a lightweight canceller that can be sent to another thread. + pub fn canceller(&self) -> VcpuCanceller { + VcpuCanceller { + partition_handle: self.partition_handle, + index: self.index, + } + } + } + + /// Lightweight handle for cancelling a running vCPU from another thread. + /// + /// Only supports the cancel operation — safe to use from a timer thread + /// to preempt the vCPU for interrupt delivery. + pub struct VcpuCanceller { + partition_handle: WHV_PARTITION_HANDLE, + index: u32, + } + + // SAFETY: WHvCancelRunVirtualProcessor is documented as safe to call + // from any thread while the vCPU is running. + unsafe impl Send for VcpuCanceller {} + unsafe impl Sync for VcpuCanceller {} + + impl VcpuCanceller { + /// Cancel the vCPU run, causing it to exit with VcpuExit::Cancelled. + pub fn cancel(&self) -> Result<()> { + let hr = unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, self.index, 0) }; + check_hresult("WHvCancelRunVirtualProcessor", hr) + } + } + + impl Drop for WhpxVcpu { + fn drop(&mut self) { + // SAFETY: We own this vCPU and the partition handle is still valid + // (guaranteed by the borrow lifetime in practice, but we store a raw handle). + unsafe { + WHvDeleteVirtualProcessor(self.partition_handle, self.index); + } + } + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn test_standard_registers_default() { + let regs = StandardRegisters::default(); + assert_eq!(regs.rax, 0); + assert_eq!(regs.rip, 0); + assert_eq!(regs.rflags, 0); + } + + #[test] + fn test_special_registers_default() { + let sregs = SpecialRegisters::default(); + assert_eq!(sregs.cr0, 0); + assert_eq!(sregs.cr3, 0); + assert_eq!(sregs.efer, 0); + assert_eq!(sregs.cs.selector, 0); + } + + #[test] + fn test_segment_register_construction() { + let seg = super::super::super::types::SegmentRegister { + base: 0, + limit: 0xFFFF_FFFF, + selector: 0x10, + access_rights: 0xC093, // data segment + }; + assert_eq!(seg.selector, 0x10); + assert_eq!(seg.access_rights, 0xC093); + } + } +} + +#[cfg(target_os = "windows")] +pub use imp::*; From 154bb08a5a105464a0fe341ce569c47683bc705b Mon Sep 17 00:00:00 2001 From: lile Date: Thu, 16 Apr 2026 22:17:27 +0800 Subject: [PATCH 03/33] refactor: gate Unix-only code and add Windows platform support - libkrun/Cargo.toml: split deps into cross-platform (log, vmm) and Unix-only (crossbeam, env_logger, libc, devices, polly, utils) - libkrun/src/lib.rs: wrap entire Unix C API in `mod unix_api` gated by `#[cfg(not(target_os = "windows"))]`; move stub functions (krun_start, krun_wait, krun_stop, krun_get_console_output, krun_add_net) outside so they compile unconditionally - vmm/Cargo.toml: split deps into cross-platform, Unix-only, and Windows-only (windows-sys, zerocopy, rand); gate cpuid to Unix x86_64 - vmm/src/lib.rs: gate all upstream VMM infrastructure (builder, device_manager, resources, vmm_config, terminal, worker, Vmm struct, Error enum) with `#[cfg(unix)]`; add `pub mod windows` for WHPX Co-Authored-By: Claude Opus 4.6 --- src/libkrun/Cargo.toml | 9 ++++-- src/libkrun/src/lib.rs | 64 ++++++++++++++++++++---------------------- src/vmm/Cargo.toml | 37 ++++++++++++++++-------- src/vmm/src/lib.rs | 45 ++++++++++++++++++++++++----- 4 files changed, 102 insertions(+), 53 deletions(-) diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index dc9000916..7868baba4 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -21,20 +21,25 @@ virgl_resource_map2 = ["devices/virgl_resource_map2"] aws-nitro = ["vmm/aws-nitro", "devices/aws-nitro", "dep:aws-nitro", "dep:nitro-enclaves"] [dependencies] +# Cross-platform dependencies +log = "0.4.0" +vmm = { path = "../vmm" } + +# Unix-only dependencies (the upstream C API implementation) +[target.'cfg(unix)'.dependencies] crossbeam-channel = ">=0.5.15" env_logger = "0.11" libc = ">=0.2.39" libloading = "0.8" -log = "0.4.0" once_cell = "1.4.1" krun_display = { package = "krun-display", version = "0.1.0", path = "../display", optional = true, features = ["bindgen_clang_runtime"] } krun_input = { package = "krun-input", version = "0.1.0", path = "../input", optional = true, features = ["bindgen_clang_runtime"] } +rand = "0.9.2" devices = { package = "krun-devices", version = "=0.1.0-1.18.0", path = "../devices" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } -rand = "0.9.2" [target.'cfg(target_os = "macos")'.dependencies] hvf = { package = "krun-hvf", version = "=0.1.0-1.18.0", path = "../hvf" } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index ce13b10e2..7f696e0c3 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -6,7 +6,14 @@ extern crate log; #[cfg(target_os = "windows")] mod windows_api; +// ── Unix C API implementation ──────────────────────────────────────────────── +// On Windows, the entire C API is implemented in windows_api.rs, which +// delegates to vmm::windows::* directly. Everything below (imports, types, +// statics, and all #[no_mangle] krun_* functions) is the upstream Unix +// C API implementation — gated out on Windows by this single module cfg. #[cfg(not(target_os = "windows"))] +mod unix_api { + use crossbeam_channel::unbounded; #[cfg(feature = "blk")] use devices::virtio::CacheType; @@ -72,12 +79,9 @@ use devices::virtio::display::{DisplayInfoEdid, MAX_DISPLAYS, PhysicalSize}; #[cfg(feature = "input")] use krun_input::{InputConfigBackend, InputEventProviderBackend}; -// Value returned on success. We use libc's errors otherwise. const KRUN_SUCCESS: i32 = 0; -// Maximum number of arguments/environment variables we allow const MAX_ARGS: usize = 4096; -// krunfw library name for each context #[cfg(all(target_os = "linux", not(feature = "tee")))] const KRUNFW_NAME: &str = "libkrunfw.so.5"; #[cfg(all(target_os = "linux", feature = "amd-sev"))] @@ -87,10 +91,9 @@ const KRUNFW_NAME: &str = "libkrunfw-tdx.so.5"; #[cfg(target_os = "macos")] const KRUNFW_NAME: &str = "libkrunfw.5.dylib"; -#[cfg(feature = "aws-nitro")] +#[cfg(feature = "nitro")] static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); -// Path to the init binary to be executed inside the VM. const INIT_PATH: &str = "/init.krun"; static KRUNFW: LazyLock> = @@ -2789,13 +2792,34 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { } } +#[cfg(feature = "nitro")] +#[no_mangle] +fn krun_start_enter_nitro(ctx_id: u32) -> i32 { + let ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { + Some(ctx_cfg) => ctx_cfg, + None => return -libc::ENOENT, + }; + + let Ok(enclave) = NitroEnclave::try_from(ctx_cfg) else { + return -libc::EINVAL; + }; + + match enclave.run() { + Ok(ret) => ret, + Err(e) => { + error!("Error running nitro enclave: {e}"); + + -libc::EINVAL + } + } +} + // ============================================================================ // New functions — Unix stubs (full implementations in windows_api.rs) // ============================================================================ /// Start VM on a background thread (non-blocking). /// Not yet implemented on Unix — use krun_start_enter() instead. -#[cfg(not(target_os = "windows"))] #[no_mangle] pub extern "C" fn krun_start(_ctx_id: u32) -> i32 { -libc::ENOSYS @@ -2803,7 +2827,6 @@ pub extern "C" fn krun_start(_ctx_id: u32) -> i32 { /// Block until a running VM exits. Returns exit code. /// Not yet implemented on Unix. -#[cfg(not(target_os = "windows"))] #[no_mangle] pub extern "C" fn krun_wait(_ctx_id: u32) -> i32 { -libc::ENOSYS @@ -2811,7 +2834,6 @@ pub extern "C" fn krun_wait(_ctx_id: u32) -> i32 { /// Request a running VM to stop (non-blocking). /// Not yet implemented on Unix. -#[cfg(not(target_os = "windows"))] #[no_mangle] pub extern "C" fn krun_stop(_ctx_id: u32) -> i32 { -libc::ENOSYS @@ -2819,7 +2841,6 @@ pub extern "C" fn krun_stop(_ctx_id: u32) -> i32 { /// Get captured console output for a VM. /// Not yet implemented on Unix. -#[cfg(not(target_os = "windows"))] #[no_mangle] pub unsafe extern "C" fn krun_get_console_output( _ctx_id: u32, @@ -2831,7 +2852,6 @@ pub unsafe extern "C" fn krun_get_console_output( /// Add a network device backed by a TCP endpoint. /// Not yet implemented on Unix — use krun_add_net_unixstream/unixgram instead. -#[cfg(not(target_os = "windows"))] #[no_mangle] pub unsafe extern "C" fn krun_add_net( _ctx_id: u32, @@ -2841,26 +2861,4 @@ pub unsafe extern "C" fn krun_add_net( -libc::ENOSYS } -// ============================================================================ - -#[cfg(feature = "nitro")] -#[no_mangle] -fn krun_start_enter_nitro(ctx_id: u32) -> i32 { - let ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { - Some(ctx_cfg) => ctx_cfg, - None => return -libc::ENOENT, - }; - - let Ok(enclave) = NitroEnclave::try_from(ctx_cfg) else { - return -libc::EINVAL; - }; - - match enclave.run() { - Ok(ret) => ret, - Err(e) => { - error!("Error running nitro enclave: {e}"); - - -libc::EINVAL - } - } -} +} // mod unix_api diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 10e3d3674..99362e29f 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -23,8 +23,19 @@ aws-nitro = [] crossbeam-channel = ">=0.5.15" flate2 = "1.0.35" libc = ">=0.2.39" -linux-loader = { version = "0.13.2", features = ["bzimage", "elf", "pe"] } log = "0.4.0" + +# Dependencies for amd-sev +kbs-types = { version = "0.13.0", optional = true } +serde = { version = "1.0.125", optional = true } +serde_json = { version = "1.0.64", optional = true } +iocuddle = { version = "0.1.1", optional = true } +bitfield = { version = "0.19.4", optional = true } +bitflags = { version = "2.10.0", optional = true } + +# Unix-only dependencies (macOS + Linux) — the upstream Vmm infrastructure +[target.'cfg(unix)'.dependencies] +linux-loader = { version = "0.13.0", features = ["bzimage", "elf", "pe"] } nix = { version = "0.30.1", features = ["fs", "term"] } vm-memory = { version = "0.17.0", features = ["backend-mmap"] } vmm-sys-util = "0.14" @@ -38,19 +49,14 @@ kernel = { package = "krun-kernel", version = "=0.1.0-1.18.0", path = "../kernel utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } -# Dependencies for amd-sev -kbs-types = { version = "0.13.0", optional = true } -serde = { version = "1.0.125", optional = true } -serde_json = { version = "1.0.64", optional = true } -iocuddle = { version = "0.1.1", optional = true } -bitfield = { version = "0.19.4", optional = true } -bitflags = { version = "2.10.0", optional = true } - [target.'cfg(target_arch = "x86_64")'.dependencies] bzip2 = "0.5" -cpuid = { package = "krun-cpuid", version = "=0.1.0-1.18.0", path = "../cpuid" } zstd = "0.13" +# cpuid is needed on Unix x86_64 only (upstream VMM uses it) +[target.'cfg(all(target_arch = "x86_64", unix))'.dependencies] +cpuid = { path = "../cpuid" } + [target.'cfg(target_os = "linux")'.dependencies] tdx = { version = "0.1.0", optional = true } kvm-bindings = { version = "0.12", features = ["fam-wrappers"] } @@ -59,5 +65,14 @@ kvm-ioctls = "0.22" [target.'cfg(target_os = "macos")'.dependencies] hvf = { package = "krun-hvf", version = "=0.1.0-1.18.0", path = "../hvf" } +# Windows-only dependencies (WHPX backend) +[target.'cfg(target_os = "windows")'.dependencies] +windows-sys = { version = "0.61", features = [ + "Win32_System_Hypervisor", + "Win32_System_Memory", + "Win32_System_LibraryLoader", +] } +zerocopy = { version = "0.8", features = ["derive"] } +rand = "0.9" + [dev-dependencies] -devices = { package = "krun-devices", version = "=0.1.0-1.18.0", path = "../devices", features = ["test_utils"] } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 598da766a..c977b9db3 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -13,15 +13,23 @@ #[macro_use] extern crate log; -/// Handles setup and initialization a `Vmm` object. +// ── Windows WHPX backend ───────────────────────────────────────────────────── +// Self-contained module — does NOT use the upstream Vmm infrastructure. +#[cfg(target_os = "windows")] +pub mod windows; + +// ── Unix (Linux + macOS) upstream VMM infrastructure ───────────────────────── +// Everything below is the original Firecracker/libkrun VMM code that depends +// on KVM (Linux), Hypervisor.framework (macOS), and POSIX APIs. +#[cfg(unix)] pub mod builder; +#[cfg(unix)] pub(crate) mod device_manager; -/// Resource store for configured microVM resources. +#[cfg(unix)] pub mod resources; -/// Signal handling utilities. #[cfg(target_os = "linux")] pub mod signal_handler; -/// Wrappers over structures used to configure the VMM. +#[cfg(unix)] pub mod vmm_config; #[cfg(target_os = "linux")] @@ -30,41 +38,57 @@ mod linux; use crate::linux::vstate; #[cfg(target_os = "macos")] mod macos; -#[cfg(target_os = "windows")] -pub mod windows; +#[cfg(unix)] mod terminal; +#[cfg(unix)] pub mod worker; #[cfg(target_os = "macos")] use macos::vstate; +#[cfg(unix)] use std::fmt::{Display, Formatter}; +#[cfg(unix)] use std::io; +#[cfg(unix)] use std::os::unix::io::AsRawFd; +#[cfg(unix)] use std::sync::atomic::{AtomicI32, Ordering}; +#[cfg(unix)] use std::sync::{Arc, Mutex}; #[cfg(target_os = "linux")] use std::time::Duration; -#[cfg(target_arch = "x86_64")] +#[cfg(all(unix, target_arch = "x86_64"))] use crate::device_manager::legacy::PortIODeviceManager; +#[cfg(unix)] use crate::device_manager::mmio::MMIODeviceManager; #[cfg(target_os = "linux")] use crate::vstate::VcpuEvent; +#[cfg(unix)] use crate::vstate::{Vcpu, VcpuHandle, VcpuResponse, Vm}; +#[cfg(unix)] use arch::{ArchMemoryInfo, InitrdConfig}; #[cfg(target_os = "macos")] use crossbeam_channel::Sender; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use devices::fdt; +#[cfg(unix)] use devices::legacy::IrqChip; +#[cfg(unix)] use devices::virtio::VmmExitObserver; +#[cfg(unix)] use devices::{BusDevice, DeviceType}; +#[cfg(unix)] use kernel::cmdline::Cmdline as KernelCmdline; +#[cfg(unix)] use polly::event_manager::{self, EventManager, Subscriber}; +#[cfg(unix)] use utils::epoll::{EpollEvent, EventSet}; +#[cfg(unix)] use utils::eventfd::EventFd; +#[cfg(unix)] use vm_memory::GuestMemoryMmap; /// Success exit code. @@ -87,6 +111,7 @@ pub const FC_EXIT_CODE_ARG_PARSING: u8 = 153; /// Errors associated with the VMM internal logic. These errors cannot be generated by direct user /// input, but can result from bad configuration of the host (for example if Firecracker doesn't /// have permissions to open the KVM fd). +#[cfg(unix)] #[derive(Debug)] pub enum Error { /// This error is thrown by the minimal boot loader implementation. @@ -139,6 +164,7 @@ pub enum Error { VmmObserverTeardown(utils::errno::Error), } +#[cfg(unix)] impl Display for Error { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { use self::Error::*; @@ -179,6 +205,7 @@ impl Display for Error { } /// Trait for objects that need custom initialization and teardown during the Vmm lifetime. +#[cfg(unix)] pub trait VmmEventsObserver { /// This function will be called during microVm boot. fn on_vmm_boot(&mut self) -> std::result::Result<(), utils::errno::Error> { @@ -191,9 +218,11 @@ pub trait VmmEventsObserver { } /// Shorthand result type for internal VMM commands. +#[cfg(unix)] pub type Result = std::result::Result; /// Contains the state and associated methods required for the Firecracker VMM. +#[cfg(unix)] pub struct Vmm { // Guest VM core resources. guest_memory: GuestMemoryMmap, @@ -213,6 +242,7 @@ pub struct Vmm { pio_device_manager: PortIODeviceManager, } +#[cfg(unix)] impl Vmm { /// Gets the the specified bus device. pub fn get_bus_device( @@ -398,6 +428,7 @@ impl Vmm { } } +#[cfg(unix)] impl Subscriber for Vmm { /// Handle a read event (EPOLLIN). fn process(&mut self, event: &EpollEvent, _: &mut EventManager) { From 2fe23a8cf578527d4dd35809f70be27e71884d8e Mon Sep 17 00:00:00 2001 From: lile Date: Fri, 17 Apr 2026 09:22:55 +0800 Subject: [PATCH 04/33] feat(vmm): intercept MSR/CPUID exits to fix WHPX triple fault during Linux boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable WHvPartitionPropertyCodeExtendedVmExits to intercept RDMSR/WRMSR and CPUID instructions before they fault. Without this, unrecognized MSR accesses during early kernel boot cause #GP → double fault → triple fault (WHvRunVpExitReasonUnrecoverableException). Return safe defaults (0 for MSR reads, host pass-through for CPUID) so the kernel proceeds past hardware probing. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/runner.rs | 43 ++++++++++++ src/vmm/src/windows/types.rs | 23 ++++++ src/vmm/src/windows/whpx.rs | 127 ++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+) diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 0e7f3c370..e013a2e51 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -103,6 +103,7 @@ mod imp { // Create partition (no APIC emulation — avoids crash on some Win10 hardware). let partition = WhpxPartition::new()?; partition.set_processor_count(ctx.num_vcpus as u32)?; + partition.set_extended_vm_exits(true, true)?; partition.setup()?; // Allocate and map guest memory. @@ -243,6 +244,48 @@ mod imp { break; } } + VcpuExit::MsrAccess { msr_number, is_write, rax, rdx } => { + halt_count = 0; + if is_write { + log::trace!( + "MSR write: 0x{:08X} <- 0x{:016X}", + msr_number, + (rdx << 32) | (rax & 0xFFFF_FFFF) + ); + vcpu.skip_instruction()?; + } else { + log::trace!("MSR read: 0x{:08X} -> 0", msr_number); + vcpu.complete_msr_read(0)?; + } + } + VcpuExit::CpuidAccess { + rax, + rcx, + default_rax, + default_rbx, + default_rcx, + default_rdx, + } => { + halt_count = 0; + log::trace!("CPUID leaf=0x{:X} sub=0x{:X}", rax, rcx); + vcpu.complete_cpuid(default_rax, default_rbx, default_rcx, default_rdx)?; + } + VcpuExit::UnrecoverableException => { + let regs = vcpu.get_registers().ok(); + let sregs = vcpu.get_special_registers().ok(); + log::error!( + "Unrecoverable exception (triple fault) after {} exits. \ + RIP={:#X}, CR0={:#X}, CR3={:#X}, CR4={:#X}, EFER={:#X}", + exit_count, + regs.as_ref().map_or(0, |r| r.rip), + sregs.as_ref().map_or(0, |s| s.cr0), + sregs.as_ref().map_or(0, |s| s.cr3), + sregs.as_ref().map_or(0, |s| s.cr4), + sregs.as_ref().map_or(0, |s| s.efer), + ); + exit_code = -1; + break; + } VcpuExit::Unknown(reason) => { log::error!( "Unknown vCPU exit reason {} after {} exits", diff --git a/src/vmm/src/windows/types.rs b/src/vmm/src/windows/types.rs index e60a33a39..a1005bfd9 100644 --- a/src/vmm/src/windows/types.rs +++ b/src/vmm/src/windows/types.rs @@ -83,6 +83,29 @@ pub enum VcpuExit { Cancelled, /// Interrupt window available (guest RFLAGS.IF became 1). InterruptWindow, + /// Guest executed RDMSR/WRMSR (requires ExtendedVmExits.X64MsrExit). + MsrAccess { + msr_number: u32, + is_write: bool, + /// RAX value (contains write data for WRMSR, undefined for RDMSR). + rax: u64, + /// RDX value (contains write data for WRMSR, undefined for RDMSR). + rdx: u64, + }, + /// Guest executed CPUID (requires ExtendedVmExits.X64CpuidExit). + CpuidAccess { + /// Input: EAX (leaf). + rax: u64, + /// Input: ECX (sub-leaf). + rcx: u64, + /// Default results from host CPUID (pass-through values from WHPX). + default_rax: u64, + default_rbx: u64, + default_rcx: u64, + default_rdx: u64, + }, + /// Unrecoverable guest exception (triple fault). + UnrecoverableException, /// Exit reason not handled. Unknown(u32), } diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs index d76435da2..7fba6383a 100644 --- a/src/vmm/src/windows/whpx.rs +++ b/src/vmm/src/windows/whpx.rs @@ -61,6 +61,18 @@ mod imp { bits & 0x3 } + /// Bitfield constants for WHV_EXTENDED_VM_EXITS. + /// Bit 0 = X64CpuidExit, Bit 1 = X64MsrExit. + const EXTENDED_VM_EXITS_CPUID: u64 = 1 << 0; + const EXTENDED_VM_EXITS_MSR: u64 = 1 << 1; + + /// Bitfield accessor for WHV_X64_MSR_ACCESS_INFO. + /// Bit 0 = IsWrite. + fn msr_access_is_write(info: &WHV_X64_MSR_ACCESS_INFO) -> bool { + let bits = unsafe { info.Anonymous._bitfield }; + (bits & 1) != 0 + } + /// A WHPX partition (VM container). /// /// Wraps `WHV_PARTITION_HANDLE` and manages its lifecycle. @@ -144,6 +156,35 @@ mod imp { check_hresult("WHvSetPartitionProperty(LocalApicEmulationMode)", hr) } + /// Enable extended VM exits for MSR and/or CPUID interception. + /// + /// Must be called before [`setup()`]. When enabled, guest RDMSR/WRMSR + /// and CPUID instructions cause VM exits instead of being handled + /// by the hypervisor directly. This is required for Linux kernel boot + /// on WHPX — without it, MSR accesses to unrecognized registers cause + /// #GP faults that cascade into triple faults. + pub fn set_extended_vm_exits(&self, msr_exit: bool, cpuid_exit: bool) -> Result<()> { + let mut bits: u64 = 0; + if cpuid_exit { + bits |= EXTENDED_VM_EXITS_CPUID; + } + if msr_exit { + bits |= EXTENDED_VM_EXITS_MSR; + } + let property = WHV_PARTITION_PROPERTY { + ExtendedVmExits: WHV_EXTENDED_VM_EXITS { AsUINT64: bits }, + }; + let hr = unsafe { + WHvSetPartitionProperty( + self.handle, + WHvPartitionPropertyCodeExtendedVmExits, + &property as *const _ as *const std::ffi::c_void, + std::mem::size_of::() as u32, + ) + }; + check_hresult("WHvSetPartitionProperty(ExtendedVmExits)", hr) + } + /// Finalize the partition configuration. Must be called before creating /// virtual processors or mapping memory. pub fn setup(&self) -> Result<()> { @@ -587,6 +628,29 @@ mod imp { Ok(VcpuExit::Halt) } else if reason == WHvRunVpExitReasonCanceled { Ok(VcpuExit::Cancelled) + } else if reason == WHvRunVpExitReasonX64MsrAccess { + // SAFETY: ExitReason is MsrAccess, so the MsrAccess union field is valid. + let msr_ctx = unsafe { &exit_context.Anonymous.MsrAccess }; + let is_write = msr_access_is_write(&msr_ctx.AccessInfo); + Ok(VcpuExit::MsrAccess { + msr_number: msr_ctx.MsrNumber, + is_write, + rax: msr_ctx.Rax, + rdx: msr_ctx.Rdx, + }) + } else if reason == WHvRunVpExitReasonX64Cpuid { + // SAFETY: ExitReason is CpuidAccess, so the CpuidAccess union field is valid. + let cpuid_ctx = unsafe { &exit_context.Anonymous.CpuidAccess }; + Ok(VcpuExit::CpuidAccess { + rax: cpuid_ctx.Rax, + rcx: cpuid_ctx.Rcx, + default_rax: cpuid_ctx.DefaultResultRax, + default_rbx: cpuid_ctx.DefaultResultRbx, + default_rcx: cpuid_ctx.DefaultResultRcx, + default_rdx: cpuid_ctx.DefaultResultRdx, + }) + } else if reason == WHvRunVpExitReasonUnrecoverableException { + Ok(VcpuExit::UnrecoverableException) } else if reason == WHvRunVpExitReasonNone { Ok(VcpuExit::Shutdown) } else { @@ -628,6 +692,69 @@ mod imp { check_hresult("WHvSetVirtualProcessorRegisters(skip)", hr) } + /// Complete an MSR read (RDMSR): inject result into RAX:RDX and advance RIP. + /// + /// For RDMSR, the 64-bit result is split: low 32 bits in EAX, high 32 in EDX. + /// Call after handling [`VcpuExit::MsrAccess`] where `is_write == false`. + pub fn complete_msr_read(&self, value: u64) -> Result<()> { + let instruction_len = self.exit_instruction_len.get(); + let regs = self.get_registers()?; + let new_rip = regs.rip + instruction_len as u64; + let new_rax = value & 0xFFFF_FFFF; + let new_rdx = value >> 32; + + let names = [WHvX64RegisterRip, WHvX64RegisterRax, WHvX64RegisterRdx]; + let values: Vec = vec![ + reg64(new_rip), + reg64(new_rax), + reg64(new_rdx), + ]; + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 3, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(msr_read)", hr) + } + + /// Complete a CPUID exit: inject results into RAX/RBX/RCX/RDX and advance RIP. + /// + /// Call after handling [`VcpuExit::CpuidAccess`]. + pub fn complete_cpuid(&self, rax: u64, rbx: u64, rcx: u64, rdx: u64) -> Result<()> { + let instruction_len = self.exit_instruction_len.get(); + let regs = self.get_registers()?; + let new_rip = regs.rip + instruction_len as u64; + + let names = [ + WHvX64RegisterRip, + WHvX64RegisterRax, + WHvX64RegisterRbx, + WHvX64RegisterRcx, + WHvX64RegisterRdx, + ]; + let values: Vec = vec![ + reg64(new_rip), + reg64(rax), + reg64(rbx), + reg64(rcx), + reg64(rdx), + ]; + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 5, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(cpuid)", hr) + } + /// Complete an I/O IN operation: inject data into RAX and advance RIP. /// /// Preserves upper RAX bits based on the I/O access size: From d052c0857b38acbf6124db6e5d9cfe22c7f5e5a8 Mon Sep 17 00:00:00 2001 From: lile Date: Fri, 17 Apr 2026 14:27:19 +0800 Subject: [PATCH 05/33] feat(vmm): complete WHPX device emulation for Linux kernel boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PIT (8254 Programmable Interval Timer): - Time-based decrementing counter via ns_accumulator (was returning static reload value, causing calibration loops to hang) - BIOS defaults: counter 0 pre-programmed Mode 2, reload=0 (=65536, ~18.2 Hz) so timer interrupts fire before kernel programs the PIT - Tick all 3 counters (not just counter 0) so counter 2 reads work for pit_calibrate_tsc() - Handle reload=0 as 65536 per 8254 specification Device manager: - Port 0x61 bit 5 toggles on each read (was static 0x20), required by Linux pit_calibrate_tsc() which polls for bit changes - i8042 PS/2 controller stubs (ports 0x60/0x64 return 0x00) to prevent driver spin loops on empty buffers Runner (vCPU loop): - Hyper-V CPUID masking: leaf 1 ECX bit 31 cleared, leaves 0x40000000-0x400000FF return zeros — prevents guest from using broken Hyper-V enlightenments (synthetic timers, SynIC) - Progress reporting every 5 seconds with IO port statistics, MSR/CPUID counters, and console output size - GVA-to-GPA page table walker for debugging (unused but available) Smoke test: - examples/boot_kernel.rs: standalone binary using vmm runner directly - Usage: boot_kernel.exe [initrd] [-- extra-cmdline-args] Verified: Linux 6.12.80-0-virt boots to interactive shell in ~5 seconds on Win10 WHPX (MacBook Pro 2014, Haswell). Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1314 ++++++++++------- src/libkrun/Cargo.toml | 5 + src/vmm/Cargo.toml | 4 + src/vmm/examples/boot_kernel.rs | 105 ++ src/vmm/src/windows/devices/manager.rs | 43 +- src/vmm/src/windows/devices/pit.rs | 149 +- src/vmm/src/windows/devices/virtio/p9/mod.rs | 4 +- .../src/windows/devices/virtio/vsock/mod.rs | 8 +- src/vmm/src/windows/runner.rs | 115 +- 9 files changed, 1210 insertions(+), 537 deletions(-) create mode 100644 src/vmm/examples/boot_kernel.rs diff --git a/Cargo.lock b/Cargo.lock index e2c7a049c..250e46f6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,17 +2,26 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + [[package]] name = "adler2" -version = "2.0.1" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "aho-corasick" -version = "1.1.4" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] @@ -25,19 +34,19 @@ checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "annotate-snippets" -version = "0.11.5" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "710e8eae58854cdc1790fcb56cca04d712a17be849eeb81da2a724bf4bae2bc4" +checksum = "ccaf7e9dfbb6ab22c82e473cd1a8a7bd313c19a5b7e40970f3d89ef5a5c9e81e" dependencies = [ - "anstyle", "unicode-width", + "yansi-term", ] [[package]] name = "anstream" -version = "0.6.21" +version = "0.6.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" dependencies = [ "anstyle", "anstyle-parse", @@ -50,9 +59,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.14" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" @@ -65,35 +74,54 @@ dependencies = [ [[package]] name = "anstyle-query" -version = "1.1.5" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.11" +version = "3.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.102" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] +name = "arch" +version = "0.1.0" +dependencies = [ + "arch_gen", + "kvm-bindings", + "kvm-ioctls", + "libc", + "smbios", + "tdx", + "utils", + "vm-memory", + "vmm-sys-util 0.14.0", +] + +[[package]] +name = "arch_gen" +version = "0.1.0" [[package]] name = "async-trait" -version = "0.1.89" +version = "0.1.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", @@ -102,9 +130,24 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.5.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "backtrace" +version = "0.3.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets 0.52.6", +] [[package]] name = "base64" @@ -133,19 +176,39 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.72.1" +version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ "annotate-snippets", - "bitflags 2.11.0", + "bitflags 2.10.0", "cexpr", "clang-sys", "itertools", + "lazy_static", + "lazycell", "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 1.1.0", + "shlex", + "syn", +] + +[[package]] +name = "bindgen" +version = "0.72.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f72209734318d0b619a5e0f5129918b848c416e122a3c4ce054e03cb87b726f" +dependencies = [ + "bitflags 2.10.0", + "cexpr", + "clang-sys", + "itertools", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.1", "shlex", "syn", ] @@ -178,9 +241,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.11.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "block-buffer" @@ -193,9 +256,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.20.2" +version = "3.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" [[package]] name = "bzip2" @@ -218,20 +281,20 @@ dependencies = [ [[package]] name = "caps" -version = "0.5.6" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd1ddba47aba30b6a889298ad0109c3b8dcb0e8fc993b459daa7067d46f865e0" +checksum = "190baaad529bcfbde9e1a19022c42781bdb6ff9de25721abdb8fd98c0807730b" dependencies = [ "libc", + "thiserror 1.0.69", ] [[package]] name = "cc" -version = "1.2.57" +version = "1.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +checksum = "956a5e21988b87f372569b66183b78babf23ebc2e744b733e4350a752c4dafac" dependencies = [ - "find-msvc-tools", "jobserver", "libc", "shlex", @@ -243,14 +306,14 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom 7.1.3", + "nom", ] [[package]] name = "cfg-expr" -version = "0.20.7" +version = "0.15.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6b04e07d8080154ed4ac03546d9a2b303cc2fe1901ba0b35b301516e289368" +checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02" dependencies = [ "smallvec", "target-lexicon", @@ -258,9 +321,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.4" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "cfg_aliases" @@ -281,15 +344,15 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.5" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "convert_case" -version = "0.8.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baaaa0ecca5b51987b9423ccdc971514dd8b0bb7b4060b983d3664dad3f1f89f" +checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca" dependencies = [ "unicode-segmentation", ] @@ -299,6 +362,9 @@ name = "cookie-factory" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9885fa71e26b8ab7855e2ec7cae6e9b380edff76cd052e07c683a0319d51b3a2" +dependencies = [ + "futures", +] [[package]] name = "cpufeatures" @@ -309,11 +375,20 @@ dependencies = [ "libc", ] +[[package]] +name = "cpuid" +version = "0.1.0" +dependencies = [ + "kvm-bindings", + "kvm-ioctls", + "vmm-sys-util 0.14.0", +] + [[package]] name = "crc32fast" -version = "1.5.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if", ] @@ -335,14 +410,45 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crypto-common" -version = "0.1.7" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", "typenum", ] +[[package]] +name = "devices" +version = "0.1.0" +dependencies = [ + "arch", + "bitflags 1.3.2", + "caps", + "crossbeam-channel", + "hvf", + "imago", + "krun_display", + "krun_input", + "kvm-bindings", + "kvm-ioctls", + "libc", + "libloading", + "log", + "lru", + "nix 0.30.1", + "pipewire", + "polly", + "rand", + "rutabaga_gfx", + "thiserror 2.0.12", + "utils", + "virtio-bindings", + "vm-fdt", + "vm-memory", + "zerocopy", +] + [[package]] name = "digest" version = "0.10.7" @@ -361,9 +467,9 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "env_filter" -version = "1.0.0" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" dependencies = [ "log", "regex", @@ -371,9 +477,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.9" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" dependencies = [ "anstream", "anstyle", @@ -395,31 +501,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] name = "filetime" -version = "0.2.27" +version = "0.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" dependencies = [ "cfg-if", "libc", "libredox", + "windows-sys 0.60.2", ] -[[package]] -name = "find-msvc-tools" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" - [[package]] name = "flate2" -version = "1.1.9" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "miniz_oxide", @@ -427,9 +528,98 @@ dependencies = [ [[package]] name = "foldhash" -version = "0.2.0" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] [[package]] name = "generic-array" @@ -443,27 +633,33 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.3.4" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", "libc", "r-efi", - "wasip2", + "wasi", ] +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + [[package]] name = "glob" -version = "0.3.3" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" dependencies = [ "allocator-api2", "equivalent", @@ -476,11 +672,21 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hvf" +version = "0.1.0" +dependencies = [ + "arch", + "crossbeam-channel", + "libloading", + "log", +] + [[package]] name = "imago" -version = "0.2.2" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e8e4b92aa0dd860579cfba776dbf0918a3a7ac5cb601af7d3fc835e71592a5b" +checksum = "5a46885ecbabe024b9e6e3ee9ee3a32ad05adf2b6269f45aa912793d9c54c805" dependencies = [ "async-trait", "bincode", @@ -493,14 +699,14 @@ dependencies = [ "tokio", "tracing", "vm-memory", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] name = "indexmap" -version = "2.13.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown", @@ -514,43 +720,43 @@ checksum = "d8972d5be69940353d5347a1344cb375d9b457d6809b428b05bb1ca2fb9ce007" [[package]] name = "is_terminal_polyfill" -version = "1.70.2" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" -version = "0.13.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] [[package]] name = "itoa" -version = "1.0.18" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jiff" -version = "0.2.23" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +checksum = "a194df1107f33c79f4f93d02c80798520551949d59dfad22b6157048a88cca93" dependencies = [ "jiff-static", "log", "portable-atomic", "portable-atomic-util", - "serde_core", + "serde", ] [[package]] name = "jiff-static" -version = "0.2.23" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +checksum = "6c6e1db7ed32c6c71b759497fae34bf7933636f75a251b9e736555da426f6442" dependencies = [ "proc-macro2", "quote", @@ -559,9 +765,9 @@ dependencies = [ [[package]] name = "jobserver" -version = "0.1.34" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ "getrandom", "libc", @@ -569,9 +775,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.91" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ "once_cell", "wasm-bindgen", @@ -589,340 +795,188 @@ dependencies = [ "sha2", "sm3", "strum", - "thiserror 2.0.18", + "thiserror 2.0.12", ] [[package]] -name = "krun-arch" -version = "0.1.0-1.18.0" -dependencies = [ - "krun-arch-gen", - "krun-smbios", - "krun-utils", - "kvm-bindings", - "kvm-ioctls", - "libc", - "tdx", - "vm-memory", - "vmm-sys-util 0.14.0", -] - -[[package]] -name = "krun-arch-gen" -version = "0.1.0-1.18.0" - -[[package]] -name = "krun-aws-nitro" -version = "0.1.0-1.18.0" -dependencies = [ - "krun-devices", - "libc", - "log", - "nitro-enclaves 0.6.1", - "nix 0.30.1", - "signal-hook", - "tar", - "vsock", -] - -[[package]] -name = "krun-cpuid" -version = "0.1.0-1.18.0" -dependencies = [ - "kvm-bindings", - "kvm-ioctls", - "vmm-sys-util 0.14.0", -] - -[[package]] -name = "krun-devices" -version = "0.1.0-1.18.0" +name = "kernel" +version = "0.1.0" dependencies = [ - "bitflags 1.3.2", - "caps", - "crossbeam-channel", - "imago", - "krun-arch", - "krun-display", - "krun-hvf", - "krun-input", - "krun-polly", - "krun-rutabaga-gfx", - "krun-utils", - "kvm-bindings", - "kvm-ioctls", - "libc", - "libloading", - "log", - "lru", - "nix 0.30.1", - "pipewire", - "rand", - "thiserror 2.0.18", - "virtio-bindings", - "vm-fdt", + "utils", "vm-memory", - "zerocopy", ] [[package]] -name = "krun-display" +name = "krun_display" version = "0.1.0" dependencies = [ - "bindgen", - "bitflags 2.11.0", + "bindgen 0.72.0", + "bitflags 2.10.0", "log", "static_assertions", - "thiserror 2.0.18", + "thiserror 2.0.12", ] [[package]] -name = "krun-hvf" -version = "0.1.0-1.18.0" -dependencies = [ - "crossbeam-channel", - "krun-arch", - "libloading", - "log", -] - -[[package]] -name = "krun-input" +name = "krun_input" version = "0.1.0" dependencies = [ - "bindgen", - "bitflags 2.11.0", + "bindgen 0.72.0", + "bitflags 2.10.0", "libc", "log", "static_assertions", - "thiserror 2.0.18", + "thiserror 2.0.12", ] [[package]] -name = "krun-kernel" -version = "0.1.0-1.18.0" -dependencies = [ - "krun-utils", - "vm-memory", -] - -[[package]] -name = "krun-polly" -version = "0.1.0-1.18.0" -dependencies = [ - "krun-utils", - "libc", -] - -[[package]] -name = "krun-rutabaga-gfx" -version = "0.1.0-1.18.0" -dependencies = [ - "anyhow", - "cfg-if", - "libc", - "log", - "nix 0.30.1", - "pkg-config", - "remain", - "thiserror 1.0.69", - "vmm-sys-util 0.14.0", - "winapi", - "zerocopy", -] - -[[package]] -name = "krun-smbios" -version = "0.1.0-1.18.0" -dependencies = [ - "vm-memory", -] - -[[package]] -name = "krun-utils" -version = "0.1.0-1.18.0" +name = "kvm-bindings" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b153a59bb3ca930ff8148655b2ef68c34259a623ae08cf2fb9b570b2e45363" dependencies = [ - "bitflags 1.3.2", - "crossbeam-channel", - "kvm-bindings", - "libc", - "log", - "nix 0.30.1", "vmm-sys-util 0.14.0", ] [[package]] -name = "krun-vmm" -version = "0.1.0-1.18.0" +name = "kvm-ioctls" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b702df98508cb63ad89dd9beb9f6409761b30edca10d48e57941d3f11513a006" dependencies = [ - "bitfield", - "bitflags 2.11.0", - "bzip2", - "crossbeam-channel", - "flate2", - "iocuddle", - "kbs-types", - "krun-arch", - "krun-arch-gen", - "krun-cpuid", - "krun-devices", - "krun-display", - "krun-hvf", - "krun-input", - "krun-kernel", - "krun-polly", - "krun-utils", + "bitflags 2.10.0", "kvm-bindings", - "kvm-ioctls", "libc", - "linux-loader", - "log", - "nix 0.30.1", - "serde", - "serde_json", - "tdx", - "vm-memory", "vmm-sys-util 0.14.0", - "zstd", ] [[package]] -name = "kvm-bindings" -version = "0.12.1" +name = "lazy_static" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a537873e15e8daabb416667e606d9b0abc2a8fb9a45bd5853b888ae0ead82f9" -dependencies = [ - "vmm-sys-util 0.14.0", -] +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] -name = "kvm-ioctls" -version = "0.22.1" +name = "lazycell" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8f7370330b4f57981e300fa39b02088f2f2a5c2d0f1f994e8090589619c56d" -dependencies = [ - "bitflags 2.11.0", - "kvm-bindings", - "libc", - "vmm-sys-util 0.14.0", -] +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.183" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libkrun" -version = "1.18.0" +version = "1.16.0" dependencies = [ "crossbeam-channel", + "devices", "env_logger", - "krun-aws-nitro", - "krun-devices", - "krun-display", - "krun-hvf", - "krun-input", - "krun-polly", - "krun-utils", - "krun-vmm", + "hvf", + "krun_display", + "krun_input", "kvm-bindings", "kvm-ioctls", "libc", "libloading", "log", - "nitro-enclaves 0.5.0", + "nitro", + "nitro-enclaves", "once_cell", + "polly", "rand", + "utils", "vm-memory", + "vmm", ] [[package]] name = "libloading" -version = "0.8.9" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-link", + "windows-targets 0.53.5", ] [[package]] name = "libredox" -version = "0.1.14" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", "libc", - "plain", "redox_syscall", ] [[package]] name = "libspa" -version = "0.9.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6b8cfa2a7656627b4c92c6b9ef929433acd673d5ab3708cda1b18478ac00df4" +checksum = "65f3a4b81b2a2d8c7f300643676202debd1b7c929dbf5c9bb89402ea11d19810" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", "cc", "convert_case", "cookie-factory", "libc", "libspa-sys", - "nix 0.30.1", - "nom 8.0.0", + "nix 0.27.1", + "nom", "system-deps", ] [[package]] name = "libspa-sys" -version = "0.9.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901049455d2eb6decf9058235d745237952f4804bc584c5fcb41412e6adcc6e0" +checksum = "bf0d9716420364790e85cbb9d3ac2c950bde16a7dd36f3209b7dfdfc4a24d01f" dependencies = [ - "bindgen", + "bindgen 0.69.5", "cc", "system-deps", ] [[package]] name = "linux-loader" -version = "0.13.2" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de72cb02c55ecffcf75fe78295926f872eb6eb0a58d629c58a8c324dc26380f6" +checksum = "870c3814345f050991f99869417779f6062542bcf4ed81db7a1b926ad1306638" dependencies = [ "vm-memory", ] [[package]] name = "linux-raw-sys" -version = "0.12.1" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "log" -version = "0.4.29" +version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "lru" -version = "0.16.3" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +checksum = "9f8cc7106155f10bdf99a6f379688f543ad6596a415375b36a59a054ceda1198" dependencies = [ "hashbrown", ] [[package]] name = "memchr" -version = "2.8.0" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memoffset" @@ -950,36 +1004,36 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.9" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" dependencies = [ "adler2", "simd-adler32", ] [[package]] -name = "nitro-enclaves" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b5b539a76e3f555fb143c3e67d5e05fa1d5fece02a515f6ecf41b3f1a081f58" +name = "nitro" +version = "0.1.0" dependencies = [ - "bitflags 2.11.0", + "devices", "libc", - "nix 0.26.4", - "rand", + "nitro-enclaves", + "nix 0.30.1", + "tar", "vsock", ] [[package]] name = "nitro-enclaves" -version = "0.6.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6436c562bcdb6f192e0e59f627bff5b0b88f2e1c48264079f4f1d6da42bec2d" +checksum = "2b5b539a76e3f555fb143c3e67d5e05fa1d5fece02a515f6ecf41b3f1a081f58" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", "libc", "nix 0.26.4", + "rand", "vsock", ] @@ -998,11 +1052,22 @@ dependencies = [ [[package]] name = "nix" -version = "0.30.1" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "libc", +] + +[[package]] +name = "nix" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", "cfg-if", "cfg_aliases", "libc", @@ -1011,11 +1076,11 @@ dependencies = [ [[package]] name = "nix" -version = "0.31.2" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d6d0705320c1e6ba1d912b5e37cf18071b6c2e9b7fa8215a1e8a7651966f5d3" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", "cfg-if", "cfg_aliases", "libc", @@ -1033,25 +1098,25 @@ dependencies = [ ] [[package]] -name = "nom" -version = "8.0.0" +name = "object" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.21.4" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.2" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" [[package]] name = "page_size" @@ -1065,9 +1130,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.17" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -1077,28 +1142,28 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipewire" -version = "0.9.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9688b89abf11d756499f7c6190711d6dbe5a3acdb30c8fbf001d6596d06a8d44" +checksum = "08e645ba5c45109106d56610b3ee60eb13a6f2beb8b74f8dc8186cf261788dda" dependencies = [ "anyhow", - "bitflags 2.11.0", + "bitflags 2.10.0", "libc", "libspa", "libspa-sys", - "nix 0.30.1", + "nix 0.27.1", "once_cell", "pipewire-sys", - "thiserror 2.0.18", + "thiserror 1.0.69", ] [[package]] name = "pipewire-sys" -version = "0.9.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb028afee0d6ca17020b090e3b8fa2d7de23305aef975c7e5192a5050246ea36" +checksum = "849e188f90b1dda88fe2bfe1ad31fe5f158af2c98f80fb5d13726c44f3f01112" dependencies = [ - "bindgen", + "bindgen 0.69.5", "libspa-sys", "system-deps", ] @@ -1110,22 +1175,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] -name = "plain" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +name = "polly" +version = "0.0.1" +dependencies = [ + "libc", + "utils", +] [[package]] name = "portable-atomic" -version = "1.13.1" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" [[package]] name = "portable-atomic-util" -version = "0.2.6" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" dependencies = [ "portable-atomic", ] @@ -1141,27 +1208,27 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.106" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.45" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] [[package]] name = "r-efi" -version = "5.3.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" [[package]] name = "rand" @@ -1185,27 +1252,27 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.9.5" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ "getrandom", ] [[package]] name = "redox_syscall" -version = "0.7.3" +version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", ] [[package]] name = "regex" -version = "1.12.3" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -1215,9 +1282,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.14" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -1226,9 +1293,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "remain" @@ -1241,6 +1308,18 @@ dependencies = [ "syn", ] +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.1" @@ -1258,53 +1337,66 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.4" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] name = "rustversion" -version = "1.0.22" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" [[package]] -name = "semver" -version = "1.0.27" +name = "rutabaga_gfx" +version = "0.1.2" +dependencies = [ + "anyhow", + "cfg-if", + "libc", + "log", + "nix 0.30.1", + "pkg-config", + "remain", + "thiserror 1.0.69", + "vmm-sys-util 0.14.0", + "winapi", + "zerocopy", +] + +[[package]] +name = "ryu" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] -name = "serde" -version = "1.0.228" +name = "semver" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" [[package]] -name = "serde_core" -version = "1.0.228" +name = "serde" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.228" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", @@ -1313,24 +1405,23 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.149" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", + "ryu", "serde", - "serde_core", - "zmij", ] [[package]] name = "serde_spanned" -version = "1.1.1" +version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" dependencies = [ - "serde_core", + "serde", ] [[package]] @@ -1351,31 +1442,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] -name = "signal-hook" -version = "0.3.18" +name = "simd-adler32" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" -dependencies = [ - "libc", - "signal-hook-registry", -] +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" [[package]] -name = "signal-hook-registry" -version = "1.4.8" +name = "slab" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ - "errno", - "libc", + "autocfg", ] -[[package]] -name = "simd-adler32" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" - [[package]] name = "sm3" version = "0.4.2" @@ -1387,9 +1467,16 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" + +[[package]] +name = "smbios" +version = "0.1.0" +dependencies = [ + "vm-memory", +] [[package]] name = "static_assertions" @@ -1420,9 +1507,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" dependencies = [ "proc-macro2", "quote", @@ -1431,9 +1518,9 @@ dependencies = [ [[package]] name = "system-deps" -version = "7.0.8" +version = "6.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "396a35feb67335377e0251fcbc1092fc85c484bd4e3a7a54319399da127796e7" +checksum = "a3e535eb8dded36d55ec13eddacd30dec501792ff23a0b1682c38601b8cf2349" dependencies = [ "cfg-expr", "heck", @@ -1444,9 +1531,9 @@ dependencies = [ [[package]] name = "tar" -version = "0.4.45" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" dependencies = [ "filetime", "libc", @@ -1455,9 +1542,9 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.13.3" +version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tdx" @@ -1465,7 +1552,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad59e5bf374211a1fdd8e7439a07d5a5e617fe97f5cf21d03bcd1bf8c82b73af" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.10.0", "iocuddle", "kvm-bindings", "kvm-ioctls", @@ -1485,11 +1572,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.18" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ - "thiserror-impl 2.0.18", + "thiserror-impl 2.0.12", ] [[package]] @@ -1505,9 +1592,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.18" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", @@ -1516,57 +1603,53 @@ dependencies = [ [[package]] name = "tokio" -version = "1.50.0" +version = "1.45.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779" dependencies = [ + "backtrace", "pin-project-lite", ] [[package]] name = "toml" -version = "1.1.2+spec-1.1.0" +version = "0.8.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ - "indexmap", - "serde_core", + "serde", "serde_spanned", "toml_datetime", - "toml_parser", - "toml_writer", - "winnow", + "toml_edit", ] [[package]] name = "toml_datetime" -version = "1.1.1+spec-1.1.0" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" dependencies = [ - "serde_core", + "serde", ] [[package]] -name = "toml_parser" -version = "1.1.2+spec-1.1.0" +name = "toml_edit" +version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", "winnow", ] -[[package]] -name = "toml_writer" -version = "1.1.1+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" - [[package]] name = "tracing" -version = "0.1.44" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -1575,9 +1658,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.31" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +checksum = "1b1ffbcf9c6f6b99d386e7444eb608ba646ae452a36b39737deb9663b610f662" dependencies = [ "proc-macro2", "quote", @@ -1586,24 +1669,24 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.36" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" dependencies = [ "once_cell", ] [[package]] name = "typenum" -version = "1.19.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "unicode-ident" -version = "1.0.24" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-segmentation" @@ -1613,9 +1696,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.2.2" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unty" @@ -1629,11 +1712,24 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "utils" +version = "0.1.0" +dependencies = [ + "bitflags 1.3.2", + "crossbeam-channel", + "kvm-bindings", + "libc", + "log", + "nix 0.30.1", + "vmm-sys-util 0.14.0", +] + [[package]] name = "uuid" -version = "1.22.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ "js-sys", "wasm-bindgen", @@ -1641,9 +1737,9 @@ dependencies = [ [[package]] name = "version-compare" -version = "0.2.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c2856837ef78f57382f06b2b8563a2f512f7185d732608fd9176cb3b8edf0e" +checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b" [[package]] name = "version_check" @@ -1653,9 +1749,9 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "virtio-bindings" -version = "0.2.7" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "091f1f09cfbf2a78563b562e7a949465cce1aef63b6065645188d995162f8868" +checksum = "804f498a26d5a63be7bbb8bdcd3869c3f286c4c4a17108905276454da0caf8cb" [[package]] name = "virtue" @@ -1671,15 +1767,54 @@ checksum = "7e21282841a059bb62627ce8441c491f09603622cd5a21c43bfedc85a2952f23" [[package]] name = "vm-memory" -version = "0.17.1" +version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f39348a049689cabd3377cdd9182bf526ec76a6f823b79903896452e9d7a7380" +checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" dependencies = [ "libc", - "thiserror 2.0.18", + "thiserror 1.0.69", "winapi", ] +[[package]] +name = "vmm" +version = "0.1.0" +dependencies = [ + "arch", + "arch_gen", + "bitfield", + "bitflags 2.10.0", + "bzip2", + "cpuid", + "crossbeam-channel", + "devices", + "flate2", + "hvf", + "iocuddle", + "kbs-types", + "kernel", + "krun_display", + "krun_input", + "kvm-bindings", + "kvm-ioctls", + "libc", + "linux-loader", + "log", + "nix 0.30.1", + "polly", + "rand", + "serde", + "serde_json", + "tdx", + "thiserror 2.0.12", + "utils", + "vm-memory", + "vmm-sys-util 0.14.0", + "windows-sys 0.61.2", + "zerocopy", + "zstd", +] + [[package]] name = "vmm-sys-util" version = "0.12.1" @@ -1702,41 +1837,54 @@ dependencies = [ [[package]] name = "vsock" -version = "0.5.3" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b82aeb12ad864eb8cd26a6c21175d0bdc66d398584ee6c93c76964c3bcfc78ff" +checksum = "4e8b4d00e672f147fc86a09738fadb1445bd1c0a40542378dfb82909deeee688" dependencies = [ "libc", - "nix 0.31.2", + "nix 0.29.0", ] [[package]] -name = "wasip2" -version = "1.0.2+wasi-0.2.9" +name = "wasi" +version = "0.14.2+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" dependencies = [ - "wit-bindgen", + "wit-bindgen-rt", ] [[package]] name = "wasm-bindgen" -version = "0.2.114" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.114" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1744,22 +1892,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.114" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ - "bumpalo", "proc-macro2", "quote", "syn", + "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.114" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" dependencies = [ "unicode-ident", ] @@ -1792,6 +1940,24 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -1801,17 +1967,152 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + [[package]] name = "winnow" -version = "1.0.2" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0" +checksum = "c06928c8748d81b05c9be96aad92e1b6ff01833332f281e8cfca3be4b35fc9ec" +dependencies = [ + "memchr", +] [[package]] -name = "wit-bindgen" -version = "0.51.0" +name = "wit-bindgen-rt" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags 2.10.0", +] [[package]] name = "xattr" @@ -1823,32 +2124,35 @@ dependencies = [ "rustix", ] +[[package]] +name = "yansi-term" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe5c30ade05e61656247b2e334a031dfd0cc466fadef865bdcdea8d537951bf1" +dependencies = [ + "winapi", +] + [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.47" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" dependencies = [ "proc-macro2", "quote", "syn", ] -[[package]] -name = "zmij" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" - [[package]] name = "zstd" version = "0.13.3" @@ -1869,9 +2173,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.16+zstd.1.5.7" +version = "2.0.15+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" dependencies = [ "cc", "pkg-config", diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index 7868baba4..60ba04270 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -51,6 +51,11 @@ aws-nitro = { package = "krun-aws-nitro", version = "=0.1.0-1.18.0", path = "../ nitro-enclaves = { version = "0.5.0", optional = true } vm-memory = { version = "0.17", features = ["backend-mmap"] } +# Windows-only dependencies (WHPX C API) +[target.'cfg(target_os = "windows")'.dependencies] +env_logger = "0.11" +libc = ">=0.2.39" + [lib] name = "krun" crate-type = ["cdylib", "lib"] diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 99362e29f..131bb2f0f 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -67,6 +67,7 @@ hvf = { package = "krun-hvf", version = "=0.1.0-1.18.0", path = "../hvf" } # Windows-only dependencies (WHPX backend) [target.'cfg(target_os = "windows")'.dependencies] +thiserror = "2" windows-sys = { version = "0.61", features = [ "Win32_System_Hypervisor", "Win32_System_Memory", @@ -75,4 +76,7 @@ windows-sys = { version = "0.61", features = [ zerocopy = { version = "0.8", features = ["derive"] } rand = "0.9" +[target.'cfg(target_os = "windows")'.dev-dependencies] +env_logger = "0.11" + [dev-dependencies] diff --git a/src/vmm/examples/boot_kernel.rs b/src/vmm/examples/boot_kernel.rs new file mode 100644 index 000000000..fb54be877 --- /dev/null +++ b/src/vmm/examples/boot_kernel.rs @@ -0,0 +1,105 @@ +//! Smoke test: boot a Linux kernel inside a WHPX VM using the VMM runner. +//! +//! Usage: +//! boot_kernel.exe [initrd] [-- extra-cmdline-args...] +//! +//! Example: +//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 C:\kernels\initrd.img +//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 -- console=ttyS0 lpj=1000000 + +use std::path::PathBuf; + +fn main() { + // Initialize logging (RUST_LOG controls verbosity). + env_logger::init(); + + let args: Vec = std::env::args().collect(); + if args.len() < 2 { + eprintln!("Usage: {} [initrd] [-- extra-cmdline-args...]", args[0]); + std::process::exit(1); + } + + let kernel_path = PathBuf::from(&args[1]); + if !kernel_path.exists() { + eprintln!("Kernel not found: {}", kernel_path.display()); + std::process::exit(1); + } + + // Parse optional initrd and extra cmdline args. + let mut initrd_path: Option = None; + let mut extra_cmdline: Vec<&str> = Vec::new(); + let mut past_separator = false; + + for arg in &args[2..] { + if arg == "--" { + past_separator = true; + continue; + } + if past_separator { + extra_cmdline.push(arg); + } else if initrd_path.is_none() { + let p = PathBuf::from(arg); + if p.exists() { + initrd_path = Some(p); + } else { + eprintln!("Warning: initrd not found: {}, treating as cmdline arg", arg); + extra_cmdline.push(arg); + } + } else { + extra_cmdline.push(arg); + } + } + + // Build the VmContext via the C-API-style context functions. + let ctx_id = vmm::windows::context::create_ctx().expect("create_ctx failed"); + + vmm::windows::context::with_ctx_mut(ctx_id, |ctx| { + ctx.num_vcpus = 1; + ctx.ram_mib = 256; + ctx.kernel_path = Some(kernel_path.clone()); + ctx.initramfs_path = initrd_path.clone(); + + // Build kernel command line. + let mut cmdline_parts = vec![ + "console=ttyS0", + "earlyprintk=serial", + "noapic", + "nolapic", + "noacpi", + "nosmp", + "lpj=1000000", + "nokaslr", + "panic=-1", + ]; + cmdline_parts.extend(extra_cmdline.iter()); + ctx.kernel_cmdline = Some(cmdline_parts.join(" ")); + + Ok(()) + }) + .expect("configure ctx failed"); + + println!("=== WHPX Smoke Test ==="); + println!("Kernel: {}", kernel_path.display()); + println!( + "Initrd: {}", + initrd_path + .as_ref() + .map(|p| p.display().to_string()) + .unwrap_or_else(|| "(none)".to_string()) + ); + + // Take the context out of the global map and run synchronously. + let ctx = vmm::windows::context::take_ctx(ctx_id).expect("take_ctx failed"); + + println!("Starting VM..."); + match vmm::windows::runner::run(ctx) { + Ok(code) => { + println!("VM exited with code {}", code); + std::process::exit(code); + } + Err(e) => { + eprintln!("VM error: {}", e); + std::process::exit(1); + } + } +} diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 77885eddf..104472352 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -135,6 +135,11 @@ pub struct DeviceManager { window_requested: bool, /// Last PIT tick timestamp. last_tick: Instant, + /// Toggle state for port 0x61 bit 5 (PIT counter 2 output). + /// + /// Linux's `pit_calibrate_tsc()` loops reading port 0x61 waiting for + /// bit 5 to toggle. Without toggling, TSC calibration stalls forever. + port61_toggle: bool, } impl DeviceManager { @@ -236,6 +241,7 @@ impl DeviceManager { virtio_net, window_requested: false, last_tick: Instant::now(), + port61_toggle: false, }; Ok(DeviceSetup { @@ -256,8 +262,10 @@ impl DeviceManager { self.pic.raise_irq(4); } } else if self.pic.handles_port(port) { + log::trace!("PIC write: port={:#X} data={:#X}", port, data as u8); self.pic.write_port(port, data as u8); } else if self.pit.handles_port(port) { + log::trace!("PIT write: port={:#X} data={:#X}", port, data as u8); self.pit.write_port(port, data as u8); } else if port == 0x70 { self.cmos_addr = (data as u8) & 0x7F; @@ -284,9 +292,23 @@ impl DeviceManager { } else if (0xCF8..=0xCFF).contains(&port) { 0xFFFF_FFFF // PCI config: no devices. } else if port == 0x61 { - 0x20 // System control port B: timer 2 output high. + // System control port B: toggle bit 5 (PIT counter 2 output). + // + // Linux's `pit_calibrate_tsc()` reads this port in a tight loop + // waiting for bit 5 to change. A static value causes an infinite + // loop that stalls kernel boot. Toggling on each read lets the + // calibration complete. + self.port61_toggle = !self.port61_toggle; + if self.port61_toggle { 0x20 } else { 0x00 } } else if port == 0x92 { 0x02 // System control port A: A20 enabled. + } else if port == 0x60 || port == 0x64 { + // i8042 PS/2 controller: data (0x60) and status (0x64). + // + // Return 0x00 = both buffers empty, no pending data. + // Without this, the default 0xFF makes the i8042 driver spin in + // udelay() loops waiting for the input buffer to drain. + 0x00 } else { 0xFF // Default: no device. } @@ -460,6 +482,7 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { virtio_net: None, window_requested: false, last_tick: Instant::now(), + port61_toggle: false, } } @@ -531,9 +554,14 @@ mod tests { } #[test] - fn test_io_in_system_control_port_b() { + fn test_io_in_system_control_port_b_toggles() { let mut dm = make_test_devices(); - assert_eq!(dm.handle_io_in(0x61, 1), 0x20); + // Port 0x61 bit 5 toggles on each read. + let first = dm.handle_io_in(0x61, 1); + let second = dm.handle_io_in(0x61, 1); + assert_ne!(first, second, "bit 5 should toggle"); + let third = dm.handle_io_in(0x61, 1); + assert_eq!(first, third, "should cycle back"); } #[test] @@ -542,6 +570,15 @@ mod tests { assert_eq!(dm.handle_io_in(0x92, 1), 0x02); } + #[test] + fn test_io_in_i8042_status_empty() { + let mut dm = make_test_devices(); + // Port 0x64 (i8042 status): both buffers empty. + assert_eq!(dm.handle_io_in(0x64, 1), 0x00); + // Port 0x60 (i8042 data): no data. + assert_eq!(dm.handle_io_in(0x60, 1), 0x00); + } + #[test] fn test_io_in_unknown_port() { let mut dm = make_test_devices(); diff --git a/src/vmm/src/windows/devices/pit.rs b/src/vmm/src/windows/devices/pit.rs index 79ef191ce..1ffc0e003 100644 --- a/src/vmm/src/windows/devices/pit.rs +++ b/src/vmm/src/windows/devices/pit.rs @@ -140,7 +140,7 @@ impl PitCounter { /// Read a data byte from this counter's data port. fn read_data(&mut self) -> u8 { - let value = self.latched_value.unwrap_or(self.reload); + let value = self.latched_value.unwrap_or_else(|| self.current_count()); match self.access { AccessMode::Low => { @@ -168,18 +168,53 @@ impl PitCounter { /// Latch the current count value for reading. fn latch(&mut self) { if self.latched_value.is_none() { - self.latched_value = Some(self.reload); + self.latched_value = Some(self.current_count()); } } + /// Effective reload value: 0 means 65536 per 8254 specification. + /// + /// In the real 8254 PIT, a reload value of 0 is treated as 65536 + /// (the maximum 16-bit count). This matches BIOS behavior where the + /// PIT is initialized with reload=0 giving ~18.2 Hz. + fn effective_reload(&self) -> u64 { + if self.reload == 0 { 65536 } else { self.reload as u64 } + } + + /// Compute the current counter value based on accumulated time. + /// + /// A real 8254 counts down from the reload value to 0. Software + /// reads the counter (via latch or direct read) to measure elapsed + /// time. Without this, counter reads return the static reload value + /// and Linux calibration loops that poll the counter never terminate. + fn current_count(&self) -> u16 { + if !self.reload_ready { + // Counter not (yet) programmed, or Mode 0 finished (one-shot). + // Mode 0 after terminal count: counter sits at 0. + if matches!(self.mode, CounterMode::InterruptOnTerminal) { + return 0; + } + return self.reload; + } + let reload = self.effective_reload(); + // How many PIT ticks into the current reload cycle? + let ticks_in_period = + (self.ns_accumulator as u128 * PIT_FREQUENCY as u128) / NS_PER_SEC as u128; + let position = (ticks_in_period as u64) % reload; + // Counter counts down: reload → 0. + (reload - position) as u16 + } + /// Advance the counter by `elapsed_ns` nanoseconds. /// /// Returns the number of times the counter reached zero (fired). fn tick(&mut self, elapsed_ns: u64) -> u64 { - if !self.reload_ready || self.reload == 0 { + if !self.reload_ready { return 0; } + let reload = self.effective_reload(); + match self.mode { CounterMode::RateGenerator | CounterMode::SquareWave => { // Accumulate elapsed time. @@ -192,12 +227,12 @@ impl PitCounter { (self.ns_accumulator as u128 * PIT_FREQUENCY as u128) / NS_PER_SEC as u128; // How many full reload cycles is that? - let fires = total_ticks / self.reload as u128; + let fires = total_ticks / reload as u128; // Subtract consumed nanoseconds (keep remainder in accumulator). // consumed_ns = fires * reload * NS_PER_SEC / PIT_FREQUENCY let consumed_ns = - (fires * self.reload as u128 * NS_PER_SEC as u128) / PIT_FREQUENCY as u128; + (fires * reload as u128 * NS_PER_SEC as u128) / PIT_FREQUENCY as u128; self.ns_accumulator -= consumed_ns as u64; fires as u64 @@ -207,7 +242,7 @@ impl PitCounter { self.ns_accumulator += elapsed_ns; let total_ticks = (self.ns_accumulator as u128 * PIT_FREQUENCY as u128) / NS_PER_SEC as u128; - if total_ticks >= self.reload as u128 { + if total_ticks >= reload as u128 { self.reload_ready = false; // One-shot: stop after firing. self.ns_accumulator = 0; 1 @@ -232,10 +267,22 @@ impl Default for Pit { } impl Pit { - /// Create a new PIT with all counters in their initial state. + /// Create a new PIT with BIOS-compatible default state. + /// + /// Counter 0 is pre-programmed in Mode 2 (rate generator) with a + /// reload value of 0 (= 65536, giving ~18.2 Hz). This matches real + /// PC behavior where the BIOS initializes the PIT before handing + /// off to the OS. Without this, timer interrupts won't fire until + /// the kernel programs the PIT, but the kernel may depend on timer + /// interrupts *before* it programs the PIT (e.g., jiffies-based + /// timeouts in early hardware probing). pub fn new() -> Self { + let mut counter0 = PitCounter::new(); + counter0.mode = CounterMode::RateGenerator; + counter0.reload = 0; // 0 = 65536 per 8254 spec → ~18.2 Hz + counter0.reload_ready = true; Pit { - counters: [PitCounter::new(), PitCounter::new(), PitCounter::new()], + counters: [counter0, PitCounter::new(), PitCounter::new()], } } @@ -266,12 +313,18 @@ impl Pit { } } - /// Advance counter 0 by `elapsed_ns` nanoseconds. + /// Advance all counters by `elapsed_ns` nanoseconds. /// /// Returns the number of times counter 0 fired (should raise IRQ 0 - /// for each fire). + /// for each fire). Counters 1 and 2 are also ticked so their + /// `ns_accumulator` stays current — required for `current_count()` + /// to return meaningful values when Linux reads these counters + /// (e.g., PIT counter 2 for TSC calibration). pub fn tick(&mut self, elapsed_ns: u64) -> u64 { - self.counters[0].tick(elapsed_ns) + let fires = self.counters[0].tick(elapsed_ns); + self.counters[1].tick(elapsed_ns); + self.counters[2].tick(elapsed_ns); + fires } /// Parse and apply a control word written to port 0x43. @@ -399,13 +452,16 @@ mod tests { } #[test] - fn test_counter_zero_reload_no_fire() { + fn test_counter_zero_reload_means_65536() { let mut counter = PitCounter::new(); counter.mode = CounterMode::RateGenerator; - counter.reload = 0; // Zero reload should not fire. + counter.reload = 0; // 0 = 65536 per 8254 spec. counter.reload_ready = true; - assert_eq!(counter.tick(1_000_000), 0); + // 65536 ticks at 1,193,182 Hz → ~54.9ms period. + // 100ms should produce ~1 fire. + let fires = counter.tick(100_000_000); + assert!(fires >= 1 && fires <= 2, "expected ~1 fire, got {}", fires); } #[test] @@ -579,12 +635,15 @@ mod tests { } #[test] - fn test_pit_no_fire_before_program() { + fn test_pit_fires_with_bios_defaults() { let mut pit = Pit::new(); - assert_eq!( - pit.tick(100_000_000), - 0, - "should not fire when unprogrammed" + // PIT starts pre-programmed at ~18.2 Hz (reload 0 = 65536). + // 100ms should produce ~1-2 fires. + let fires = pit.tick(100_000_000); + assert!( + fires >= 1 && fires <= 2, + "expected ~1-2 fires from BIOS defaults, got {}", + fires ); } @@ -624,6 +683,58 @@ mod tests { ); } + #[test] + fn test_counter_read_decrements_after_tick() { + let mut counter = PitCounter::new(); + counter.mode = CounterMode::RateGenerator; + counter.access = AccessMode::LoThenHi; + counter.reload = 11932; // ~100 Hz + counter.reload_ready = true; + + // Initially, count should equal reload (no time elapsed). + assert_eq!(counter.current_count(), 11932); + + // Tick 5ms — about half a period. Counter should be roughly half. + counter.tick(5_000_000); + let count = counter.current_count(); + assert!( + count < 11932 && count > 0, + "expected count between 0 and 11932, got {}", + count + ); + } + + #[test] + fn test_counter2_counts_down_for_tsc_calibration() { + // Linux's pit_calibrate_tsc() programs counter 2 in Mode 0 + // and reads it in a loop expecting the value to decrease. + let mut pit = Pit::new(); + + // Program counter 2: mode 0 (interrupt on terminal), lo-hi. + // Control word: counter=2 (bits 7-6=10), access=lo-hi (bits 5-4=11), + // mode=0 (bits 3-1=000), BCD=0 (bit 0=0) + // = 0b_10_11_000_0 = 0xB0 + pit.write_port(PIT_COMMAND, 0xB0); + pit.write_port(PIT_COUNTER2, 0xFF); // Low byte. + pit.write_port(PIT_COUNTER2, 0xFF); // High byte → reload = 0xFFFF. + + // Tick the PIT (simulating vCPU loop iterations). + pit.tick(10_000_000); // 10ms + + // Latch counter 2 and read it. + pit.write_port(PIT_COMMAND, 0x80); // Latch counter 2 (counter=2, access=00). + let lo = pit.read_port(PIT_COUNTER2); + let hi = pit.read_port(PIT_COUNTER2); + let count = lo as u16 | ((hi as u16) << 8); + + // Count should be less than the initial reload value. + assert!( + count < 0xFFFF, + "counter 2 should have decremented, got {:#X}", + count + ); + } + #[test] fn test_pit_incremental_ticks() { let mut pit = Pit::new(); diff --git a/src/vmm/src/windows/devices/virtio/p9/mod.rs b/src/vmm/src/windows/devices/virtio/p9/mod.rs index 5402c6704..24d19567e 100644 --- a/src/vmm/src/windows/devices/virtio/p9/mod.rs +++ b/src/vmm/src/windows/devices/virtio/p9/mod.rs @@ -11,8 +11,8 @@ pub mod protocol; use std::path::PathBuf; -use super::super::mmio::VirtioDeviceBackend; -use super::super::queue::{GuestMemoryAccessor, Virtqueue}; +use super::mmio::VirtioDeviceBackend; +use super::queue::{GuestMemoryAccessor, Virtqueue}; use self::filesystem::P9Filesystem; use self::protocol::*; diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index 57ec13dc7..360d4ec84 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -16,10 +16,10 @@ use std::collections::HashMap; use std::io::{self, Read, Write}; use std::net::{TcpListener, TcpStream}; -use super::super::mmio::VirtioDeviceBackend; -use super::super::queue::{GuestMemoryAccessor, Virtqueue}; -use super::connection::{ConnState, VsockConnection}; -use super::packet::{ +use super::mmio::VirtioDeviceBackend; +use super::queue::{GuestMemoryAccessor, Virtqueue}; +use connection::{ConnState, VsockConnection}; +use packet::{ VsockHeader, VSOCK_CID_HOST, VSOCK_HEADER_SIZE, VSOCK_OP_REQUEST, }; diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index e013a2e51..170ae280a 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -12,7 +12,7 @@ mod imp { use std::collections::HashMap; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; - use std::time::Duration; + use std::time::{Duration, Instant}; use super::super::boot::loader::load_kernel_with_initrd; use super::super::devices::virtio::queue::GuestMemoryAccessor; @@ -59,6 +59,49 @@ mod imp { static RUNNING_VMS: std::sync::LazyLock>> = std::sync::LazyLock::new(|| Mutex::new(HashMap::new())); + /// Translate a guest virtual address (GVA) to guest physical address (GPA) + /// by walking the x86_64 4-level page table starting from CR3. + #[allow(dead_code)] + fn translate_gva(guest_mem: &GuestMemory, cr3: u64, gva: u64) -> Option { + let pml4_base = cr3 & !0xFFF; + let pml4_idx = ((gva >> 39) & 0x1FF) as usize; + let pdpt_idx = ((gva >> 30) & 0x1FF) as usize; + let pd_idx = ((gva >> 21) & 0x1FF) as usize; + let pt_idx = ((gva >> 12) & 0x1FF) as usize; + let offset = gva & 0xFFF; + + // PML4 entry + let mut buf = [0u8; 8]; + guest_mem.read_at_addr(pml4_base + (pml4_idx as u64) * 8, &mut buf).ok()?; + let pml4e = u64::from_le_bytes(buf); + if pml4e & 1 == 0 { return None; } // not present + + // PDPT entry + let pdpt_base = pml4e & 0x000F_FFFF_FFFF_F000; + guest_mem.read_at_addr(pdpt_base + (pdpt_idx as u64) * 8, &mut buf).ok()?; + let pdpte = u64::from_le_bytes(buf); + if pdpte & 1 == 0 { return None; } + if pdpte & 0x80 != 0 { // 1GB page + return Some((pdpte & 0x000F_FFFF_C000_0000) | (gva & 0x3FFF_FFFF)); + } + + // PD entry + let pd_base = pdpte & 0x000F_FFFF_FFFF_F000; + guest_mem.read_at_addr(pd_base + (pd_idx as u64) * 8, &mut buf).ok()?; + let pde = u64::from_le_bytes(buf); + if pde & 1 == 0 { return None; } + if pde & 0x80 != 0 { // 2MB page + return Some((pde & 0x000F_FFFF_FFE0_0000) | (gva & 0x1F_FFFF)); + } + + // PT entry + let pt_base = pde & 0x000F_FFFF_FFFF_F000; + guest_mem.read_at_addr(pt_base + (pt_idx as u64) * 8, &mut buf).ok()?; + let pte = u64::from_le_bytes(buf); + if pte & 1 == 0 { return None; } + Some((pte & 0x000F_FFFF_FFFF_F000) | offset) + } + /// Core vCPU loop shared by `run()` and `start()`. /// /// Sets up the WHPX partition, loads the kernel, creates devices and vCPU, @@ -111,8 +154,9 @@ mod imp { guest_mem.map_to_partition(&partition)?; // Create devices from context. + let ctx_id = ctx.id; let setup = DeviceManager::from_context(&ctx)?; - devices::store_console_buffer(ctx.id, setup.console_buffer); + devices::store_console_buffer(ctx_id, setup.console_buffer); let mut devices = setup.devices; // Build kernel command line. @@ -157,6 +201,14 @@ mod imp { let mem_adapter = GuestMemoryAdapter(&guest_mem); let mut exit_count: u64 = 0; let mut halt_count: u64 = 0; + let start_time = Instant::now(); + let mut last_progress = Instant::now(); + // IO/MMIO access counters for debugging boot stalls. + let mut io_read_counts: HashMap = HashMap::new(); + let mut io_write_counts: HashMap = HashMap::new(); + let mut mmio_count: u64 = 0; + let mut msr_count: u64 = 0; + let mut cpuid_count: u64 = 0; let exit_code; loop { @@ -168,6 +220,7 @@ mod imp { match vcpu.interrupts_enabled() { Ok(true) => { if let Some(vector) = devices.pic.acknowledge() { + log::trace!("Injecting interrupt vector {:#X}", vector); vcpu.inject_interrupt(vector)?; devices.set_window_requested(false); } @@ -188,16 +241,19 @@ mod imp { match exit { VcpuExit::IoOut { port, size, data } => { halt_count = 0; + *io_write_counts.entry(port).or_insert(0) += 1; devices.handle_io_out(port, size, data); vcpu.skip_instruction()?; } VcpuExit::IoIn { port, size } => { halt_count = 0; + *io_read_counts.entry(port).or_insert(0) += 1; let data = devices.handle_io_in(port, size); vcpu.complete_io_in(data, size)?; } VcpuExit::MmioRead { address, size } => { halt_count = 0; + mmio_count += 1; let data = devices.handle_mmio_read(address, size); vcpu.complete_mmio_read(data)?; } @@ -207,6 +263,7 @@ mod imp { data, } => { halt_count = 0; + mmio_count += 1; devices.handle_mmio_write(address, size, data, &mem_adapter); vcpu.skip_instruction()?; } @@ -243,9 +300,45 @@ mod imp { exit_code = 0; break; } + // Wall-clock progress report every 5 seconds. + if last_progress.elapsed() >= Duration::from_secs(5) { + last_progress = Instant::now(); + if let Ok(regs) = vcpu.get_registers() { + let console_len = devices::get_console_output(ctx_id) + .map(|b| b.len()) + .unwrap_or(0); + // Sort IO ports by frequency (descending), show all. + let mut reads: Vec<_> = io_read_counts.iter().collect(); + reads.sort_by(|a, b| b.1.cmp(a.1)); + let top_reads: Vec = reads.iter() + .map(|(p, c)| format!("{:#X}:{}", p, c)) + .collect(); + let mut writes: Vec<_> = io_write_counts.iter().collect(); + writes.sort_by(|a, b| b.1.cmp(a.1)); + let top_writes: Vec = writes.iter() + .map(|(p, c)| format!("{:#X}:{}", p, c)) + .collect(); + log::info!( + "Progress @ {:.1}s: exits={} RIP={:#X} RSP={:#X} RFLAGS={:#X} console={}B \ + mmio={} msr={} cpuid={}", + start_time.elapsed().as_secs_f64(), + exit_count, + regs.rip, + regs.rsp, + regs.rflags, + console_len, + mmio_count, + msr_count, + cpuid_count, + ); + log::info!(" IO_reads=[{}]", top_reads.join(", ")); + log::info!(" IO_writes=[{}]", top_writes.join(", ")); + } + } } VcpuExit::MsrAccess { msr_number, is_write, rax, rdx } => { halt_count = 0; + msr_count += 1; if is_write { log::trace!( "MSR write: 0x{:08X} <- 0x{:016X}", @@ -267,8 +360,22 @@ mod imp { default_rdx, } => { halt_count = 0; - log::trace!("CPUID leaf=0x{:X} sub=0x{:X}", rax, rcx); - vcpu.complete_cpuid(default_rax, default_rbx, default_rcx, default_rdx)?; + cpuid_count += 1; + let leaf = rax as u32; + // Mask hypervisor-related CPUID leaves to prevent the Linux + // guest from detecting Hyper-V and trying to use enlightenments + // (synthetic timers, SynIC, TSC page) that our WHPX partition + // doesn't fully support. Without this, the kernel's Hyper-V + // init code stalls on broken clock sources. + let (out_rax, out_rbx, out_rcx, out_rdx) = match leaf { + // Leaf 1: clear "hypervisor present" bit (ECX bit 31). + 1 => (default_rax, default_rbx, default_rcx & !(1u64 << 31), default_rdx), + // Hyper-V CPUID range: return zeros (no hypervisor features). + 0x40000000..=0x400000FF => (0, 0, 0, 0), + _ => (default_rax, default_rbx, default_rcx, default_rdx), + }; + log::trace!("CPUID leaf=0x{:X} sub=0x{:X} -> rax=0x{:X}", rax, rcx, out_rax); + vcpu.complete_cpuid(out_rax, out_rbx, out_rcx, out_rdx)?; } VcpuExit::UnrecoverableException => { let regs = vcpu.get_registers().ok(); From 047861d3eeea7109230f17b082ec2be6ba1d344f Mon Sep 17 00:00:00 2001 From: lile Date: Fri, 17 Apr 2026 16:17:59 +0800 Subject: [PATCH 06/33] feat(vmm): add virtio-blk disk support and WHPX cmdline refinements - boot_kernel.rs: add --disk CLI flag for virtio-blk testing - cmdline.rs: move nohyperv/lpj/nokaslr from user cmdline to BASE_CMDLINE (always needed for WHPX, prevents Hyper-V enlightenment stalls) - runner.rs: minor comment cleanup in CPUID masking Tested on Win10 WHPX: - Kernel boot without disk: OK (shell prompt in ~5s) - Kernel boot with 64MB ext4 disk: virtio_blk detected [vda], EXT4-fs mounted r/w successfully, 168 MMIO exits Co-Authored-By: Claude Opus 4.6 --- src/vmm/examples/boot_kernel.rs | 72 ++++++++++++++++++++++++--------- src/vmm/src/windows/cmdline.rs | 19 ++++++++- src/vmm/src/windows/runner.rs | 3 +- 3 files changed, 72 insertions(+), 22 deletions(-) diff --git a/src/vmm/examples/boot_kernel.rs b/src/vmm/examples/boot_kernel.rs index fb54be877..545487822 100644 --- a/src/vmm/examples/boot_kernel.rs +++ b/src/vmm/examples/boot_kernel.rs @@ -1,21 +1,27 @@ //! Smoke test: boot a Linux kernel inside a WHPX VM using the VMM runner. //! //! Usage: -//! boot_kernel.exe [initrd] [-- extra-cmdline-args...] +//! boot_kernel.exe [initrd] [--disk ] [-- extra-cmdline-args...] //! //! Example: //! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 C:\kernels\initrd.img -//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 -- console=ttyS0 lpj=1000000 +//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 -- lpj=1000000 nokaslr +//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 --disk C:\rootfs.img -- lpj=1000000 use std::path::PathBuf; +use vmm::windows::context::{DiskConfig, DISK_FORMAT_RAW}; + fn main() { // Initialize logging (RUST_LOG controls verbosity). env_logger::init(); let args: Vec = std::env::args().collect(); if args.len() < 2 { - eprintln!("Usage: {} [initrd] [-- extra-cmdline-args...]", args[0]); + eprintln!( + "Usage: {} [initrd] [--disk ] [-- extra-cmdline-args...]", + args[0] + ); std::process::exit(1); } @@ -25,18 +31,37 @@ fn main() { std::process::exit(1); } - // Parse optional initrd and extra cmdline args. + // Parse optional initrd, --disk, and extra cmdline args. let mut initrd_path: Option = None; + let mut disk_path: Option = None; let mut extra_cmdline: Vec<&str> = Vec::new(); let mut past_separator = false; + let mut i = 2; - for arg in &args[2..] { + while i < args.len() { + let arg = &args[i]; if arg == "--" { past_separator = true; + i += 1; continue; } if past_separator { extra_cmdline.push(arg); + i += 1; + continue; + } + if arg == "--disk" { + i += 1; + if i >= args.len() { + eprintln!("--disk requires a path argument"); + std::process::exit(1); + } + let p = PathBuf::from(&args[i]); + if !p.exists() { + eprintln!("Disk image not found: {}", p.display()); + std::process::exit(1); + } + disk_path = Some(p); } else if initrd_path.is_none() { let p = PathBuf::from(arg); if p.exists() { @@ -48,6 +73,7 @@ fn main() { } else { extra_cmdline.push(arg); } + i += 1; } // Build the VmContext via the C-API-style context functions. @@ -59,20 +85,21 @@ fn main() { ctx.kernel_path = Some(kernel_path.clone()); ctx.initramfs_path = initrd_path.clone(); - // Build kernel command line. - let mut cmdline_parts = vec![ - "console=ttyS0", - "earlyprintk=serial", - "noapic", - "nolapic", - "noacpi", - "nosmp", - "lpj=1000000", - "nokaslr", - "panic=-1", - ]; - cmdline_parts.extend(extra_cmdline.iter()); - ctx.kernel_cmdline = Some(cmdline_parts.join(" ")); + // Attach disk if provided. + if let Some(ref dp) = disk_path { + ctx.disks.push(DiskConfig { + block_id: "root".to_string(), + path: dp.clone(), + format: DISK_FORMAT_RAW, + read_only: false, + }); + } + + // Extra cmdline args are appended after the base cmdline and MMIO + // device lines that build_kernel_cmdline() generates automatically. + if !extra_cmdline.is_empty() { + ctx.kernel_cmdline = Some(extra_cmdline.join(" ")); + } Ok(()) }) @@ -87,6 +114,13 @@ fn main() { .map(|p| p.display().to_string()) .unwrap_or_else(|| "(none)".to_string()) ); + println!( + "Disk: {}", + disk_path + .as_ref() + .map(|p| p.display().to_string()) + .unwrap_or_else(|| "(none)".to_string()) + ); // Take the context out of the global map and run synchronously. let ctx = vmm::windows::context::take_ctx(ctx_id).expect("take_ctx failed"); diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index 5e8989742..13f58daff 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -9,8 +9,16 @@ pub const MMIO_SLOT_SIZE: u64 = 0x200; pub const FIRST_MMIO_IRQ: u8 = 5; /// Base kernel command line parameters. +/// +/// - `nohyperv`: Disable Hyper-V guest enlightenments. WHPX exposes Hyper-V +/// CPUID leaves but doesn't fully support synthetic timers/SynIC, causing +/// clock stalls if the kernel tries to use them. +/// - `lpj=1000000`: Preset loops_per_jiffy to skip delay calibration, which +/// depends on a reliable timer source. +/// - `nokaslr`: Disable kernel address space randomization for deterministic +/// boot in our controlled single-vCPU environment. const BASE_CMDLINE: &str = - "console=ttyS0 earlyprintk=serial,ttyS0,115200 noapic nolapic noacpi nosmp"; + "console=ttyS0 earlyprintk=serial,ttyS0,115200 noapic nolapic noacpi nosmp nohyperv lpj=1000000 nokaslr"; /// Description of a virtio-MMIO device slot for command line generation. #[derive(Debug, Clone)] @@ -130,9 +138,18 @@ mod tests { #[test] fn test_empty_user_cmdline_no_trailing_space() { let cmdline = build_kernel_cmdline(Some(""), false, &[]); + assert!(!cmdline.ends_with(' ')); assert_eq!(cmdline, BASE_CMDLINE); } + #[test] + fn test_base_cmdline_has_nohyperv() { + let cmdline = build_kernel_cmdline(None, false, &[]); + assert!(cmdline.contains("nohyperv")); + assert!(cmdline.contains("lpj=1000000")); + assert!(cmdline.contains("nokaslr")); + } + #[test] fn test_mmio_base_for_slot() { assert_eq!(mmio_base_for_slot(0), 0xD000_0000); diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 170ae280a..f13538f35 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -365,8 +365,7 @@ mod imp { // Mask hypervisor-related CPUID leaves to prevent the Linux // guest from detecting Hyper-V and trying to use enlightenments // (synthetic timers, SynIC, TSC page) that our WHPX partition - // doesn't fully support. Without this, the kernel's Hyper-V - // init code stalls on broken clock sources. + // doesn't fully support. let (out_rax, out_rbx, out_rcx, out_rdx) = match leaf { // Leaf 1: clear "hypervisor present" bit (ECX bit 31). 1 => (default_rax, default_rbx, default_rcx & !(1u64 << 31), default_rdx), From 9569d73c98268eb136ca8d0912b7524834751e9a Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 18 Apr 2026 14:36:32 +0800 Subject: [PATCH 07/33] feat(vmm): wire root disk remount and init path into WHPX kernel cmdline Implement krun_set_root_disk_remount on Windows (was -ENOSYS stub) to store root_disk_device and root_disk_fstype in VmContext. Extend build_kernel_cmdline() to accept root device override, init path, and init argv -- producing cmdlines like: root=/dev/vdb rootfstype=ext4 rw init=/boxlite/bin/boxlite-guest ... -- --listen vsock://2695 --notify vsock://2696 This enables the full box lifecycle on WHPX where there is no firmware to mount rootfs and exec the guest agent automatically. Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 + src/libkrun/src/windows_api.rs | 12 +- src/vmm/src/windows/cmdline.rs | 193 +++++++++++++++++++++++++++++++-- src/vmm/src/windows/context.rs | 7 ++ src/vmm/src/windows/runner.rs | 4 + 5 files changed, 203 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 250e46f6a..8b2b9a039 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1788,6 +1788,7 @@ dependencies = [ "cpuid", "crossbeam-channel", "devices", + "env_logger", "flate2", "hvf", "iocuddle", diff --git a/src/libkrun/src/windows_api.rs b/src/libkrun/src/windows_api.rs index 5828fb295..12883d5f5 100644 --- a/src/libkrun/src/windows_api.rs +++ b/src/libkrun/src/windows_api.rs @@ -486,12 +486,16 @@ pub unsafe extern "C" fn krun_set_data_disk(_ctx_id: u32, _c_disk_path: *const c #[no_mangle] pub unsafe extern "C" fn krun_set_root_disk_remount( - _ctx_id: u32, - _device: *const c_char, - _fstype: *const c_char, + ctx_id: u32, + device: *const c_char, + fstype: *const c_char, _options: *const c_char, ) -> i32 { - -libc::ENOSYS + to_c_result(context::with_ctx_mut(ctx_id, |ctx| { + ctx.root_disk_device = c_str_to_string(device); + ctx.root_disk_fstype = c_str_to_string(fstype); + Ok(()) + })) } #[no_mangle] diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index 13f58daff..717d8ae77 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -30,17 +30,42 @@ pub struct MmioSlot { } /// Build the full kernel command line. +/// +/// Parameters: +/// - `user_cmdline`: Extra kernel parameters appended after device config. +/// - `has_root_disk`: Whether a root disk is attached (default `/dev/vda`). +/// - `mmio_slots`: Virtio-MMIO device slots to register. +/// - `root_disk_device`: Override root device (e.g., "/dev/vdb"). Takes priority over `has_root_disk`. +/// - `root_disk_fstype`: Filesystem type for root device (e.g., "ext4"). +/// - `exec_path`: Path to init binary (added as `init=`). +/// - `exec_argv`: Arguments passed after `--` separator for the init process. pub fn build_kernel_cmdline( user_cmdline: Option<&str>, has_root_disk: bool, mmio_slots: &[MmioSlot], + root_disk_device: Option<&str>, + root_disk_fstype: Option<&str>, + exec_path: Option<&str>, + exec_argv: &[String], ) -> String { let mut cmdline = BASE_CMDLINE.to_string(); - if has_root_disk { + // Root device: explicit override takes priority over default. + if let Some(device) = root_disk_device { + cmdline.push_str(&format!(" root={}", device)); + if let Some(fstype) = root_disk_fstype { + cmdline.push_str(&format!(" rootfstype={}", fstype)); + } + cmdline.push_str(" rw"); + } else if has_root_disk { cmdline.push_str(" root=/dev/vda rw"); } + // Init binary path. + if let Some(path) = exec_path { + cmdline.push_str(&format!(" init={}", path)); + } + for slot in mmio_slots { if !slot.active { continue; @@ -60,6 +85,12 @@ pub fn build_kernel_cmdline( } } + // Init arguments after separator. + if !exec_argv.is_empty() { + cmdline.push_str(" -- "); + cmdline.push_str(&exec_argv.join(" ")); + } + cmdline } @@ -77,15 +108,20 @@ pub fn irq_for_slot(index: u8) -> u8 { mod tests { use super::*; + /// Helper: build cmdline with only the legacy params (no root override, no init). + fn build_simple(user: Option<&str>, has_root: bool, slots: &[MmioSlot]) -> String { + build_kernel_cmdline(user, has_root, slots, None, None, None, &[]) + } + #[test] fn test_base_cmdline_only() { - let cmdline = build_kernel_cmdline(None, false, &[]); + let cmdline = build_simple(None, false, &[]); assert_eq!(cmdline, BASE_CMDLINE); } #[test] fn test_with_root_disk() { - let cmdline = build_kernel_cmdline(None, true, &[]); + let cmdline = build_simple(None, true, &[]); assert!(cmdline.contains("root=/dev/vda rw")); assert!(cmdline.starts_with(BASE_CMDLINE)); } @@ -102,7 +138,7 @@ mod tests { active: true, }, ]; - let cmdline = build_kernel_cmdline(None, true, &slots); + let cmdline = build_simple(None, true, &slots); assert!(cmdline.contains("virtio_mmio.device=512@0xd0000000:5")); assert!(cmdline.contains("virtio_mmio.device=512@0xd0000200:6")); } @@ -123,7 +159,7 @@ mod tests { active: true, }, ]; - let cmdline = build_kernel_cmdline(None, false, &slots); + let cmdline = build_simple(None, false, &slots); assert!(cmdline.contains("virtio_mmio.device=512@0xd0000000:5")); assert!(!cmdline.contains("0xd0000200")); assert!(cmdline.contains("virtio_mmio.device=512@0xd0000400:7")); @@ -131,20 +167,20 @@ mod tests { #[test] fn test_user_cmdline_appended() { - let cmdline = build_kernel_cmdline(Some("init=/bin/sh"), false, &[]); - assert!(cmdline.ends_with("init=/bin/sh")); + let cmdline = build_simple(Some("custom_param=1"), false, &[]); + assert!(cmdline.ends_with("custom_param=1")); } #[test] fn test_empty_user_cmdline_no_trailing_space() { - let cmdline = build_kernel_cmdline(Some(""), false, &[]); + let cmdline = build_simple(Some(""), false, &[]); assert!(!cmdline.ends_with(' ')); assert_eq!(cmdline, BASE_CMDLINE); } #[test] fn test_base_cmdline_has_nohyperv() { - let cmdline = build_kernel_cmdline(None, false, &[]); + let cmdline = build_simple(None, false, &[]); assert!(cmdline.contains("nohyperv")); assert!(cmdline.contains("lpj=1000000")); assert!(cmdline.contains("nokaslr")); @@ -180,7 +216,7 @@ mod tests { active: true, }, ]; - let cmdline = build_kernel_cmdline(Some("quiet"), true, &slots); + let cmdline = build_simple(Some("quiet"), true, &slots); let base_pos = cmdline.find(BASE_CMDLINE).unwrap(); let root_pos = cmdline.find("root=/dev/vda").unwrap(); @@ -195,4 +231,141 @@ mod tests { assert!(mmio1_pos < mmio2_pos); assert!(mmio2_pos < user_pos); } + + // ---- New tests for root_disk_device, exec_path, exec_argv ---- + + #[test] + fn test_root_disk_device_override() { + let cmdline = build_kernel_cmdline( + None, + false, + &[], + Some("/dev/vdb"), + Some("ext4"), + None, + &[], + ); + assert!(cmdline.contains("root=/dev/vdb")); + assert!(cmdline.contains("rootfstype=ext4")); + assert!(cmdline.contains("rw")); + assert!(!cmdline.contains("/dev/vda")); + } + + #[test] + fn test_root_disk_overrides_default() { + // When both has_root_disk=true and root_disk_device is set, + // the explicit device takes priority. + let cmdline = build_kernel_cmdline( + None, + true, + &[], + Some("/dev/vdb"), + Some("ext4"), + None, + &[], + ); + assert!(cmdline.contains("root=/dev/vdb")); + assert!(!cmdline.contains("root=/dev/vda")); + } + + #[test] + fn test_root_disk_device_without_fstype() { + let cmdline = build_kernel_cmdline( + None, + false, + &[], + Some("/dev/vdb"), + None, + None, + &[], + ); + assert!(cmdline.contains("root=/dev/vdb")); + assert!(!cmdline.contains("rootfstype=")); + assert!(cmdline.contains("rw")); + } + + #[test] + fn test_init_path() { + let cmdline = build_kernel_cmdline( + None, + false, + &[], + None, + None, + Some("/boxlite/bin/boxlite-guest"), + &[], + ); + assert!(cmdline.contains("init=/boxlite/bin/boxlite-guest")); + } + + #[test] + fn test_init_args_after_separator() { + let argv = vec![ + "--listen".to_string(), + "vsock://2695".to_string(), + "--notify".to_string(), + "vsock://2696".to_string(), + ]; + let cmdline = build_kernel_cmdline( + None, + false, + &[], + None, + None, + Some("/boxlite/bin/boxlite-guest"), + &argv, + ); + assert!(cmdline.contains("init=/boxlite/bin/boxlite-guest")); + assert!(cmdline.ends_with("-- --listen vsock://2695 --notify vsock://2696")); + } + + #[test] + fn test_no_separator_when_argv_empty() { + let cmdline = build_kernel_cmdline( + None, + false, + &[], + None, + None, + Some("/bin/init"), + &[], + ); + assert!(cmdline.contains("init=/bin/init")); + assert!(!cmdline.contains("--")); + } + + #[test] + fn test_full_lifecycle_cmdline() { + // Simulates the full box lifecycle cmdline: + // root=/dev/vdb rootfstype=ext4 rw init=/boxlite/bin/boxlite-guest + // virtio_mmio devices, then -- + let slots = vec![ + MmioSlot { index: 0, active: true }, + MmioSlot { index: 1, active: true }, + ]; + let argv = vec![ + "--listen".to_string(), + "vsock://2695".to_string(), + ]; + let cmdline = build_kernel_cmdline( + None, + true, + &slots, + Some("/dev/vdb"), + Some("ext4"), + Some("/boxlite/bin/boxlite-guest"), + &argv, + ); + + // Verify ordering: base < root < init < mmio < argv + let root_pos = cmdline.find("root=/dev/vdb").unwrap(); + let init_pos = cmdline.find("init=/boxlite/bin/boxlite-guest").unwrap(); + let mmio_pos = cmdline.find("virtio_mmio").unwrap(); + let sep_pos = cmdline.find("-- --listen").unwrap(); + + assert!(root_pos < init_pos); + assert!(init_pos < mmio_pos); + assert!(mmio_pos < sep_pos); + assert!(!cmdline.contains("root=/dev/vda")); + } } diff --git a/src/vmm/src/windows/context.rs b/src/vmm/src/windows/context.rs index 1a46811ac..f33a92481 100644 --- a/src/vmm/src/windows/context.rs +++ b/src/vmm/src/windows/context.rs @@ -62,6 +62,11 @@ pub struct VmContext { pub apic_emulation: bool, /// Network device configuration. pub net_config: Option, + /// Root disk device path override (e.g., "/dev/vdb"). + /// When set, the kernel cmdline uses this instead of the default "/dev/vda". + pub root_disk_device: Option, + /// Root disk filesystem type (e.g., "ext4"). + pub root_disk_fstype: Option, } /// Network device configuration. @@ -119,6 +124,8 @@ impl VmContext { rlimits: Vec::new(), apic_emulation: true, net_config: None, + root_disk_device: None, + root_disk_fstype: None, } } diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index f13538f35..1e2812e8e 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -164,6 +164,10 @@ mod imp { ctx.kernel_cmdline.as_deref(), setup.has_root_disk, &setup.mmio_slots, + ctx.root_disk_device.as_deref(), + ctx.root_disk_fstype.as_deref(), + ctx.exec_path.as_deref(), + &ctx.argv, ); // Load kernel. From 94f66d3842db52b3b50ec69b109cbd0a77fbe6d4 Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 18 Apr 2026 14:43:37 +0800 Subject: [PATCH 08/33] feat(vmm): add --init, --root, --fstype, --argv options to boot_kernel example Enable Level 2 smoke testing of the new cmdline wiring on WHPX: boot_kernel.exe vmlinuz --disk rootfs.img --init /bin/sh boot_kernel.exe vmlinuz --disk rootfs.img --root /dev/vda --fstype ext4 --init /init --argv --flag Co-Authored-By: Claude Opus 4.6 --- src/vmm/examples/boot_kernel.rs | 126 +++++++++++++++++++++++++------- 1 file changed, 100 insertions(+), 26 deletions(-) diff --git a/src/vmm/examples/boot_kernel.rs b/src/vmm/examples/boot_kernel.rs index 545487822..c1c650262 100644 --- a/src/vmm/examples/boot_kernel.rs +++ b/src/vmm/examples/boot_kernel.rs @@ -1,12 +1,27 @@ //! Smoke test: boot a Linux kernel inside a WHPX VM using the VMM runner. //! //! Usage: -//! boot_kernel.exe [initrd] [--disk ] [-- extra-cmdline-args...] +//! boot_kernel.exe [initrd] [options] [-- extra-cmdline-args...] //! -//! Example: -//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 C:\kernels\initrd.img -//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 -- lpj=1000000 nokaslr -//! boot_kernel.exe C:\kernels\vmlinuz-6.6.75 --disk C:\rootfs.img -- lpj=1000000 +//! Options: +//! --disk Attach a raw disk image as virtio-blk device +//! --init Set init binary path (kernel `init=` parameter) +//! --root Override root device (e.g., /dev/vda). Default: auto from --disk +//! --fstype Root filesystem type (e.g., ext4). Used with --root +//! --argv Arguments passed to init after `--` separator (repeat for each arg) +//! +//! Examples: +//! # Boot with initramfs only (existing behavior) +//! boot_kernel.exe vmlinuz initrd.img +//! +//! # Boot with disk as root, kernel mounts /dev/vda automatically +//! boot_kernel.exe vmlinuz --disk rootfs.img +//! +//! # Boot with disk + explicit init binary +//! boot_kernel.exe vmlinuz --disk rootfs.img --init /bin/sh +//! +//! # Full lifecycle test: disk + init + argv +//! boot_kernel.exe vmlinuz --disk rootfs.img --init /init --argv --listen --argv vsock://2695 use std::path::PathBuf; @@ -19,7 +34,8 @@ fn main() { let args: Vec = std::env::args().collect(); if args.len() < 2 { eprintln!( - "Usage: {} [initrd] [--disk ] [-- extra-cmdline-args...]", + "Usage: {} [initrd] [--disk ] [--init ] \ + [--root ] [--fstype ] [--argv ]... [-- extra-cmdline-args...]", args[0] ); std::process::exit(1); @@ -31,9 +47,13 @@ fn main() { std::process::exit(1); } - // Parse optional initrd, --disk, and extra cmdline args. + // Parse optional arguments. let mut initrd_path: Option = None; let mut disk_path: Option = None; + let mut init_path: Option = None; + let mut root_device: Option = None; + let mut root_fstype: Option = None; + let mut init_argv: Vec = Vec::new(); let mut extra_cmdline: Vec<&str> = Vec::new(); let mut past_separator = false; let mut i = 2; @@ -50,28 +70,65 @@ fn main() { i += 1; continue; } - if arg == "--disk" { - i += 1; - if i >= args.len() { - eprintln!("--disk requires a path argument"); - std::process::exit(1); + match arg.as_str() { + "--disk" => { + i += 1; + if i >= args.len() { + eprintln!("--disk requires a path argument"); + std::process::exit(1); + } + let p = PathBuf::from(&args[i]); + if !p.exists() { + eprintln!("Disk image not found: {}", p.display()); + std::process::exit(1); + } + disk_path = Some(p); } - let p = PathBuf::from(&args[i]); - if !p.exists() { - eprintln!("Disk image not found: {}", p.display()); - std::process::exit(1); + "--init" => { + i += 1; + if i >= args.len() { + eprintln!("--init requires a path argument"); + std::process::exit(1); + } + init_path = Some(args[i].clone()); } - disk_path = Some(p); - } else if initrd_path.is_none() { - let p = PathBuf::from(arg); - if p.exists() { - initrd_path = Some(p); - } else { - eprintln!("Warning: initrd not found: {}, treating as cmdline arg", arg); - extra_cmdline.push(arg); + "--root" => { + i += 1; + if i >= args.len() { + eprintln!("--root requires a device argument"); + std::process::exit(1); + } + root_device = Some(args[i].clone()); + } + "--fstype" => { + i += 1; + if i >= args.len() { + eprintln!("--fstype requires a type argument"); + std::process::exit(1); + } + root_fstype = Some(args[i].clone()); + } + "--argv" => { + i += 1; + if i >= args.len() { + eprintln!("--argv requires an argument"); + std::process::exit(1); + } + init_argv.push(args[i].clone()); + } + _ => { + if initrd_path.is_none() { + let p = PathBuf::from(arg); + if p.exists() { + initrd_path = Some(p); + } else { + eprintln!("Warning: initrd not found: {}, treating as cmdline arg", arg); + extra_cmdline.push(arg); + } + } else { + extra_cmdline.push(arg); + } } - } else { - extra_cmdline.push(arg); } i += 1; } @@ -95,6 +152,14 @@ fn main() { }); } + // Root disk device override. + ctx.root_disk_device = root_device.clone(); + ctx.root_disk_fstype = root_fstype.clone(); + + // Init binary path and arguments. + ctx.exec_path = init_path.clone(); + ctx.argv = init_argv.clone(); + // Extra cmdline args are appended after the base cmdline and MMIO // device lines that build_kernel_cmdline() generates automatically. if !extra_cmdline.is_empty() { @@ -121,6 +186,15 @@ fn main() { .map(|p| p.display().to_string()) .unwrap_or_else(|| "(none)".to_string()) ); + if let Some(ref root) = root_device { + println!("Root: {} (fstype: {})", root, root_fstype.as_deref().unwrap_or("auto")); + } + if let Some(ref init) = init_path { + println!("Init: {}", init); + } + if !init_argv.is_empty() { + println!("Argv: {:?}", init_argv); + } // Take the context out of the global map and run synchronously. let ctx = vmm::windows::context::take_ctx(ctx_id).expect("take_ctx failed"); From b17e4871777ba7981684810a34ceab1c0b2364bc Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 18 Apr 2026 16:55:56 +0800 Subject: [PATCH 09/33] fix(vmm): set valid virtio vendor ID and add block error logging The Linux kernel's virtio-mmio driver rejects devices with vendor_id == 0, causing virtio-blk device enumeration to fail silently. Set vendor ID to the standard QEMU value (0x554D4551). Also add debug logging to virtio-blk error paths for easier diagnosis of I/O failures. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/virtio/block.rs | 23 +++++++++++++++++---- src/vmm/src/windows/devices/virtio/mmio.rs | 5 +++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/vmm/src/windows/devices/virtio/block.rs b/src/vmm/src/windows/devices/virtio/block.rs index a544b3b8a..2ecb66ebb 100644 --- a/src/vmm/src/windows/devices/virtio/block.rs +++ b/src/vmm/src/windows/devices/virtio/block.rs @@ -64,17 +64,20 @@ impl VirtioBlock { fn process_request(&mut self, chain: &[Descriptor], mem: &dyn GuestMemoryAccessor) -> u8 { // Minimum: header + status (flush has no data descriptor). if chain.len() < 2 { + log::debug!("BLK: short chain len={}", chain.len()); return VIRTIO_BLK_S_IOERR; } // First descriptor: request header (device-readable). let header_desc = &chain[0]; if header_desc.len < 16 { + log::debug!("BLK: short header len={}", header_desc.len); return VIRTIO_BLK_S_IOERR; } let mut header_buf = [0u8; 16]; if mem.read_at(header_desc.addr, &mut header_buf).is_err() { + log::debug!("BLK: header read failed addr=0x{:X}", header_desc.addr); return VIRTIO_BLK_S_IOERR; } @@ -121,15 +124,27 @@ impl VirtioBlock { ) -> u8 { let mut offset = sector * SECTOR_SIZE; - for desc in data_descs { + for (i, desc) in data_descs.iter().enumerate() { if !desc.is_write() { - return VIRTIO_BLK_S_IOERR; // Data buffer must be device-writable for reads. + log::debug!( + "BLK READ: desc[{}] not writable, flags=0x{:X}", + i, desc.flags + ); + return VIRTIO_BLK_S_IOERR; } let mut buf = vec![0u8; desc.len as usize]; - if self.disk.read_at(offset, &mut buf).is_err() { + if let Err(e) = self.disk.read_at(offset, &mut buf) { + log::debug!( + "BLK READ: disk.read_at(0x{:X}, {}) failed: {}", + offset, desc.len, e + ); return VIRTIO_BLK_S_IOERR; } - if mem.write_at(desc.addr, &buf).is_err() { + if let Err(e) = mem.write_at(desc.addr, &buf) { + log::debug!( + "BLK READ: mem.write_at(0x{:X}, {}) failed: {}", + desc.addr, buf.len(), e + ); return VIRTIO_BLK_S_IOERR; } offset += desc.len as u64; diff --git a/src/vmm/src/windows/devices/virtio/mmio.rs b/src/vmm/src/windows/devices/virtio/mmio.rs index 45f302889..fd7646c0f 100644 --- a/src/vmm/src/windows/devices/virtio/mmio.rs +++ b/src/vmm/src/windows/devices/virtio/mmio.rs @@ -54,8 +54,9 @@ const VIRTIO_MMIO_MAGIC: u32 = 0x7472_6976; /// Virtio-MMIO version (2 = virtio 1.0+). const VIRTIO_MMIO_VERSION: u32 = 2; -/// Vendor ID (0 = no vendor). -const VIRTIO_VENDOR_ID: u32 = 0; +/// Vendor ID — "QEMU" in little-endian (standard for virtio devices). +/// The Linux kernel's virtio-mmio driver rejects devices with vendor_id == 0. +const VIRTIO_VENDOR_ID: u32 = 0x554D_4551; // Interrupt status bits. const INTERRUPT_USED_RING: u32 = 1; From 7d5485c820c7108e5f645a3ab0d3cd92e48e527e Mon Sep 17 00:00:00 2001 From: lile Date: Sun, 19 Apr 2026 22:15:42 +0800 Subject: [PATCH 10/33] feat(vmm/windows): vsock outbound TCP, second disk, E820 MMIO hole MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Vsock: add connect_targets HashMap + connect_to() for guest-initiated outbound TCP connections (guest vsock → host TCP). Previously only host→guest (listen_on) was supported. 4 new tests. - Device manager: dispatch vsock ports by VsockPort.listen flag (true=listen_on, false=connect_to). Add second virtio-blk device (slot 4) for guest rootfs disk. - Boot loader: split E820 memory map around MMIO region when RAM > 3.25GB to prevent kernel using device addresses as RAM. 2 new tests. - Memory: add MMIO_REGION_SIZE constant, split GuestMemory into two regions when RAM overlaps VIRTIO_MMIO_BASE. - Runner: add eprintln progress reporting for shim capture (log::* may be silently dropped without log→tracing bridge). - boot_kernel: add --vsock-listen and --vsock-connect CLI flags. Co-Authored-By: Claude Opus 4.6 --- src/vmm/examples/boot_kernel.rs | 49 ++++- src/vmm/src/windows/boot/loader.rs | 87 +++++++- src/vmm/src/windows/devices/manager.rs | 44 ++++- .../src/windows/devices/virtio/vsock/mod.rs | 187 +++++++++++++++++- src/vmm/src/windows/memory.rs | 37 +++- src/vmm/src/windows/runner.rs | 23 +++ 6 files changed, 405 insertions(+), 22 deletions(-) diff --git a/src/vmm/examples/boot_kernel.rs b/src/vmm/examples/boot_kernel.rs index c1c650262..88594f288 100644 --- a/src/vmm/examples/boot_kernel.rs +++ b/src/vmm/examples/boot_kernel.rs @@ -9,6 +9,8 @@ //! --root Override root device (e.g., /dev/vda). Default: auto from --disk //! --fstype Root filesystem type (e.g., ext4). Used with --root //! --argv Arguments passed to init after `--` separator (repeat for each arg) +//! --vsock-listen : VMM listens on TCP, bridges to guest vsock +//! --vsock-connect : VMM connects to TCP when guest connects to vsock //! //! Examples: //! # Boot with initramfs only (existing behavior) @@ -25,7 +27,7 @@ use std::path::PathBuf; -use vmm::windows::context::{DiskConfig, DISK_FORMAT_RAW}; +use vmm::windows::context::{DiskConfig, VsockPort, DISK_FORMAT_RAW}; fn main() { // Initialize logging (RUST_LOG controls verbosity). @@ -54,6 +56,7 @@ fn main() { let mut root_device: Option = None; let mut root_fstype: Option = None; let mut init_argv: Vec = Vec::new(); + let mut vsock_ports: Vec = Vec::new(); let mut extra_cmdline: Vec<&str> = Vec::new(); let mut past_separator = false; let mut i = 2; @@ -116,6 +119,33 @@ fn main() { } init_argv.push(args[i].clone()); } + "--vsock-listen" | "--vsock-connect" => { + let is_listen = arg == "--vsock-listen"; + i += 1; + if i >= args.len() { + eprintln!("{} requires :", arg); + std::process::exit(1); + } + let parts: Vec<&str> = args[i].split(':').collect(); + if parts.len() != 2 { + eprintln!("Expected :, got: {}", args[i]); + std::process::exit(1); + } + let guest_port: u32 = parts[0].parse().unwrap_or_else(|_| { + eprintln!("Invalid guest port: {}", parts[0]); + std::process::exit(1); + }); + let host_port: u16 = parts[1].parse().unwrap_or_else(|_| { + eprintln!("Invalid host port: {}", parts[1]); + std::process::exit(1); + }); + vsock_ports.push(VsockPort { + port: guest_port, + host_path: PathBuf::new(), + listen: is_listen, + host_tcp_port: Some(host_port), + }); + } _ => { if initrd_path.is_none() { let p = PathBuf::from(arg); @@ -195,6 +225,23 @@ fn main() { if !init_argv.is_empty() { println!("Argv: {:?}", init_argv); } + for vp in &vsock_ports { + let host_port = vp.host_tcp_port.unwrap_or(vp.port as u16); + if vp.listen { + println!("Vsock: guest:{} <- TCP listen:{} (host→guest)", vp.port, host_port); + } else { + println!("Vsock: guest:{} -> TCP connect:127.0.0.1:{} (guest→host)", vp.port, host_port); + } + } + + // Move vsock ports into context (after printing, since VsockPort doesn't impl Clone). + if !vsock_ports.is_empty() { + vmm::windows::context::with_ctx_mut(ctx_id, |ctx| { + ctx.vsock_ports = vsock_ports; + Ok(()) + }) + .expect("set vsock_ports failed"); + } // Take the context out of the global map and run synchronously. let ctx = vmm::windows::context::take_ctx(ctx_id).expect("take_ctx failed"); diff --git a/src/vmm/src/windows/boot/loader.rs b/src/vmm/src/windows/boot/loader.rs index 8181fbf8a..454d3fe9a 100644 --- a/src/vmm/src/windows/boot/loader.rs +++ b/src/vmm/src/windows/boot/loader.rs @@ -8,6 +8,8 @@ use super::super::error::{Result, WkrunError}; #[cfg(any(target_os = "windows", test))] use super::params::{E820Entry, E820_RAM, E820_RESERVED}; +#[cfg(any(target_os = "windows", test))] +use super::super::memory::{MMIO_REGION_SIZE, VIRTIO_MMIO_BASE}; // These imports are only used by the Windows-only load_kernel() function. #[cfg(target_os = "windows")] @@ -148,13 +150,45 @@ fn build_e820_map(ram_mib: u32) -> Vec { }); // High memory: 1MB to end of RAM. + // When RAM extends past the MMIO region, split around the reserved hole + // so the kernel doesn't try to use MMIO addresses as regular RAM. if ram_bytes > 0x100000 { - entries.push(E820Entry { - addr: 0x100000, - size: ram_bytes - 0x100000, - entry_type: E820_RAM, - _pad: 0, - }); + if ram_bytes > VIRTIO_MMIO_BASE { + let mmio_end = VIRTIO_MMIO_BASE + MMIO_REGION_SIZE; + + // High memory below MMIO. + entries.push(E820Entry { + addr: 0x100000, + size: VIRTIO_MMIO_BASE - 0x100000, + entry_type: E820_RAM, + _pad: 0, + }); + + // MMIO region (reserved). + entries.push(E820Entry { + addr: VIRTIO_MMIO_BASE, + size: MMIO_REGION_SIZE, + entry_type: E820_RESERVED, + _pad: 0, + }); + + // High memory above MMIO. + if ram_bytes > mmio_end { + entries.push(E820Entry { + addr: mmio_end, + size: ram_bytes - mmio_end, + entry_type: E820_RAM, + _pad: 0, + }); + } + } else { + entries.push(E820Entry { + addr: 0x100000, + size: ram_bytes - 0x100000, + entry_type: E820_RAM, + _pad: 0, + }); + } } entries @@ -448,4 +482,45 @@ mod tests { let map = build_e820_map(1); assert_eq!(map.len(), 2, "1MB RAM should only have low + reserved"); } + + #[test] + fn test_build_e820_map_4096mb_has_mmio_hole() { + let map = build_e820_map(4096); + // Low + BIOS reserved + high1 + MMIO reserved + high2 = 5 entries. + assert_eq!(map.len(), 5, "4GB RAM should have MMIO hole: {:?}", map); + + // Low memory. + assert_eq!(map[0].addr, 0); + assert_eq!(map[0].entry_type, E820_RAM); + + // BIOS reserved. + assert_eq!(map[1].addr, 0x9FC00); + assert_eq!(map[1].entry_type, E820_RESERVED); + + // High memory below MMIO. + assert_eq!(map[2].addr, 0x100000); + assert_eq!(map[2].size, VIRTIO_MMIO_BASE - 0x100000); + assert_eq!(map[2].entry_type, E820_RAM); + + // MMIO reserved region. + assert_eq!(map[3].addr, VIRTIO_MMIO_BASE); + assert_eq!(map[3].size, MMIO_REGION_SIZE); + assert_eq!(map[3].entry_type, E820_RESERVED); + + // High memory above MMIO. + let mmio_end = VIRTIO_MMIO_BASE + MMIO_REGION_SIZE; + assert_eq!(map[4].addr, mmio_end); + assert_eq!(map[4].size, 4096 * 1024 * 1024 - mmio_end); + assert_eq!(map[4].entry_type, E820_RAM); + } + + #[test] + fn test_build_e820_map_no_hole_below_mmio() { + // 3072 MB = 3GB < VIRTIO_MMIO_BASE (3.25GB) — no hole needed. + let map = build_e820_map(3072); + assert_eq!(map.len(), 3, "3GB RAM should not have MMIO hole"); + assert_eq!(map[2].addr, 0x100000); + assert_eq!(map[2].size, 3072 * 1024 * 1024 - 0x100000); + assert_eq!(map[2].entry_type, E820_RAM); + } } diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 104472352..05eab580f 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -130,6 +130,8 @@ pub struct DeviceManager { virtio_9p: Option>, /// Virtio-net device (slot 3) — optional. virtio_net: Option>, + /// Second virtio-blk device (slot 4) — optional, for guest rootfs. + virtio_blk2: Option>, /// Track whether we've requested an interrupt window. window_requested: bool, @@ -170,7 +172,7 @@ impl DeviceManager { Serial::new(COM1_BASE, Box::new(tee)) }; - // Virtio-blk (slot 0). + // Virtio-blk (slot 0) — first disk (container rootfs). let has_root_disk = !ctx.disks.is_empty(); let virtio_blk = if let Some(disk) = ctx.disks.first() { let backend = open_disk_backend(&disk.path, disk.format, disk.read_only)?; @@ -180,9 +182,19 @@ impl DeviceManager { None }; + // Virtio-blk2 (slot 4) — second disk (guest rootfs), if present. + let virtio_blk2 = if let Some(disk) = ctx.disks.get(1) { + let backend = open_disk_backend(&disk.path, disk.format, disk.read_only)?; + let blk = VirtioBlock::new(backend, disk.read_only); + Some(VirtioMmioDevice::new(blk)) + } else { + None + }; + // Virtio-vsock (slot 1) — always present. let mut vsock_backend = VirtioVsock::new(GUEST_CID); - // Listen on configured ports, or defaults. + // Configure ports: listen=true creates TCP listener (host→guest), + // listen=false registers outbound target (guest→host). if ctx.vsock_ports.is_empty() { for &port in DEFAULT_VSOCK_PORTS { let _ = vsock_backend.listen(port); @@ -190,7 +202,12 @@ impl DeviceManager { } else { for vp in &ctx.vsock_ports { let host_port = vp.host_tcp_port.unwrap_or(vp.port as u16); - let _ = vsock_backend.listen_on(vp.port, host_port); + if vp.listen { + let _ = vsock_backend.listen_on(vp.port, host_port); + } else { + let addr = format!("127.0.0.1:{}", host_port); + vsock_backend.connect_to(vp.port, addr); + } } } let virtio_vsock = VirtioMmioDevice::new(vsock_backend); @@ -228,6 +245,10 @@ impl DeviceManager { index: 3, active: virtio_net.is_some(), }, + MmioSlot { + index: 4, + active: virtio_blk2.is_some(), + }, ]; let devices = DeviceManager { @@ -239,6 +260,7 @@ impl DeviceManager { virtio_vsock, virtio_9p, virtio_net, + virtio_blk2, window_requested: false, last_tick: Instant::now(), port61_toggle: false, @@ -322,6 +344,7 @@ impl DeviceManager { let vsock_offset = address.wrapping_sub(mmio_base_for_slot(1)); let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); let net_offset = address.wrapping_sub(mmio_base_for_slot(3)); + let blk2_offset = address.wrapping_sub(mmio_base_for_slot(4)); if blk_offset < MMIO_SLOT_SIZE { if let Some(ref dev) = self.virtio_blk { @@ -343,6 +366,12 @@ impl DeviceManager { } else { 0 } + } else if blk2_offset < MMIO_SLOT_SIZE { + if let Some(ref dev) = self.virtio_blk2 { + dev.read(blk2_offset, size) as u64 + } else { + 0 + } } else { 0 } @@ -361,8 +390,8 @@ impl DeviceManager { let blk_offset = address.wrapping_sub(mmio_base_for_slot(0)); let vsock_offset = address.wrapping_sub(mmio_base_for_slot(1)); let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); - let net_offset = address.wrapping_sub(mmio_base_for_slot(3)); + let blk2_offset = address.wrapping_sub(mmio_base_for_slot(4)); if blk_offset < MMIO_SLOT_SIZE { if let Some(ref mut dev) = self.virtio_blk { @@ -389,6 +418,12 @@ impl DeviceManager { self.pic.raise_irq(irq_for_slot(3)); } } + } else if blk2_offset < MMIO_SLOT_SIZE { + if let Some(ref mut dev) = self.virtio_blk2 { + if dev.write(blk2_offset, data as u32, size, mem) { + self.pic.raise_irq(irq_for_slot(4)); + } + } } } @@ -480,6 +515,7 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { virtio_vsock: VirtioMmioDevice::new(vsock_backend), virtio_9p: None, virtio_net: None, + virtio_blk2: None, window_requested: false, last_tick: Instant::now(), port61_toggle: false, diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index 360d4ec84..0e8ae8a65 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -49,8 +49,14 @@ pub struct VirtioVsock { guest_cid: u64, /// Active connections keyed by (guest_port, host_port). connections: HashMap, - /// TCP listeners on the host side, keyed by host port. + /// TCP listeners on the host side, keyed by vsock port. + /// Used for host-initiated connections (host TCP → guest vsock). listeners: HashMap, + /// Outbound TCP targets keyed by vsock port. + /// Used for guest-initiated connections (guest vsock → host TCP). + /// When the guest connects to a port in this map, the device makes + /// an outbound TCP connection to the specified address. + connect_targets: HashMap, /// Accepted TCP streams, keyed by (guest_port, host_port). streams: HashMap, /// Pending response/control packets to inject into the RX queue. @@ -64,6 +70,7 @@ impl VirtioVsock { guest_cid, connections: HashMap::new(), listeners: HashMap::new(), + connect_targets: HashMap::new(), streams: HashMap::new(), rx_pending: Vec::new(), } @@ -89,6 +96,16 @@ impl VirtioVsock { Ok(()) } + /// Register an outbound TCP target for guest-initiated connections. + /// + /// When the guest connects to `vsock_port`, the device makes an outbound + /// TCP connection to `host_addr` instead of accepting from a listener. + /// Used for notification channels where the guest initiates the connection + /// and the host is already listening. + pub fn connect_to(&mut self, vsock_port: u32, host_addr: String) { + self.connect_targets.insert(vsock_port, host_addr); + } + /// Get the guest CID. pub fn guest_cid(&self) -> u64 { self.guest_cid @@ -198,11 +215,46 @@ impl VirtioVsock { fn handle_connect_request(&mut self, hdr: &VsockHeader) { let key = (hdr.src_port, hdr.dst_port); - // Check if we have a listener on the requested host port. - let has_listener = self.listeners.contains_key(&hdr.dst_port); + // Try outbound connection first (guest-initiated → host TCP target). + if let Some(addr) = self.connect_targets.get(&hdr.dst_port).cloned() { + let stream = match TcpStream::connect(&addr) { + Ok(stream) => { + let _ = stream.set_nonblocking(true); + stream + } + Err(_) => { + let rst = VsockHeader::new_rst( + VSOCK_CID_HOST, + hdr.dst_port, + self.guest_cid, + hdr.src_port, + ); + self.rx_pending.push((rst, Vec::new())); + return; + } + }; + + let mut conn = + VsockConnection::new(VSOCK_CID_HOST, hdr.dst_port, self.guest_cid, hdr.src_port); + + if let Some(resp) = conn.handle_request(hdr) { + self.rx_pending.push((resp, Vec::new())); + self.connections.insert(key, conn); + self.streams.insert(key, stream); + } else { + let rst = VsockHeader::new_rst( + VSOCK_CID_HOST, + hdr.dst_port, + self.guest_cid, + hdr.src_port, + ); + self.rx_pending.push((rst, Vec::new())); + } + return; + } - if !has_listener { - // No listener -> RST. + // Fall back to listener-based connection (host-initiated). + if !self.listeners.contains_key(&hdr.dst_port) { let rst = VsockHeader::new_rst(VSOCK_CID_HOST, hdr.dst_port, self.guest_cid, hdr.src_port); self.rx_pending.push((rst, Vec::new())); @@ -1235,4 +1287,129 @@ mod tests { let payload = mem.read_bytes(rx_buf + VSOCK_HEADER_SIZE as u64, 8); assert_eq!(payload, b"tcp data"); } + + // --- Guest-initiated outbound connection --- + + #[test] + fn test_connect_to_registers_target() { + let mut dev = VirtioVsock::new(3); + dev.connect_to(2696, "127.0.0.1:9999".to_string()); + assert_eq!(dev.connect_targets.len(), 1); + assert!(dev.connect_targets.contains_key(&2696)); + } + + #[test] + fn test_connect_to_outbound_success() { + // Set up a host-side TCP listener to receive the outbound connection. + let host_listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = host_listener.local_addr().unwrap().port(); + + let mut dev = VirtioVsock::new(3); + dev.connect_to(2696, format!("127.0.0.1:{}", host_port)); + + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + // Guest sends CONNECT to vsock port 2696. + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2696, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + + // Should get RESPONSE (not RST) and have a connection + stream. + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, packet::VSOCK_OP_RESPONSE); + assert_eq!(dev.connection_count(), 1); + assert_eq!(dev.streams.len(), 1); + + // Host listener should have received the connection. + host_listener.set_nonblocking(true).unwrap(); + let accepted = host_listener.accept(); + assert!(accepted.is_ok(), "Host should have received TCP connection"); + } + + #[test] + fn test_connect_to_unreachable_sends_rst() { + let mut dev = VirtioVsock::new(3); + // Port 1 is not listening — connection will fail. + dev.connect_to(2696, "127.0.0.1:1".to_string()); + + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2696, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + + // Should get RST because target is unreachable. + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, VSOCK_OP_RST); + assert_eq!(dev.connection_count(), 0); + } + + #[test] + fn test_connect_to_preferred_over_listener() { + // If both connect_target and listener exist for same port, + // connect_target should be used (checked first). + let host_listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = host_listener.local_addr().unwrap().port(); + + let mut dev = VirtioVsock::new(3); + dev.connect_to(2696, format!("127.0.0.1:{}", host_port)); + dev.listen_on(2696, 0).unwrap(); // Also add a listener on same vsock port. + + let mem = MockMem::new(0x10000); + let mut tx_queue = setup_queue(128); + + let hdr = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2696, + len: 0, + type_: 1, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + mem.write_bytes(BUF_BASE, &hdr.to_bytes()); + write_descriptor(&mem, 0, BUF_BASE, VSOCK_HEADER_SIZE as u32, 0, 0); + push_avail(&mem, 0, 0); + + dev.process_tx(&mut tx_queue, &mem); + + // Should get RESPONSE via outbound connection. + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, packet::VSOCK_OP_RESPONSE); + assert_eq!(dev.connection_count(), 1); + assert_eq!(dev.streams.len(), 1); + } } diff --git a/src/vmm/src/windows/memory.rs b/src/vmm/src/windows/memory.rs index b8ddf27eb..7a24c6b54 100644 --- a/src/vmm/src/windows/memory.rs +++ b/src/vmm/src/windows/memory.rs @@ -43,6 +43,10 @@ pub const BOOT_STACK_POINTER: u64 = 0x8FF0; /// Virtio-MMIO base address (above guest RAM, below 4GB identity map). pub const VIRTIO_MMIO_BASE: u64 = 0xD000_0000; +/// Size of the MMIO region reserved for virtio devices. +/// 2MB provides room for many devices and aligns with 2MB page table granularity. +pub const MMIO_REGION_SIZE: u64 = 0x20_0000; + // Windows-specific guest memory allocation and mapping. #[cfg(target_os = "windows")] mod imp { @@ -201,14 +205,35 @@ mod imp { } impl GuestMemory { - /// Create guest memory with a single contiguous region starting at GPA 0. + /// Create guest memory, leaving a hole for the MMIO region if RAM exceeds it. + /// + /// When `size_mib` is large enough that RAM overlaps `VIRTIO_MMIO_BASE`, + /// the memory is split into two regions with an unmapped gap so that WHPX + /// generates MMIO exits (instead of treating device accesses as RAM reads). pub fn new(size_mib: u32) -> Result { let size = (size_mib as u64) * 1024 * 1024; - let region = GuestMemoryRegion::new(0, size)?; - Ok(GuestMemory { - regions: vec![region], - total_size: size, - }) + + if size > super::VIRTIO_MMIO_BASE { + // RAM extends past MMIO region — split into two regions. + // Region 1: GPA 0 .. VIRTIO_MMIO_BASE + // (hole): VIRTIO_MMIO_BASE .. VIRTIO_MMIO_BASE + MMIO_REGION_SIZE + // Region 2: VIRTIO_MMIO_BASE + MMIO_REGION_SIZE .. ram_end + let mmio_base = super::VIRTIO_MMIO_BASE; + let mmio_end = mmio_base + super::MMIO_REGION_SIZE; + let region1 = GuestMemoryRegion::new(0, mmio_base)?; + let region2 = GuestMemoryRegion::new(mmio_end, size - mmio_end)?; + Ok(GuestMemory { + regions: vec![region1, region2], + total_size: size, + }) + } else { + // RAM fits below MMIO — single contiguous region. + let region = GuestMemoryRegion::new(0, size)?; + Ok(GuestMemory { + regions: vec![region], + total_size: size, + }) + } } /// Map all guest memory regions into a WHPX partition. diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 1e2812e8e..a50293e4e 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -180,6 +180,15 @@ mod imp { regs.rip, cmdline ); + // Also emit to stderr so shim can capture it (log::* may be silently dropped + // when no log→tracing bridge is installed). + eprintln!( + "[WHPX] Kernel loaded, RIP={:#X}, ram={}MB, mmio_hole={}, cmdline_len={}", + regs.rip, + ctx.ram_mib, + ctx.ram_mib as u64 * 1024 * 1024 > crate::windows::memory::VIRTIO_MMIO_BASE, + cmdline.len() + ); // Create vCPU and set registers. let vcpu = WhpxVcpu::new(&partition, 0)?; @@ -337,6 +346,14 @@ mod imp { ); log::info!(" IO_reads=[{}]", top_reads.join(", ")); log::info!(" IO_writes=[{}]", top_writes.join(", ")); + eprintln!( + "[WHPX] {:.1}s: exits={} RIP={:#X} console={}B mmio={}", + start_time.elapsed().as_secs_f64(), + exit_count, + regs.rip, + console_len, + mmio_count, + ); } } } @@ -393,6 +410,11 @@ mod imp { sregs.as_ref().map_or(0, |s| s.cr4), sregs.as_ref().map_or(0, |s| s.efer), ); + eprintln!( + "[WHPX] TRIPLE FAULT after {} exits, RIP={:#X}", + exit_count, + regs.as_ref().map_or(0, |r| r.rip), + ); exit_code = -1; break; } @@ -419,6 +441,7 @@ mod imp { let _ = timer_thread.join(); log::info!("VM exited with code {} ({} exits)", exit_code, exit_count); + eprintln!("[WHPX] VM exited, code={} exits={}", exit_code, exit_count); Ok(exit_code) } From 7bf9cc746c4979d682b9811681127c11a1234536 Mon Sep 17 00:00:00 2001 From: lile Date: Mon, 20 Apr 2026 16:25:02 +0800 Subject: [PATCH 11/33] perf(vmm): serial FIFO batching and code quality improvements - Add 16-byte TX FIFO to serial device: batch output flushes on newline, buffer full, IIR read, or FCR reset. Reduces per-byte VM exits during kernel boot console output. - Format PIT time-based counter code for readability. - Standardize imports across virtio devices (alphabetical ordering, error types first). - Improve code formatting in whpx.rs, builder.rs, and virtio modules. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/builder.rs | 8 +- src/vmm/src/windows/devices/pit.rs | 6 +- src/vmm/src/windows/devices/serial.rs | 178 +++++++++++++++++- src/vmm/src/windows/devices/virtio/block.rs | 15 +- src/vmm/src/windows/devices/virtio/mmio.rs | 4 +- src/vmm/src/windows/devices/virtio/net.rs | 4 +- src/vmm/src/windows/devices/virtio/p9/mod.rs | 12 +- .../src/windows/devices/virtio/vsock/mod.rs | 16 +- .../windows/devices/virtio/vsock/packet.rs | 4 +- src/vmm/src/windows/whpx.rs | 7 +- 10 files changed, 216 insertions(+), 38 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 898bb56ad..5ddb9deff 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -3,9 +3,9 @@ //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM. -use crossbeam_channel::Sender; #[cfg(target_os = "macos")] use crossbeam_channel::unbounded; +use crossbeam_channel::Sender; use kernel::cmdline::Cmdline; #[cfg(target_os = "macos")] use std::collections::HashMap; @@ -45,7 +45,7 @@ use devices::legacy::{IoApic, IrqChipT}; use devices::legacy::{IrqChip, IrqChipDevice}; #[cfg(all(target_os = "linux", target_arch = "aarch64"))] use devices::legacy::{KvmGicV2, KvmGicV3}; -use devices::virtio::{MmioTransport, PortDescription, VirtioDevice, Vsock, port_io}; +use devices::virtio::{port_io, MmioTransport, PortDescription, VirtioDevice, Vsock}; #[cfg(feature = "tee")] use kbs_types::Tee; @@ -88,7 +88,7 @@ use nix::unistd::isatty; use polly::event_manager::{Error as EventManagerError, EventManager}; use utils::eventfd::EventFd; use utils::worker_message::WorkerMessage; -#[cfg(all(target_arch = "x86_64", not(feature = "tee")))] +#[cfg(all(target_arch = "x86_64", not(feature = "efi"), not(feature = "tee")))] use vm_memory::mmap::MmapRegion; #[cfg(not(any(feature = "tee", feature = "nitro")))] use vm_memory::Address; @@ -97,8 +97,6 @@ use vm_memory::Bytes; use vm_memory::GuestMemory; #[cfg(all(target_arch = "x86_64", not(feature = "tee")))] use vm_memory::GuestRegionMmap; -#[cfg(all(target_arch = "x86_64", not(feature = "efi"), not(feature = "tee")))] -use vm_memory::mmap::MmapRegion; use vm_memory::{GuestAddress, GuestMemoryMmap}; /// Errors associated with starting the instance. diff --git a/src/vmm/src/windows/devices/pit.rs b/src/vmm/src/windows/devices/pit.rs index 1ffc0e003..586e385c6 100644 --- a/src/vmm/src/windows/devices/pit.rs +++ b/src/vmm/src/windows/devices/pit.rs @@ -178,7 +178,11 @@ impl PitCounter { /// (the maximum 16-bit count). This matches BIOS behavior where the /// PIT is initialized with reload=0 giving ~18.2 Hz. fn effective_reload(&self) -> u64 { - if self.reload == 0 { 65536 } else { self.reload as u64 } + if self.reload == 0 { + 65536 + } else { + self.reload as u64 + } } /// Compute the current counter value based on accumulated time. diff --git a/src/vmm/src/windows/devices/serial.rs b/src/vmm/src/windows/devices/serial.rs index f188354ad..87baee440 100644 --- a/src/vmm/src/windows/devices/serial.rs +++ b/src/vmm/src/windows/devices/serial.rs @@ -41,6 +41,14 @@ const IIR_FIFO_ENABLED: u8 = 0xC0; /// IER bit: Transmitter Holding Register Empty interrupt. const IER_THRE: u8 = 0x02; +/// FCR bit: FIFO Enable. +const FCR_FIFO_ENABLE: u8 = 0x01; +/// FCR bit: Transmit FIFO Reset. +const FCR_TX_RESET: u8 = 0x04; + +/// 16550 FIFO depth (bytes). +const FIFO_SIZE: usize = 16; + /// Serial port state. struct SerialState { /// Interrupt Enable Register. @@ -63,6 +71,12 @@ struct SerialState { output: Box, /// THRE interrupt pending (set after THR write when IER THRE bit is set). thre_pending: bool, + /// Whether FIFO mode is enabled (FCR bit 0). + fifo_enabled: bool, + /// Transmit FIFO buffer. When FIFO is enabled, bytes are buffered here + /// and flushed to `output` when the buffer is full, a newline is written, + /// or the guest reads IIR (polling for completion). + tx_fifo: Vec, } /// 16550 UART emulation. @@ -87,6 +101,8 @@ impl Serial { dlh: 0, output, thre_pending: false, + fifo_enabled: false, + tx_fifo: Vec::with_capacity(FIFO_SIZE), }), } } @@ -130,7 +146,13 @@ impl Serial { } } 2 => { - // IIR — check for pending interrupt + // IIR — check for pending interrupt. + // Flush any buffered FIFO data (guest is polling for completion). + if state.fifo_enabled && !state.tx_fifo.is_empty() { + let pending: Vec = state.tx_fifo.drain(..).collect(); + let _ = state.output.write_all(&pending); + let _ = state.output.flush(); + } if state.thre_pending { state.thre_pending = false; IIR_THRE | IIR_FIFO_ENABLED @@ -162,13 +184,23 @@ impl Serial { 0 => { if dlab { state.dll = data; + } else if state.fifo_enabled { + // THR with FIFO: buffer bytes, flush on newline or full. + state.tx_fifo.push(data); + if data == b'\n' || state.tx_fifo.len() >= FIFO_SIZE { + let pending: Vec = state.tx_fifo.drain(..).collect(); + let _ = state.output.write_all(&pending); + let _ = state.output.flush(); + } + state.lsr |= LSR_THR_EMPTY | LSR_IDLE; + if state.ier & IER_THRE != 0 { + state.thre_pending = true; + } } else { - // THR — transmit holding register: output the character + // THR without FIFO: immediate output per byte. let _ = state.output.write_all(&[data]); let _ = state.output.flush(); - // THR is always ready (we write synchronously) state.lsr |= LSR_THR_EMPTY | LSR_IDLE; - // Signal THRE interrupt if enabled if state.ier & IER_THRE != 0 { state.thre_pending = true; } @@ -190,7 +222,22 @@ impl Serial { } } 2 => { - // FCR — FIFO control (we acknowledge but don't implement FIFO) + // FCR — FIFO Control Register. + state.fifo_enabled = data & FCR_FIFO_ENABLE != 0; + if data & FCR_TX_RESET != 0 { + // TX FIFO reset: flush pending data and clear buffer. + if !state.tx_fifo.is_empty() { + let pending: Vec = state.tx_fifo.drain(..).collect(); + let _ = state.output.write_all(&pending); + let _ = state.output.flush(); + } + } + if !state.fifo_enabled && !state.tx_fifo.is_empty() { + // Disabling FIFO: flush remaining data. + let pending: Vec = state.tx_fifo.drain(..).collect(); + let _ = state.output.write_all(&pending); + let _ = state.output.flush(); + } } 3 => state.lcr = data, 4 => state.mcr = data & 0x1F, // Only lower 5 bits valid @@ -378,4 +425,125 @@ mod tests { let lsr = serial.read(COM1_BASE + 5); assert_ne!(lsr & LSR_THR_EMPTY, 0, "THR should be ready after write"); } + + // ---- FIFO tests ---- + + #[test] + fn test_fifo_enable_via_fcr() { + let (serial, _) = create_test_serial(); + // FIFO should be disabled initially. + assert!(!serial.state.lock().unwrap().fifo_enabled); + // Write FCR with FIFO enable bit. + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE); + assert!(serial.state.lock().unwrap().fifo_enabled); + // Disable FIFO. + serial.write(COM1_BASE + 2, 0); + assert!(!serial.state.lock().unwrap().fifo_enabled); + } + + #[test] + fn test_fifo_batches_output() { + let (serial, buffer) = create_test_serial(); + // Enable FIFO. + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE); + + // Write bytes that don't trigger flush (no newline, under FIFO_SIZE). + for &b in b"Hello" { + serial.write(COM1_BASE, b); + } + // Buffer should be empty (data is in FIFO, not flushed yet). + assert!( + buffer.lock().unwrap().is_empty(), + "FIFO should batch writes" + ); + + // Write newline to trigger flush. + serial.write(COM1_BASE, b'\n'); + let captured = buffer.lock().unwrap().clone(); + assert_eq!(captured, b"Hello\n", "newline should flush FIFO"); + } + + #[test] + fn test_fifo_flushes_on_full() { + let (serial, buffer) = create_test_serial(); + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE); + + // Write exactly FIFO_SIZE bytes (no newline). + for i in 0..FIFO_SIZE { + serial.write(COM1_BASE, b'A' + (i as u8 % 26)); + } + // Should have flushed on the 16th byte. + let captured = buffer.lock().unwrap().clone(); + assert_eq!(captured.len(), FIFO_SIZE, "FIFO should flush when full"); + } + + #[test] + fn test_fifo_flushes_on_iir_read() { + let (serial, buffer) = create_test_serial(); + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE); + + // Write partial data (no newline, under FIFO_SIZE). + for &b in b"Test" { + serial.write(COM1_BASE, b); + } + assert!(buffer.lock().unwrap().is_empty(), "not flushed yet"); + + // Read IIR — should flush the FIFO. + let _iir = serial.read(COM1_BASE + 2); + let captured = buffer.lock().unwrap().clone(); + assert_eq!(captured, b"Test", "IIR read should flush FIFO"); + } + + #[test] + fn test_fifo_disable_flushes_remaining() { + let (serial, buffer) = create_test_serial(); + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE); + + for &b in b"Data" { + serial.write(COM1_BASE, b); + } + assert!(buffer.lock().unwrap().is_empty()); + + // Disable FIFO — should flush remaining data. + serial.write(COM1_BASE + 2, 0); + let captured = buffer.lock().unwrap().clone(); + assert_eq!(captured, b"Data", "disabling FIFO should flush"); + } + + #[test] + fn test_fifo_tx_reset_flushes() { + let (serial, buffer) = create_test_serial(); + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE); + + for &b in b"Reset" { + serial.write(COM1_BASE, b); + } + assert!(buffer.lock().unwrap().is_empty()); + + // TX FIFO reset. + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE | FCR_TX_RESET); + let captured = buffer.lock().unwrap().clone(); + assert_eq!(captured, b"Reset", "TX reset should flush FIFO"); + } + + #[test] + fn test_no_fifo_immediate_output() { + let (serial, buffer) = create_test_serial(); + // FIFO disabled (default) — each byte goes out immediately. + serial.write(COM1_BASE, b'A'); + assert_eq!(buffer.lock().unwrap().as_slice(), b"A"); + serial.write(COM1_BASE, b'B'); + assert_eq!(buffer.lock().unwrap().as_slice(), b"AB"); + } + + #[test] + fn test_fifo_lsr_stays_ready() { + let (serial, _) = create_test_serial(); + serial.write(COM1_BASE + 2, FCR_FIFO_ENABLE); + + // Even with FIFO buffering, LSR should report THR empty. + serial.write(COM1_BASE, b'X'); + let lsr = serial.read(COM1_BASE + 5); + assert_ne!(lsr & LSR_THR_EMPTY, 0, "THR should be ready in FIFO mode"); + } } diff --git a/src/vmm/src/windows/devices/virtio/block.rs b/src/vmm/src/windows/devices/virtio/block.rs index 2ecb66ebb..3a7b1e907 100644 --- a/src/vmm/src/windows/devices/virtio/block.rs +++ b/src/vmm/src/windows/devices/virtio/block.rs @@ -128,7 +128,8 @@ impl VirtioBlock { if !desc.is_write() { log::debug!( "BLK READ: desc[{}] not writable, flags=0x{:X}", - i, desc.flags + i, + desc.flags ); return VIRTIO_BLK_S_IOERR; } @@ -136,14 +137,18 @@ impl VirtioBlock { if let Err(e) = self.disk.read_at(offset, &mut buf) { log::debug!( "BLK READ: disk.read_at(0x{:X}, {}) failed: {}", - offset, desc.len, e + offset, + desc.len, + e ); return VIRTIO_BLK_S_IOERR; } if let Err(e) = mem.write_at(desc.addr, &buf) { log::debug!( "BLK READ: mem.write_at(0x{:X}, {}) failed: {}", - desc.addr, buf.len(), e + desc.addr, + buf.len(), + e ); return VIRTIO_BLK_S_IOERR; } @@ -264,9 +269,9 @@ impl VirtioDeviceBackend for VirtioBlock { #[cfg(test)] mod tests { - use super::*; - use super::disk::RawDiskBackend; use super::super::super::error::WkrunError; + use super::disk::RawDiskBackend; + use super::*; use std::cell::RefCell; use std::fs::File; use std::io::Write as IoWrite; diff --git a/src/vmm/src/windows/devices/virtio/mmio.rs b/src/vmm/src/windows/devices/virtio/mmio.rs index fd7646c0f..946eb04ff 100644 --- a/src/vmm/src/windows/devices/virtio/mmio.rs +++ b/src/vmm/src/windows/devices/virtio/mmio.rs @@ -327,9 +327,9 @@ impl VirtioMmioDevice { #[cfg(test)] mod tests { - use super::*; - use super::queue::GuestMemoryAccessor; use super::super::super::error::Result; + use super::queue::GuestMemoryAccessor; + use super::*; use std::cell::RefCell; /// Null backend for testing the MMIO transport layer. diff --git a/src/vmm/src/windows/devices/virtio/net.rs b/src/vmm/src/windows/devices/virtio/net.rs index b7c63ab9b..950aa2030 100644 --- a/src/vmm/src/windows/devices/virtio/net.rs +++ b/src/vmm/src/windows/devices/virtio/net.rs @@ -394,9 +394,9 @@ impl VirtioDeviceBackend for VirtioNet { #[cfg(test)] mod tests { - use super::*; - use super::queue::Virtqueue; use super::super::super::error::{Result, WkrunError}; + use super::queue::Virtqueue; + use super::*; use std::cell::RefCell; struct MockMem { diff --git a/src/vmm/src/windows/devices/virtio/p9/mod.rs b/src/vmm/src/windows/devices/virtio/p9/mod.rs index 24d19567e..a4816ad8e 100644 --- a/src/vmm/src/windows/devices/virtio/p9/mod.rs +++ b/src/vmm/src/windows/devices/virtio/p9/mod.rs @@ -324,9 +324,9 @@ impl VirtioDeviceBackend for Virtio9p { #[cfg(test)] mod tests { - use super::*; - use super::super::queue::Virtqueue; use super::super::super::super::error::Result; + use super::super::queue::Virtqueue; + use super::*; use std::cell::RefCell; use std::io::Write as IoWrite; use tempfile::TempDir; @@ -372,7 +372,9 @@ mod tests { let a = addr as usize; let data = self.data.borrow(); if a + buf.len() > data.len() { - return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + return Err(super::super::super::super::error::WkrunError::Memory( + "out of bounds".into(), + )); } buf.copy_from_slice(&data[a..a + buf.len()]); Ok(()) @@ -381,7 +383,9 @@ mod tests { let a = addr as usize; let mut mem = self.data.borrow_mut(); if a + data.len() > mem.len() { - return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + return Err(super::super::super::super::error::WkrunError::Memory( + "out of bounds".into(), + )); } mem[a..a + data.len()].copy_from_slice(data); Ok(()) diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index 0e8ae8a65..9b1eba5fb 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -19,9 +19,7 @@ use std::net::{TcpListener, TcpStream}; use super::mmio::VirtioDeviceBackend; use super::queue::{GuestMemoryAccessor, Virtqueue}; use connection::{ConnState, VsockConnection}; -use packet::{ - VsockHeader, VSOCK_CID_HOST, VSOCK_HEADER_SIZE, VSOCK_OP_REQUEST, -}; +use packet::{VsockHeader, VSOCK_CID_HOST, VSOCK_HEADER_SIZE, VSOCK_OP_REQUEST}; /// Virtio device ID for vsock (spec Section 5.10). const VIRTIO_VSOCK_ID: u32 = 19; @@ -487,10 +485,10 @@ impl VirtioDeviceBackend for VirtioVsock { #[cfg(test)] mod tests { - use super::*; + use super::super::super::super::error::Result; use super::super::queue::Virtqueue; use super::packet::VSOCK_OP_RST; - use super::super::super::super::error::Result; + use super::*; use std::cell::RefCell; struct MockMem { @@ -534,7 +532,9 @@ mod tests { let a = addr as usize; let data = self.data.borrow(); if a + buf.len() > data.len() { - return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + return Err(super::super::super::super::error::WkrunError::Memory( + "out of bounds".into(), + )); } buf.copy_from_slice(&data[a..a + buf.len()]); Ok(()) @@ -543,7 +543,9 @@ mod tests { let a = addr as usize; let mut mem = self.data.borrow_mut(); if a + data.len() > mem.len() { - return Err(super::super::super::super::error::WkrunError::Memory("out of bounds".into())); + return Err(super::super::super::super::error::WkrunError::Memory( + "out of bounds".into(), + )); } mem[a..a + data.len()].copy_from_slice(data); Ok(()) diff --git a/src/vmm/src/windows/devices/virtio/vsock/packet.rs b/src/vmm/src/windows/devices/virtio/vsock/packet.rs index 87717036a..740450aeb 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/packet.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/packet.rs @@ -4,8 +4,8 @@ //! TX and RX virtqueues. It carries addressing, flow control credits, //! and operation codes for the vsock connection protocol. -use super::super::queue::GuestMemoryAccessor; use super::super::super::super::error::{Result, WkrunError}; +use super::super::queue::GuestMemoryAccessor; // --- CID constants --- @@ -247,8 +247,8 @@ impl VsockHeader { #[cfg(test)] mod tests { - use super::*; use super::super::super::super::error::Result; + use super::*; use std::cell::RefCell; struct MockMem(RefCell>); diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs index 7fba6383a..806d358db 100644 --- a/src/vmm/src/windows/whpx.rs +++ b/src/vmm/src/windows/whpx.rs @@ -704,11 +704,8 @@ mod imp { let new_rdx = value >> 32; let names = [WHvX64RegisterRip, WHvX64RegisterRax, WHvX64RegisterRdx]; - let values: Vec = vec![ - reg64(new_rip), - reg64(new_rax), - reg64(new_rdx), - ]; + let values: Vec = + vec![reg64(new_rip), reg64(new_rax), reg64(new_rdx)]; let hr = unsafe { WHvSetVirtualProcessorRegisters( self.partition_handle, From d5e10e01065a8420d1474436c6c7aa9707dc49bc Mon Sep 17 00:00:00 2001 From: lile Date: Mon, 20 Apr 2026 16:25:41 +0800 Subject: [PATCH 12/33] feat(vmm): ACPI S5 instant shutdown with quiet/verbose boot modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add minimal ACPI tables (RSDP, RSDT, FADT, DSDT ~227 bytes) to guest memory so Linux discovers PM1a_CNT at port 0x604. When the kernel writes S5 sleep type (SLP_EN=1, SLP_TYP=5), the VMM detects shutdown instantly instead of waiting for the HLT loop timeout. Key changes: - New acpi.rs: generates ACPI tables with \_S5_ AML package at 0xE0000 - loader.rs: writes ACPI tables to guest memory, adds E820_ACPI entry - manager.rs: PM1a_CNT/EVT port handlers, shutdown_requested flag - runner.rs: checks shutdown_requested after IO out, MAX_HALTS 5000→50 - cmdline.rs: removes noacpi, adds quiet/verbose modes - context.rs: verbose field for debug cmdline selection - boot_kernel.rs: --verbose CLI flag Results: VM exits 27K→13.5K (50% reduction), shutdown 5s→0ms. Co-Authored-By: Claude Opus 4.6 --- src/vmm/examples/boot_kernel.rs | 32 +++- src/vmm/src/windows/boot/acpi.rs | 234 +++++++++++++++++++++++++ src/vmm/src/windows/boot/loader.rs | 109 ++++++++---- src/vmm/src/windows/boot/mod.rs | 1 + src/vmm/src/windows/cmdline.rs | 124 +++++++++---- src/vmm/src/windows/context.rs | 7 + src/vmm/src/windows/devices/manager.rs | 91 ++++++++-- src/vmm/src/windows/runner.rs | 87 ++++++--- 8 files changed, 580 insertions(+), 105 deletions(-) create mode 100644 src/vmm/src/windows/boot/acpi.rs diff --git a/src/vmm/examples/boot_kernel.rs b/src/vmm/examples/boot_kernel.rs index 88594f288..56dd4f0d3 100644 --- a/src/vmm/examples/boot_kernel.rs +++ b/src/vmm/examples/boot_kernel.rs @@ -11,6 +11,7 @@ //! --argv Arguments passed to init after `--` separator (repeat for each arg) //! --vsock-listen : VMM listens on TCP, bridges to guest vsock //! --vsock-connect : VMM connects to TCP when guest connects to vsock +//! --verbose Enable serial console output (slower boot, useful for debugging) //! //! Examples: //! # Boot with initramfs only (existing behavior) @@ -58,6 +59,7 @@ fn main() { let mut init_argv: Vec = Vec::new(); let mut vsock_ports: Vec = Vec::new(); let mut extra_cmdline: Vec<&str> = Vec::new(); + let mut verbose = false; let mut past_separator = false; let mut i = 2; @@ -119,6 +121,9 @@ fn main() { } init_argv.push(args[i].clone()); } + "--verbose" => { + verbose = true; + } "--vsock-listen" | "--vsock-connect" => { let is_listen = arg == "--vsock-listen"; i += 1; @@ -152,7 +157,10 @@ fn main() { if p.exists() { initrd_path = Some(p); } else { - eprintln!("Warning: initrd not found: {}, treating as cmdline arg", arg); + eprintln!( + "Warning: initrd not found: {}, treating as cmdline arg", + arg + ); extra_cmdline.push(arg); } } else { @@ -190,6 +198,9 @@ fn main() { ctx.exec_path = init_path.clone(); ctx.argv = init_argv.clone(); + // Verbose mode: enable serial console output for debugging. + ctx.verbose = verbose; + // Extra cmdline args are appended after the base cmdline and MMIO // device lines that build_kernel_cmdline() generates automatically. if !extra_cmdline.is_empty() { @@ -217,7 +228,11 @@ fn main() { .unwrap_or_else(|| "(none)".to_string()) ); if let Some(ref root) = root_device { - println!("Root: {} (fstype: {})", root, root_fstype.as_deref().unwrap_or("auto")); + println!( + "Root: {} (fstype: {})", + root, + root_fstype.as_deref().unwrap_or("auto") + ); } if let Some(ref init) = init_path { println!("Init: {}", init); @@ -225,12 +240,21 @@ fn main() { if !init_argv.is_empty() { println!("Argv: {:?}", init_argv); } + if verbose { + println!("Verbose: enabled (serial console on, slower boot)"); + } for vp in &vsock_ports { let host_port = vp.host_tcp_port.unwrap_or(vp.port as u16); if vp.listen { - println!("Vsock: guest:{} <- TCP listen:{} (host→guest)", vp.port, host_port); + println!( + "Vsock: guest:{} <- TCP listen:{} (host→guest)", + vp.port, host_port + ); } else { - println!("Vsock: guest:{} -> TCP connect:127.0.0.1:{} (guest→host)", vp.port, host_port); + println!( + "Vsock: guest:{} -> TCP connect:127.0.0.1:{} (guest→host)", + vp.port, host_port + ); } } diff --git a/src/vmm/src/windows/boot/acpi.rs b/src/vmm/src/windows/boot/acpi.rs new file mode 100644 index 000000000..a743935c2 --- /dev/null +++ b/src/vmm/src/windows/boot/acpi.rs @@ -0,0 +1,234 @@ +//! Minimal ACPI table generation for WHPX guest boot. +//! +//! Generates RSDP, RSDT, FADT, and DSDT tables so the Linux kernel can +//! discover the PM1a_CNT register and perform clean ACPI S5 shutdown +//! instead of falling back to an HLT loop. + +/// Total size of the ACPI region in guest memory. +pub const ACPI_REGION_SIZE: u64 = 0x200; // 512 bytes + +// Table offsets within the ACPI region. +const RSDP_OFFSET: usize = 0x00; +const RSDT_OFFSET: usize = 0x20; +const FADT_OFFSET: usize = 0x60; +const DSDT_OFFSET: usize = 0x100; + +// Table sizes. +const RSDP_SIZE: usize = 20; +const RSDT_SIZE: usize = 40; // 36-byte header + 4-byte entry +const FADT_SIZE: usize = 116; +const DSDT_HEADER_SIZE: usize = 36; + +// ACPI PM1a I/O port addresses (must match manager.rs constants). +const PM1A_EVT_BLK: u32 = 0x600; +const PM1A_CNT_BLK: u32 = 0x604; + +/// SCI interrupt number for ACPI. +/// +/// Must not conflict with timer (IRQ 0), serial (IRQ 4), or +/// virtio-MMIO devices (IRQ 5-9). IRQ 11 is unused. +const SCI_INT: u16 = 11; + +/// AML bytecode for the `\_S5_` sleep package. +/// +/// Encodes: `Name(\_S5_, Package(4) { 5, 5, 0, 0 })` +/// - `08` = NameOp +/// - `5C 5F 53 35 5F` = `\_S5_` +/// - `12 0A 04` = Package, 10 bytes, 4 elements +/// - `0A 05` = ByteConst 5 (SLP_TYPa) +/// - `0A 05` = ByteConst 5 (SLP_TYPb) +/// - `00` = zero +/// - `00` = zero +const S5_AML: &[u8] = &[ + 0x08, 0x5C, 0x5F, 0x53, 0x35, 0x5F, 0x12, 0x0A, 0x04, 0x0A, 0x05, 0x0A, 0x05, 0x00, 0x00, +]; + +/// Build ACPI tables (RSDP, RSDT, FADT, DSDT) for the given base address. +/// +/// Returns a `Vec` of exactly `ACPI_REGION_SIZE` bytes. The caller +/// writes this to guest memory at `acpi_base`. +pub fn build_acpi_tables(acpi_base: u64) -> Vec { + let mut region = vec![0u8; ACPI_REGION_SIZE as usize]; + + let rsdt_addr = acpi_base + RSDT_OFFSET as u64; + let fadt_addr = acpi_base + FADT_OFFSET as u64; + let dsdt_addr = acpi_base + DSDT_OFFSET as u64; + + // ---- RSDP (20 bytes at offset 0x00) ---- + let rsdp = &mut region[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE]; + rsdp[0..8].copy_from_slice(b"RSD PTR "); // Signature + // rsdp[8] = checksum (computed below) + rsdp[9..15].copy_from_slice(b"BOXLTE"); // OEMID + rsdp[15] = 0; // Revision: ACPI 1.0 + rsdp[16..20].copy_from_slice(&(rsdt_addr as u32).to_le_bytes()); // RsdtAddress + acpi_checksum(&mut region[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE], 8); + + // ---- RSDT (40 bytes at offset 0x20) ---- + let rsdt = &mut region[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE]; + rsdt[0..4].copy_from_slice(b"RSDT"); // Signature + rsdt[4..8].copy_from_slice(&(RSDT_SIZE as u32).to_le_bytes()); // Length + rsdt[8] = 1; // Revision + // rsdt[9] = checksum (computed below) + rsdt[10..16].copy_from_slice(b"BOXLTE"); // OEMID + rsdt[16..24].copy_from_slice(b"BOXLITEV"); // OEM Table ID + rsdt[24..28].copy_from_slice(&1u32.to_le_bytes()); // OEM Revision + rsdt[28..32].copy_from_slice(b"BXLT"); // Creator ID + rsdt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision + // Entry[0]: pointer to FADT + rsdt[36..40].copy_from_slice(&(fadt_addr as u32).to_le_bytes()); + acpi_checksum(&mut region[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE], 9); + + // ---- FADT (116 bytes at offset 0x60) ---- + let fadt = &mut region[FADT_OFFSET..FADT_OFFSET + FADT_SIZE]; + fadt[0..4].copy_from_slice(b"FACP"); // Signature (NOT "FADT") + fadt[4..8].copy_from_slice(&(FADT_SIZE as u32).to_le_bytes()); // Length + fadt[8] = 1; // Revision + // fadt[9] = checksum (computed below) + fadt[10..16].copy_from_slice(b"BOXLTE"); // OEMID + fadt[16..24].copy_from_slice(b"BOXLITEV"); // OEM Table ID + fadt[24..28].copy_from_slice(&1u32.to_le_bytes()); // OEM Revision + fadt[28..32].copy_from_slice(b"BXLT"); // Creator ID + fadt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision + // FACS pointer (offset 36) — 0, not needed for shutdown. + // DSDT pointer (offset 40). + fadt[40..44].copy_from_slice(&(dsdt_addr as u32).to_le_bytes()); + // SCI_INT (offset 46) — interrupt for ACPI System Control. + fadt[46..48].copy_from_slice(&SCI_INT.to_le_bytes()); + // PM1a_EVT_BLK (offset 56). + fadt[56..60].copy_from_slice(&PM1A_EVT_BLK.to_le_bytes()); + // PM1a_CNT_BLK (offset 64). + fadt[64..68].copy_from_slice(&PM1A_CNT_BLK.to_le_bytes()); + // PM1_EVT_LEN (offset 88). + fadt[88] = 4; + // PM1_CNT_LEN (offset 89). + fadt[89] = 2; + acpi_checksum(&mut region[FADT_OFFSET..FADT_OFFSET + FADT_SIZE], 9); + + // ---- DSDT (header + AML at offset 0x100) ---- + let dsdt_size = DSDT_HEADER_SIZE + S5_AML.len(); + let dsdt = &mut region[DSDT_OFFSET..DSDT_OFFSET + dsdt_size]; + dsdt[0..4].copy_from_slice(b"DSDT"); // Signature + dsdt[4..8].copy_from_slice(&(dsdt_size as u32).to_le_bytes()); // Length + dsdt[8] = 1; // Revision + // dsdt[9] = checksum (computed below) + dsdt[10..16].copy_from_slice(b"BOXLTE"); // OEMID + dsdt[16..24].copy_from_slice(b"BOXLITEV"); // OEM Table ID + dsdt[24..28].copy_from_slice(&1u32.to_le_bytes()); // OEM Revision + dsdt[28..32].copy_from_slice(b"BXLT"); // Creator ID + dsdt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision + // AML body: \_S5_ package. + dsdt[DSDT_HEADER_SIZE..DSDT_HEADER_SIZE + S5_AML.len()].copy_from_slice(S5_AML); + acpi_checksum( + &mut region[DSDT_OFFSET..DSDT_OFFSET + dsdt_size], + 9, + ); + + region +} + +/// Compute ACPI checksum and store it at `checksum_offset`. +/// +/// The checksum byte is set so that the sum of all bytes in the table +/// equals zero (mod 256). +fn acpi_checksum(table: &mut [u8], checksum_offset: usize) { + table[checksum_offset] = 0; + let sum: u8 = table.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + table[checksum_offset] = 0u8.wrapping_sub(sum); +} + +#[cfg(test)] +mod tests { + use super::*; + + const TEST_BASE: u64 = 0xE0000; + + #[test] + fn test_rsdp_signature_and_checksum() { + let region = build_acpi_tables(TEST_BASE); + let rsdp = ®ion[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE]; + + assert_eq!(&rsdp[0..8], b"RSD PTR "); + + let sum: u8 = rsdp.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "RSDP checksum must be zero"); + } + + #[test] + fn test_rsdt_signature_and_length() { + let region = build_acpi_tables(TEST_BASE); + let rsdt = ®ion[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE]; + + assert_eq!(&rsdt[0..4], b"RSDT"); + let length = u32::from_le_bytes(rsdt[4..8].try_into().unwrap()); + assert_eq!(length, RSDT_SIZE as u32); + + let sum: u8 = rsdt.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "RSDT checksum must be zero"); + } + + #[test] + fn test_fadt_signature_and_pm1a_cnt() { + let region = build_acpi_tables(TEST_BASE); + let fadt = ®ion[FADT_OFFSET..FADT_OFFSET + FADT_SIZE]; + + assert_eq!(&fadt[0..4], b"FACP"); + + let pm1a_cnt = u32::from_le_bytes(fadt[64..68].try_into().unwrap()); + assert_eq!(pm1a_cnt, 0x604); + + let pm1a_evt = u32::from_le_bytes(fadt[56..60].try_into().unwrap()); + assert_eq!(pm1a_evt, 0x600); + + assert_eq!(fadt[88], 4, "PM1_EVT_LEN"); + assert_eq!(fadt[89], 2, "PM1_CNT_LEN"); + + let sum: u8 = fadt.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "FADT checksum must be zero"); + } + + #[test] + fn test_dsdt_contains_s5_package() { + let region = build_acpi_tables(TEST_BASE); + let dsdt_size = DSDT_HEADER_SIZE + S5_AML.len(); + let dsdt = ®ion[DSDT_OFFSET..DSDT_OFFSET + dsdt_size]; + + assert_eq!(&dsdt[0..4], b"DSDT"); + + // Verify \_S5_ AML is present. + let aml = &dsdt[DSDT_HEADER_SIZE..]; + assert_eq!(aml, S5_AML); + + let sum: u8 = dsdt.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "DSDT checksum must be zero"); + } + + #[test] + fn test_total_region_size() { + let region = build_acpi_tables(TEST_BASE); + assert_eq!(region.len(), ACPI_REGION_SIZE as usize); + } + + #[test] + fn test_rsdp_points_to_rsdt() { + let region = build_acpi_tables(TEST_BASE); + let rsdp = ®ion[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE]; + let rsdt_addr = u32::from_le_bytes(rsdp[16..20].try_into().unwrap()); + assert_eq!(rsdt_addr, (TEST_BASE + RSDT_OFFSET as u64) as u32); + } + + #[test] + fn test_fadt_sci_int() { + let region = build_acpi_tables(TEST_BASE); + let fadt = ®ion[FADT_OFFSET..FADT_OFFSET + FADT_SIZE]; + let sci_int = u16::from_le_bytes(fadt[46..48].try_into().unwrap()); + assert_eq!(sci_int, 11, "SCI_INT must be on an unused IRQ"); + } + + #[test] + fn test_fadt_points_to_dsdt() { + let region = build_acpi_tables(TEST_BASE); + let fadt = ®ion[FADT_OFFSET..FADT_OFFSET + FADT_SIZE]; + let dsdt_addr = u32::from_le_bytes(fadt[40..44].try_into().unwrap()); + assert_eq!(dsdt_addr, (TEST_BASE + DSDT_OFFSET as u64) as u32); + } +} diff --git a/src/vmm/src/windows/boot/loader.rs b/src/vmm/src/windows/boot/loader.rs index 454d3fe9a..cf213ea12 100644 --- a/src/vmm/src/windows/boot/loader.rs +++ b/src/vmm/src/windows/boot/loader.rs @@ -3,26 +3,28 @@ //! Parses a bzImage file, loads the protected-mode kernel into guest memory, //! sets up page tables, GDT, boot parameters, and kernel command line. -use super::params::HDRS_MAGIC; use super::super::error::{Result, WkrunError}; +use super::params::HDRS_MAGIC; -#[cfg(any(target_os = "windows", test))] -use super::params::{E820Entry, E820_RAM, E820_RESERVED}; #[cfg(any(target_os = "windows", test))] use super::super::memory::{MMIO_REGION_SIZE, VIRTIO_MMIO_BASE}; +#[cfg(any(target_os = "windows", test))] +use super::params::{E820Entry, E820_ACPI, E820_RAM, E820_RESERVED}; // These imports are only used by the Windows-only load_kernel() function. #[cfg(target_os = "windows")] -use super::params::BootParams; -#[cfg(target_os = "windows")] -use super::setup::{build_gdt, build_page_tables, configure_boot_registers, gdt_bytes}; -#[cfg(target_os = "windows")] use super::super::memory::{ - CMDLINE_MAX_SIZE, CMDLINE_START, KERNEL_64BIT_ENTRY_OFFSET, KERNEL_START, PDPT_START, PD_START, - PML4_START, ZERO_PAGE_START, + ACPI_START, CMDLINE_MAX_SIZE, CMDLINE_START, KERNEL_64BIT_ENTRY_OFFSET, KERNEL_START, + PDPT_START, PD_START, PML4_START, ZERO_PAGE_START, }; #[cfg(target_os = "windows")] +use super::acpi; +#[cfg(target_os = "windows")] use super::super::types::{SpecialRegisters, StandardRegisters}; +#[cfg(target_os = "windows")] +use super::params::BootParams; +#[cfg(target_os = "windows")] +use super::setup::{build_gdt, build_page_tables, configure_boot_registers, gdt_bytes}; /// Loadflags bit: kernel was loaded high (at 0x100000). #[cfg(any(target_os = "windows", test))] @@ -126,9 +128,10 @@ pub fn parse_bzimage(kernel_image: &[u8]) -> Result { /// Creates a standard memory map with: /// - Low memory (0 .. 0x9FC00) — 640KB conventional /// - Reserved (0x9FC00 .. 0x100000) — BIOS area +/// - ACPI tables (acpi_base .. acpi_base + acpi_size) /// - High memory (0x100000 .. ram_end) — main RAM #[cfg(any(target_os = "windows", test))] -fn build_e820_map(ram_mib: u32) -> Vec { +fn build_e820_map(ram_mib: u32, acpi_base: u64, acpi_size: u64) -> Vec { let ram_bytes = (ram_mib as u64) * 1024 * 1024; let mut entries = Vec::new(); @@ -149,6 +152,14 @@ fn build_e820_map(ram_mib: u32) -> Vec { _pad: 0, }); + // ACPI tables (within the BIOS reserved region). + entries.push(E820Entry { + addr: acpi_base, + size: acpi_size, + entry_type: E820_ACPI, + _pad: 0, + }); + // High memory: 1MB to end of RAM. // When RAM extends past the MMIO region, split around the reserved hole // so the kernel doesn't try to use MMIO addresses as regular RAM. @@ -282,8 +293,12 @@ pub fn load_kernel_with_initrd( boot_params.set_cmdline_ptr(CMDLINE_START as u32); boot_params.set_cmdline_size(cmdline_bytes.len() as u32); + // Write ACPI tables to guest memory. + let acpi_data = acpi::build_acpi_tables(ACPI_START); + guest_mem.write_at_addr(ACPI_START, &acpi_data)?; + // Set E820 memory map. - let e820_map = build_e820_map(ram_mib); + let e820_map = build_e820_map(ram_mib, ACPI_START, acpi::ACPI_REGION_SIZE); boot_params.set_e820_map(&e820_map); // Load initrd if provided. Place at the end of RAM (page-aligned). @@ -456,10 +471,14 @@ mod tests { ); } + // ACPI constants for test assertions. + const TEST_ACPI_BASE: u64 = 0xE0000; + const TEST_ACPI_SIZE: u64 = 0x200; + #[test] fn test_build_e820_map_256mb() { - let map = build_e820_map(256); - assert_eq!(map.len(), 3); + let map = build_e820_map(256, TEST_ACPI_BASE, TEST_ACPI_SIZE); + assert_eq!(map.len(), 4); // Low memory: 0 .. 640KB assert_eq!(map[0].addr, 0); @@ -470,24 +489,29 @@ mod tests { assert_eq!(map[1].addr, 0x9FC00); assert_eq!(map[1].entry_type, E820_RESERVED); + // ACPI tables + assert_eq!(map[2].addr, TEST_ACPI_BASE); + assert_eq!(map[2].size, TEST_ACPI_SIZE); + assert_eq!(map[2].entry_type, E820_ACPI); + // High memory: 1MB .. 256MB - assert_eq!(map[2].addr, 0x100000); - assert_eq!(map[2].size, 256 * 1024 * 1024 - 0x100000); - assert_eq!(map[2].entry_type, E820_RAM); + assert_eq!(map[3].addr, 0x100000); + assert_eq!(map[3].size, 256 * 1024 * 1024 - 0x100000); + assert_eq!(map[3].entry_type, E820_RAM); } #[test] fn test_build_e820_map_1mb_no_high_memory() { // With only 1MB of RAM, high memory region should be empty (1MB - 1MB = 0). - let map = build_e820_map(1); - assert_eq!(map.len(), 2, "1MB RAM should only have low + reserved"); + let map = build_e820_map(1, TEST_ACPI_BASE, TEST_ACPI_SIZE); + assert_eq!(map.len(), 3, "1MB RAM should only have low + reserved + ACPI"); } #[test] fn test_build_e820_map_4096mb_has_mmio_hole() { - let map = build_e820_map(4096); - // Low + BIOS reserved + high1 + MMIO reserved + high2 = 5 entries. - assert_eq!(map.len(), 5, "4GB RAM should have MMIO hole: {:?}", map); + let map = build_e820_map(4096, TEST_ACPI_BASE, TEST_ACPI_SIZE); + // Low + BIOS reserved + ACPI + high1 + MMIO reserved + high2 = 6 entries. + assert_eq!(map.len(), 6, "4GB RAM should have MMIO hole: {:?}", map); // Low memory. assert_eq!(map[0].addr, 0); @@ -497,30 +521,45 @@ mod tests { assert_eq!(map[1].addr, 0x9FC00); assert_eq!(map[1].entry_type, E820_RESERVED); + // ACPI tables. + assert_eq!(map[2].addr, TEST_ACPI_BASE); + assert_eq!(map[2].size, TEST_ACPI_SIZE); + assert_eq!(map[2].entry_type, E820_ACPI); + // High memory below MMIO. - assert_eq!(map[2].addr, 0x100000); - assert_eq!(map[2].size, VIRTIO_MMIO_BASE - 0x100000); - assert_eq!(map[2].entry_type, E820_RAM); + assert_eq!(map[3].addr, 0x100000); + assert_eq!(map[3].size, VIRTIO_MMIO_BASE - 0x100000); + assert_eq!(map[3].entry_type, E820_RAM); // MMIO reserved region. - assert_eq!(map[3].addr, VIRTIO_MMIO_BASE); - assert_eq!(map[3].size, MMIO_REGION_SIZE); - assert_eq!(map[3].entry_type, E820_RESERVED); + assert_eq!(map[4].addr, VIRTIO_MMIO_BASE); + assert_eq!(map[4].size, MMIO_REGION_SIZE); + assert_eq!(map[4].entry_type, E820_RESERVED); // High memory above MMIO. let mmio_end = VIRTIO_MMIO_BASE + MMIO_REGION_SIZE; - assert_eq!(map[4].addr, mmio_end); - assert_eq!(map[4].size, 4096 * 1024 * 1024 - mmio_end); - assert_eq!(map[4].entry_type, E820_RAM); + assert_eq!(map[5].addr, mmio_end); + assert_eq!(map[5].size, 4096 * 1024 * 1024 - mmio_end); + assert_eq!(map[5].entry_type, E820_RAM); } #[test] fn test_build_e820_map_no_hole_below_mmio() { // 3072 MB = 3GB < VIRTIO_MMIO_BASE (3.25GB) — no hole needed. - let map = build_e820_map(3072); - assert_eq!(map.len(), 3, "3GB RAM should not have MMIO hole"); - assert_eq!(map[2].addr, 0x100000); - assert_eq!(map[2].size, 3072 * 1024 * 1024 - 0x100000); - assert_eq!(map[2].entry_type, E820_RAM); + let map = build_e820_map(3072, TEST_ACPI_BASE, TEST_ACPI_SIZE); + assert_eq!(map.len(), 4, "3GB RAM should not have MMIO hole"); + assert_eq!(map[3].addr, 0x100000); + assert_eq!(map[3].size, 3072 * 1024 * 1024 - 0x100000); + assert_eq!(map[3].entry_type, E820_RAM); + } + + #[test] + fn test_build_e820_map_has_acpi_entry() { + let map = build_e820_map(256, TEST_ACPI_BASE, TEST_ACPI_SIZE); + let acpi_entry = map.iter().find(|e| e.entry_type == E820_ACPI); + assert!(acpi_entry.is_some(), "E820 map must contain ACPI entry"); + let entry = acpi_entry.unwrap(); + assert_eq!(entry.addr, TEST_ACPI_BASE); + assert_eq!(entry.size, TEST_ACPI_SIZE); } } diff --git a/src/vmm/src/windows/boot/mod.rs b/src/vmm/src/windows/boot/mod.rs index 80f348f48..d999ad054 100644 --- a/src/vmm/src/windows/boot/mod.rs +++ b/src/vmm/src/windows/boot/mod.rs @@ -1,5 +1,6 @@ //! Linux kernel boot support for x86_64 (Windows WHPX backend). +pub mod acpi; pub mod loader; pub mod params; pub mod setup; diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index 717d8ae77..60fee6430 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -8,8 +8,14 @@ pub const MMIO_SLOT_SIZE: u64 = 0x200; /// IRQ number for the first MMIO device slot. pub const FIRST_MMIO_IRQ: u8 = 5; -/// Base kernel command line parameters. +/// Base kernel command line parameters (quiet mode — fast boot). /// +/// Quiet mode suppresses serial console output and i8042 keyboard probing, +/// eliminating ~36K VM exits per boot (~26K serial + ~10K i8042). This reduces +/// WHPX boot time from ~5s to ~1-2s. +/// +/// - `quiet loglevel=1`: Suppress kernel printk to console. +/// - `i8042.nokbd i8042.noaux`: Skip PS/2 keyboard/mouse probe (10K+ exits). /// - `nohyperv`: Disable Hyper-V guest enlightenments. WHPX exposes Hyper-V /// CPUID leaves but doesn't fully support synthetic timers/SynIC, causing /// clock stalls if the kernel tries to use them. @@ -18,7 +24,13 @@ pub const FIRST_MMIO_IRQ: u8 = 5; /// - `nokaslr`: Disable kernel address space randomization for deterministic /// boot in our controlled single-vCPU environment. const BASE_CMDLINE: &str = - "console=ttyS0 earlyprintk=serial,ttyS0,115200 noapic nolapic noacpi nosmp nohyperv lpj=1000000 nokaslr"; + "console=ttyS0 quiet loglevel=1 i8042.nokbd i8042.noaux noapic nolapic nosmp nohyperv lpj=1000000 nokaslr"; + +/// Serial console parameters appended in verbose mode. +/// +/// Enables full kernel boot output on the serial console. Useful for debugging +/// but adds ~26K VM exits (~3s) due to per-byte serial I/O port access. +const VERBOSE_CONSOLE: &str = "console=ttyS0 earlyprintk=serial,ttyS0,115200"; /// Description of a virtio-MMIO device slot for command line generation. #[derive(Debug, Clone)] @@ -39,6 +51,8 @@ pub struct MmioSlot { /// - `root_disk_fstype`: Filesystem type for root device (e.g., "ext4"). /// - `exec_path`: Path to init binary (added as `init=`). /// - `exec_argv`: Arguments passed after `--` separator for the init process. +/// - `verbose`: Enable serial console output. Adds `console=ttyS0` and removes +/// `quiet`/`i8042.nokbd` for full kernel boot logging. Slower (~5s vs ~1-2s). pub fn build_kernel_cmdline( user_cmdline: Option<&str>, has_root_disk: bool, @@ -47,8 +61,17 @@ pub fn build_kernel_cmdline( root_disk_fstype: Option<&str>, exec_path: Option<&str>, exec_argv: &[String], + verbose: bool, ) -> String { - let mut cmdline = BASE_CMDLINE.to_string(); + let mut cmdline = if verbose { + // Verbose mode: serial console + full i8042 probe for debugging. + format!( + "{} noapic nolapic nosmp nohyperv lpj=1000000 nokaslr", + VERBOSE_CONSOLE + ) + } else { + BASE_CMDLINE.to_string() + }; // Root device: explicit override takes priority over default. if let Some(device) = root_disk_device { @@ -110,7 +133,7 @@ mod tests { /// Helper: build cmdline with only the legacy params (no root override, no init). fn build_simple(user: Option<&str>, has_root: bool, slots: &[MmioSlot]) -> String { - build_kernel_cmdline(user, has_root, slots, None, None, None, &[]) + build_kernel_cmdline(user, has_root, slots, None, None, None, &[], false) } #[test] @@ -119,6 +142,31 @@ mod tests { assert_eq!(cmdline, BASE_CMDLINE); } + #[test] + fn test_quiet_mode_default() { + let cmdline = build_simple(None, false, &[]); + assert!(cmdline.contains("console=ttyS0")); + assert!(cmdline.contains("quiet")); + assert!(cmdline.contains("loglevel=1")); + assert!(cmdline.contains("i8042.nokbd")); + assert!(cmdline.contains("i8042.noaux")); + assert!(!cmdline.contains("earlyprintk")); + } + + #[test] + fn test_verbose_mode() { + let cmdline = build_kernel_cmdline(None, false, &[], None, None, None, &[], true); + assert!(cmdline.contains("console=ttyS0")); + assert!(cmdline.contains("earlyprintk=serial,ttyS0,115200")); + assert!(!cmdline.contains("quiet")); + assert!(!cmdline.contains("loglevel=1")); + assert!(!cmdline.contains("i8042.nokbd")); + // Common params present in both modes. + assert!(cmdline.contains("nohyperv")); + assert!(cmdline.contains("lpj=1000000")); + assert!(cmdline.contains("nokaslr")); + } + #[test] fn test_with_root_disk() { let cmdline = build_simple(None, true, &[]); @@ -184,6 +232,28 @@ mod tests { assert!(cmdline.contains("nohyperv")); assert!(cmdline.contains("lpj=1000000")); assert!(cmdline.contains("nokaslr")); + // noacpi must NOT be present (ACPI tables are provided). + assert!(!cmdline.contains("noacpi")); + } + + #[test] + fn test_cmdline_no_noacpi() { + // Verify neither quiet nor verbose mode includes noacpi. + let quiet = build_simple(None, false, &[]); + assert!( + !quiet.contains("noacpi"), + "quiet cmdline must not contain noacpi" + ); + + let verbose = build_kernel_cmdline(None, false, &[], None, None, None, &[], true); + assert!( + !verbose.contains("noacpi"), + "verbose cmdline must not contain noacpi" + ); + + // Ensure noapic (APIC disable) is still present — it's different from noacpi. + assert!(quiet.contains("noapic")); + assert!(verbose.contains("noapic")); } #[test] @@ -216,14 +286,14 @@ mod tests { active: true, }, ]; - let cmdline = build_simple(Some("quiet"), true, &slots); + let cmdline = build_simple(Some("custom_test=1"), true, &slots); let base_pos = cmdline.find(BASE_CMDLINE).unwrap(); let root_pos = cmdline.find("root=/dev/vda").unwrap(); let mmio0_pos = cmdline.find("0xd0000000:5").unwrap(); let mmio1_pos = cmdline.find("0xd0000200:6").unwrap(); let mmio2_pos = cmdline.find("0xd0000400:7").unwrap(); - let user_pos = cmdline.find("quiet").unwrap(); + let user_pos = cmdline.find("custom_test=1").unwrap(); assert!(base_pos < root_pos); assert!(root_pos < mmio0_pos); @@ -244,6 +314,7 @@ mod tests { Some("ext4"), None, &[], + false, ); assert!(cmdline.contains("root=/dev/vdb")); assert!(cmdline.contains("rootfstype=ext4")); @@ -263,6 +334,7 @@ mod tests { Some("ext4"), None, &[], + false, ); assert!(cmdline.contains("root=/dev/vdb")); assert!(!cmdline.contains("root=/dev/vda")); @@ -270,15 +342,8 @@ mod tests { #[test] fn test_root_disk_device_without_fstype() { - let cmdline = build_kernel_cmdline( - None, - false, - &[], - Some("/dev/vdb"), - None, - None, - &[], - ); + let cmdline = + build_kernel_cmdline(None, false, &[], Some("/dev/vdb"), None, None, &[], false); assert!(cmdline.contains("root=/dev/vdb")); assert!(!cmdline.contains("rootfstype=")); assert!(cmdline.contains("rw")); @@ -294,6 +359,7 @@ mod tests { None, Some("/boxlite/bin/boxlite-guest"), &[], + false, ); assert!(cmdline.contains("init=/boxlite/bin/boxlite-guest")); } @@ -314,6 +380,7 @@ mod tests { None, Some("/boxlite/bin/boxlite-guest"), &argv, + false, ); assert!(cmdline.contains("init=/boxlite/bin/boxlite-guest")); assert!(cmdline.ends_with("-- --listen vsock://2695 --notify vsock://2696")); @@ -321,15 +388,8 @@ mod tests { #[test] fn test_no_separator_when_argv_empty() { - let cmdline = build_kernel_cmdline( - None, - false, - &[], - None, - None, - Some("/bin/init"), - &[], - ); + let cmdline = + build_kernel_cmdline(None, false, &[], None, None, Some("/bin/init"), &[], false); assert!(cmdline.contains("init=/bin/init")); assert!(!cmdline.contains("--")); } @@ -340,13 +400,16 @@ mod tests { // root=/dev/vdb rootfstype=ext4 rw init=/boxlite/bin/boxlite-guest // virtio_mmio devices, then -- let slots = vec![ - MmioSlot { index: 0, active: true }, - MmioSlot { index: 1, active: true }, - ]; - let argv = vec![ - "--listen".to_string(), - "vsock://2695".to_string(), + MmioSlot { + index: 0, + active: true, + }, + MmioSlot { + index: 1, + active: true, + }, ]; + let argv = vec!["--listen".to_string(), "vsock://2695".to_string()]; let cmdline = build_kernel_cmdline( None, true, @@ -355,6 +418,7 @@ mod tests { Some("ext4"), Some("/boxlite/bin/boxlite-guest"), &argv, + false, ); // Verify ordering: base < root < init < mmio < argv diff --git a/src/vmm/src/windows/context.rs b/src/vmm/src/windows/context.rs index f33a92481..de0e4ac1f 100644 --- a/src/vmm/src/windows/context.rs +++ b/src/vmm/src/windows/context.rs @@ -67,6 +67,12 @@ pub struct VmContext { pub root_disk_device: Option, /// Root disk filesystem type (e.g., "ext4"). pub root_disk_fstype: Option, + /// Enable verbose serial console output for debugging. + /// + /// When true, the kernel cmdline includes `console=ttyS0` for full boot + /// logging. When false (default), quiet mode suppresses serial output and + /// i8042 probing for faster boot (~1-2s vs ~5s). + pub verbose: bool, } /// Network device configuration. @@ -126,6 +132,7 @@ impl VmContext { net_config: None, root_disk_device: None, root_disk_fstype: None, + verbose: false, } } diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 05eab580f..93ec0727f 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -10,6 +10,10 @@ use std::path::Path; use std::sync::{Arc, LazyLock, Mutex}; use std::time::Instant; +use super::super::cmdline::{irq_for_slot, mmio_base_for_slot, MmioSlot, MMIO_SLOT_SIZE}; +use super::super::context::VmContext; +use super::super::error::{Result, WkrunError}; +use super::super::vcpu::IoHandler; use super::pic::Pic; use super::pit::Pit; use super::serial::{Serial, COM1_BASE}; @@ -20,10 +24,6 @@ use super::virtio::net::VirtioNet; use super::virtio::p9::Virtio9p; use super::virtio::queue::GuestMemoryAccessor; use super::virtio::vsock::VirtioVsock; -use super::super::error::{Result, WkrunError}; -use super::super::cmdline::{irq_for_slot, mmio_base_for_slot, MmioSlot, MMIO_SLOT_SIZE}; -use super::super::context::VmContext; -use super::super::vcpu::IoHandler; /// Shared console output buffer. pub type ConsoleBuffer = Arc>>; @@ -73,6 +73,12 @@ pub fn remove_console_buffer(ctx_id: u32) { /// Default guest CID for vsock (standard value for single-VM hosts). const GUEST_CID: u64 = 3; +/// ACPI PM1a event block base port (4 bytes wide). +const PM1A_EVT_BLK: u16 = 0x600; + +/// ACPI PM1a control block base port (2 bytes wide). +const PM1A_CNT_BLK: u16 = 0x604; + /// Default vsock listen ports (BoxLite: 2695=gRPC, 2696=ready signal). const DEFAULT_VSOCK_PORTS: &[u32] = &[2695, 2696]; @@ -142,6 +148,8 @@ pub struct DeviceManager { /// Linux's `pit_calibrate_tsc()` loops reading port 0x61 waiting for /// bit 5 to toggle. Without toggling, TSC calibration stalls forever. port61_toggle: bool, + /// ACPI shutdown detected (PM1a_CNT S5 sleep type written). + shutdown_requested: bool, } impl DeviceManager { @@ -264,6 +272,7 @@ impl DeviceManager { window_requested: false, last_tick: Instant::now(), port61_toggle: false, + shutdown_requested: false, }; Ok(DeviceSetup { @@ -289,6 +298,15 @@ impl DeviceManager { } else if self.pit.handles_port(port) { log::trace!("PIT write: port={:#X} data={:#X}", port, data as u8); self.pit.write_port(port, data as u8); + } else if port == PM1A_CNT_BLK { + // ACPI PM1a control register: detect S5 shutdown. + // SLP_EN = bit 13, SLP_TYP = bits 12:10. + let slp_en = (data >> 13) & 1; + let slp_typ = (data >> 10) & 0x7; + if slp_en == 1 && slp_typ == 5 { + log::info!("ACPI S5 shutdown detected (PM1a_CNT={:#X})", data); + self.shutdown_requested = true; + } } else if port == 0x70 { self.cmos_addr = (data as u8) & 0x7F; } @@ -311,6 +329,10 @@ impl DeviceManager { self.pit.read_port(port) as u32 } else if port == 0x71 { cmos_read(self.cmos_addr) as u32 + } else if (PM1A_EVT_BLK..PM1A_EVT_BLK + 4).contains(&port) { + 0x00 // PM1a event: no events pending + } else if (PM1A_CNT_BLK..PM1A_CNT_BLK + 2).contains(&port) { + 0x00 // PM1a control: clear state } else if (0xCF8..=0xCFF).contains(&port) { 0xFFFF_FFFF // PCI config: no devices. } else if port == 0x61 { @@ -321,7 +343,11 @@ impl DeviceManager { // loop that stalls kernel boot. Toggling on each read lets the // calibration complete. self.port61_toggle = !self.port61_toggle; - if self.port61_toggle { 0x20 } else { 0x00 } + if self.port61_toggle { + 0x20 + } else { + 0x00 + } } else if port == 0x92 { 0x02 // System control port A: A20 enabled. } else if port == 0x60 || port == 0x64 { @@ -472,10 +498,9 @@ impl DeviceManager { e )) })?; - let transport = - super::virtio::net::UnixStreamTransport::new(stream).map_err(|e| { - WkrunError::Device(format!("failed to configure net socket: {}", e)) - })?; + let transport = super::virtio::net::UnixStreamTransport::new(stream).map_err(|e| { + WkrunError::Device(format!("failed to configure net socket: {}", e)) + })?; Ok(Some(Box::new(transport))) } #[cfg(not(unix))] @@ -484,14 +509,18 @@ impl DeviceManager { let stream = std::net::TcpStream::connect(addr.as_ref()).map_err(|e| { WkrunError::Device(format!("failed to connect to net proxy '{}': {}", addr, e)) })?; - let transport = - super::virtio::net::TcpTransport::new(stream).map_err(|e| { - WkrunError::Device(format!("failed to configure net socket: {}", e)) - })?; + let transport = super::virtio::net::TcpTransport::new(stream).map_err(|e| { + WkrunError::Device(format!("failed to configure net socket: {}", e)) + })?; Ok(Some(Box::new(transport))) } } + /// Whether an ACPI S5 shutdown was detected. + pub fn shutdown_requested(&self) -> bool { + self.shutdown_requested + } + /// Whether an interrupt window has been requested. pub fn window_requested(&self) -> bool { self.window_requested @@ -519,6 +548,7 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { window_requested: false, last_tick: Instant::now(), port61_toggle: false, + shutdown_requested: false, } } @@ -775,4 +805,39 @@ mod tests { let data = dm.handle_mmio_read(mmio_base_for_slot(3), 4); assert_eq!(data, 0); } + + #[test] + fn test_acpi_shutdown_not_requested_initially() { + let dm = make_test_devices(); + assert!(!dm.shutdown_requested()); + } + + #[test] + fn test_acpi_s5_shutdown_detected() { + let mut dm = make_test_devices(); + // Write SLP_TYP=5, SLP_EN=1 → bits 12:10 = 0b101, bit 13 = 1. + // Value = (1 << 13) | (5 << 10) = 0x2000 | 0x1400 = 0x3400. + dm.handle_io_out(0x604, 2, 0x3400); + assert!(dm.shutdown_requested()); + } + + #[test] + fn test_acpi_non_s5_write_ignored() { + let mut dm = make_test_devices(); + // SLP_EN=1, SLP_TYP=0 → not S5. + dm.handle_io_out(0x604, 2, 0x2000); + assert!(!dm.shutdown_requested()); + } + + #[test] + fn test_acpi_pm1a_evt_read_zero() { + let mut dm = make_test_devices(); + assert_eq!(dm.handle_io_in(0x600, 4), 0x00); + } + + #[test] + fn test_acpi_pm1a_cnt_read_zero() { + let mut dm = make_test_devices(); + assert_eq!(dm.handle_io_in(0x604, 2), 0x00); + } } diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index a50293e4e..0023f7075 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -15,11 +15,11 @@ mod imp { use std::time::{Duration, Instant}; use super::super::boot::loader::load_kernel_with_initrd; - use super::super::devices::virtio::queue::GuestMemoryAccessor; - use super::super::error::{Result, WkrunError}; use super::super::cmdline::build_kernel_cmdline; use super::super::context::VmContext; use super::super::devices::manager::{self as devices, DeviceManager}; + use super::super::devices::virtio::queue::GuestMemoryAccessor; + use super::super::error::{Result, WkrunError}; use super::super::memory::GuestMemory; use super::super::types::VcpuExit; use super::super::vcpu::VcpuRunConfig; @@ -40,12 +40,12 @@ mod imp { /// Maximum vCPU exits before giving up. const MAX_EXITS: u64 = 500_000_000; - /// Maximum consecutive HLT instructions before giving up. + /// Maximum consecutive HLT instructions before assuming shutdown. /// - /// When the guest executes `poweroff -f` on WHPX (no ACPI), the kernel - /// enters an HLT loop. With the 1ms timer tick, this translates to - /// ~5 seconds of wall-clock time before we detect the shutdown. - const MAX_HALTS: u64 = 5_000; + /// With ACPI tables, `poweroff` is detected instantly via PM1a_CNT. + /// MAX_HALTS is a safety fallback for non-ACPI shutdown paths. + /// At 1ms per tick, 50 = ~50ms timeout. + const MAX_HALTS: u64 = 50; /// Handle for a running VM, stored in `RUNNING_VMS`. struct VmHandle { @@ -72,33 +72,51 @@ mod imp { // PML4 entry let mut buf = [0u8; 8]; - guest_mem.read_at_addr(pml4_base + (pml4_idx as u64) * 8, &mut buf).ok()?; + guest_mem + .read_at_addr(pml4_base + (pml4_idx as u64) * 8, &mut buf) + .ok()?; let pml4e = u64::from_le_bytes(buf); - if pml4e & 1 == 0 { return None; } // not present + if pml4e & 1 == 0 { + return None; + } // not present // PDPT entry let pdpt_base = pml4e & 0x000F_FFFF_FFFF_F000; - guest_mem.read_at_addr(pdpt_base + (pdpt_idx as u64) * 8, &mut buf).ok()?; + guest_mem + .read_at_addr(pdpt_base + (pdpt_idx as u64) * 8, &mut buf) + .ok()?; let pdpte = u64::from_le_bytes(buf); - if pdpte & 1 == 0 { return None; } - if pdpte & 0x80 != 0 { // 1GB page + if pdpte & 1 == 0 { + return None; + } + if pdpte & 0x80 != 0 { + // 1GB page return Some((pdpte & 0x000F_FFFF_C000_0000) | (gva & 0x3FFF_FFFF)); } // PD entry let pd_base = pdpte & 0x000F_FFFF_FFFF_F000; - guest_mem.read_at_addr(pd_base + (pd_idx as u64) * 8, &mut buf).ok()?; + guest_mem + .read_at_addr(pd_base + (pd_idx as u64) * 8, &mut buf) + .ok()?; let pde = u64::from_le_bytes(buf); - if pde & 1 == 0 { return None; } - if pde & 0x80 != 0 { // 2MB page + if pde & 1 == 0 { + return None; + } + if pde & 0x80 != 0 { + // 2MB page return Some((pde & 0x000F_FFFF_FFE0_0000) | (gva & 0x1F_FFFF)); } // PT entry let pt_base = pde & 0x000F_FFFF_FFFF_F000; - guest_mem.read_at_addr(pt_base + (pt_idx as u64) * 8, &mut buf).ok()?; + guest_mem + .read_at_addr(pt_base + (pt_idx as u64) * 8, &mut buf) + .ok()?; let pte = u64::from_le_bytes(buf); - if pte & 1 == 0 { return None; } + if pte & 1 == 0 { + return None; + } Some((pte & 0x000F_FFFF_FFFF_F000) | offset) } @@ -168,6 +186,7 @@ mod imp { ctx.root_disk_fstype.as_deref(), ctx.exec_path.as_deref(), &ctx.argv, + ctx.verbose, ); // Load kernel. @@ -256,6 +275,11 @@ mod imp { halt_count = 0; *io_write_counts.entry(port).or_insert(0) += 1; devices.handle_io_out(port, size, data); + if devices.shutdown_requested() { + log::info!("ACPI shutdown detected after {} exits", exit_count); + exit_code = 0; + break; + } vcpu.skip_instruction()?; } VcpuExit::IoIn { port, size } => { @@ -323,12 +347,14 @@ mod imp { // Sort IO ports by frequency (descending), show all. let mut reads: Vec<_> = io_read_counts.iter().collect(); reads.sort_by(|a, b| b.1.cmp(a.1)); - let top_reads: Vec = reads.iter() + let top_reads: Vec = reads + .iter() .map(|(p, c)| format!("{:#X}:{}", p, c)) .collect(); let mut writes: Vec<_> = io_write_counts.iter().collect(); writes.sort_by(|a, b| b.1.cmp(a.1)); - let top_writes: Vec = writes.iter() + let top_writes: Vec = writes + .iter() .map(|(p, c)| format!("{:#X}:{}", p, c)) .collect(); log::info!( @@ -357,7 +383,12 @@ mod imp { } } } - VcpuExit::MsrAccess { msr_number, is_write, rax, rdx } => { + VcpuExit::MsrAccess { + msr_number, + is_write, + rax, + rdx, + } => { halt_count = 0; msr_count += 1; if is_write { @@ -389,12 +420,22 @@ mod imp { // doesn't fully support. let (out_rax, out_rbx, out_rcx, out_rdx) = match leaf { // Leaf 1: clear "hypervisor present" bit (ECX bit 31). - 1 => (default_rax, default_rbx, default_rcx & !(1u64 << 31), default_rdx), + 1 => ( + default_rax, + default_rbx, + default_rcx & !(1u64 << 31), + default_rdx, + ), // Hyper-V CPUID range: return zeros (no hypervisor features). 0x40000000..=0x400000FF => (0, 0, 0, 0), _ => (default_rax, default_rbx, default_rcx, default_rdx), }; - log::trace!("CPUID leaf=0x{:X} sub=0x{:X} -> rax=0x{:X}", rax, rcx, out_rax); + log::trace!( + "CPUID leaf=0x{:X} sub=0x{:X} -> rax=0x{:X}", + rax, + rcx, + out_rax + ); vcpu.complete_cpuid(out_rax, out_rbx, out_rcx, out_rdx)?; } VcpuExit::UnrecoverableException => { @@ -568,9 +609,9 @@ pub fn stop(_ctx_id: u32) -> super::error::Result<()> { #[cfg(test)] mod tests { - use super::*; use super::super::context::VmContext; use super::super::vcpu::VcpuRunConfig; + use super::*; use std::sync::{Arc, Mutex}; #[test] From 9bec914dfbcd19e3866cb7f53e249d0e6b8361d6 Mon Sep 17 00:00:00 2001 From: lile Date: Wed, 22 Apr 2026 11:58:03 +0800 Subject: [PATCH 13/33] feat(vmm/windows): vsock host-initiated connections and flexible port routing Add host-to-guest connection initiation for vsock: - VsockHeader::new_request() for REQUEST packets from host to guest - VsockConnection::initiate_connect() state transition (Idle -> Connecting) - poll_tcp_listeners() accepts incoming TCP, sends REQUEST to guest - Skip TCP reads on non-Connected streams during handshake Improve vsock port routing in DeviceManager: - Support host_path as "host:port" format (from krun_add_vsock_port2 API) - Fallback chain: explicit host_tcp_port > host_path > vsock port number Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/manager.rs | 27 +- .../devices/virtio/vsock/connection.rs | 70 ++++ .../src/windows/devices/virtio/vsock/mod.rs | 315 ++++++++++++++++++ .../windows/devices/virtio/vsock/packet.rs | 37 ++ 4 files changed, 445 insertions(+), 4 deletions(-) diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 93ec0727f..fa428eb27 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -209,12 +209,31 @@ impl DeviceManager { } } else { for vp in &ctx.vsock_ports { - let host_port = vp.host_tcp_port.unwrap_or(vp.port as u16); + // Resolve the host TCP address from either: + // 1. Explicit host_tcp_port (set by boot_kernel CLI) + // 2. host_path as "host:port" string (set by krun_add_vsock_port2 API) + // 3. Fallback: vsock port number as TCP port + let host_addr = if let Some(tcp_port) = vp.host_tcp_port { + format!("127.0.0.1:{}", tcp_port) + } else { + let path_str = vp.host_path.to_string_lossy(); + if path_str.contains(':') { + // host_path is "host:port" format (e.g., "127.0.0.1:55008") + path_str.to_string() + } else { + format!("127.0.0.1:{}", vp.port) + } + }; if vp.listen { - let _ = vsock_backend.listen_on(vp.port, host_port); + // Parse port from host_addr for listen_on + let port = host_addr + .rsplit(':') + .next() + .and_then(|s| s.parse::().ok()) + .unwrap_or(vp.port as u16); + let _ = vsock_backend.listen_on(vp.port, port); } else { - let addr = format!("127.0.0.1:{}", host_port); - vsock_backend.connect_to(vp.port, addr); + vsock_backend.connect_to(vp.port, host_addr); } } } diff --git a/src/vmm/src/windows/devices/virtio/vsock/connection.rs b/src/vmm/src/windows/devices/virtio/vsock/connection.rs index 4b97eee34..15673b714 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/connection.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/connection.rs @@ -115,6 +115,25 @@ impl VsockConnection { self.credit_update_needed = false; } + /// Initiate a host-to-guest connection (host-initiated). + /// + /// Transitions Idle -> Connecting and returns a REQUEST header to send + /// to the guest via the RX queue. Returns None if not in Idle state. + pub fn initiate_connect(&mut self) -> Option { + if self.state != ConnState::Idle { + return None; + } + self.state = ConnState::Connecting; + Some(VsockHeader::new_request( + self.local_cid, + self.local_port, + self.peer_cid, + self.peer_port, + self.buf_alloc, + self.fwd_cnt, + )) + } + /// Handle a REQUEST from the guest. /// /// Transitions Idle -> Connected and returns a RESPONSE header. @@ -373,6 +392,57 @@ mod tests { assert_eq!(r.buf_alloc, DEFAULT_BUF_ALLOC); } + // --- Host-initiated connection --- + + #[test] + fn test_initiate_connect_transitions_to_connecting() { + let mut conn = guest_conn(); + let req = conn.initiate_connect(); + assert!(req.is_some()); + assert_eq!(conn.state(), ConnState::Connecting); + + let r = req.unwrap(); + assert_eq!(r.op, VSOCK_OP_REQUEST); + assert_eq!(r.src_cid, 2); // HOST + assert_eq!(r.dst_cid, 3); // guest + assert_eq!(r.src_port, 2695); // local_port + assert_eq!(r.dst_port, 5000); // peer_port + assert_eq!(r.buf_alloc, DEFAULT_BUF_ALLOC); + } + + #[test] + fn test_initiate_connect_on_non_idle_returns_none() { + let mut conn = guest_conn(); + conn.initiate_connect(); + // Second call should fail (already Connecting). + assert!(conn.initiate_connect().is_none()); + } + + #[test] + fn test_response_transitions_connecting_to_connected() { + let mut conn = guest_conn(); + conn.initiate_connect(); + assert_eq!(conn.state(), ConnState::Connecting); + + let resp = VsockHeader { + src_cid: 3, + dst_cid: 2, + src_port: 5000, + dst_port: 2695, + len: 0, + type_: 1, + op: VSOCK_OP_RESPONSE, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + let (hdr, data) = conn.dispatch(&resp, &[]); + assert!(hdr.is_none()); // No response to a RESPONSE + assert!(data.is_none()); + assert_eq!(conn.state(), ConnState::Connected); + assert_eq!(conn.peer_credit(), 32768); + } + #[test] fn test_request_on_non_idle_returns_none() { let mut conn = guest_conn(); diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index 9b1eba5fb..10b06b0a2 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -41,6 +41,9 @@ const QUEUE_MAX_SIZE: u16 = 128; /// Connection key: (guest_port, host_port). type ConnKey = (u32, u32); +/// Starting ephemeral port for host-initiated vsock connections. +const EPHEMERAL_PORT_START: u32 = 49152; + /// Virtio-vsock device with TCP host-side bridge. pub struct VirtioVsock { /// Guest CID (typically 3 for the first guest). @@ -59,6 +62,8 @@ pub struct VirtioVsock { streams: HashMap, /// Pending response/control packets to inject into the RX queue. rx_pending: Vec<(VsockHeader, Vec)>, + /// Next ephemeral port for host-initiated connections. + next_host_port: u32, } impl VirtioVsock { @@ -71,6 +76,7 @@ impl VirtioVsock { connect_targets: HashMap::new(), streams: HashMap::new(), rx_pending: Vec::new(), + next_host_port: EPHEMERAL_PORT_START, } } @@ -303,12 +309,69 @@ impl VirtioVsock { } } + /// Allocate the next ephemeral host port for host-initiated connections. + fn alloc_host_port(&mut self) -> u32 { + let port = self.next_host_port; + self.next_host_port = self.next_host_port.wrapping_add(1); + if self.next_host_port < EPHEMERAL_PORT_START { + self.next_host_port = EPHEMERAL_PORT_START; + } + port + } + + /// Poll TCP listeners for pending connections and initiate vsock handshakes. + /// + /// When a host TCP client connects to a listener, this method: + /// 1. Accepts the TCP connection + /// 2. Allocates an ephemeral host port for the vsock side + /// 3. Creates a VsockConnection in Connecting state + /// 4. Generates a REQUEST packet to send to the guest via RX queue + /// 5. Stores the TCP stream (data is NOT read until Connected) + fn poll_tcp_listeners(&mut self) { + let vsock_ports: Vec = self.listeners.keys().copied().collect(); + + for vsock_port in vsock_ports { + let stream = if let Some(listener) = self.listeners.get(&vsock_port) { + match listener.accept() { + Ok((stream, _addr)) => { + let _ = stream.set_nonblocking(true); + stream + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => continue, + Err(_) => continue, + } + } else { + continue; + }; + + let host_port = self.alloc_host_port(); + let key = (vsock_port, host_port); + + let mut conn = + VsockConnection::new(VSOCK_CID_HOST, host_port, self.guest_cid, vsock_port); + + if let Some(req) = conn.initiate_connect() { + self.rx_pending.push((req, Vec::new())); + self.connections.insert(key, conn); + self.streams.insert(key, stream); + } + } + } + /// Poll TCP streams for incoming data and queue it for RX injection. fn poll_tcp_streams(&mut self) { // Collect keys first to avoid borrow issues. let keys: Vec = self.streams.keys().copied().collect(); for key in keys { + // Skip streams whose vsock connection is still handshaking. + // TCP data stays in the kernel receive buffer until Connected. + if let Some(conn) = self.connections.get(&key) { + if conn.state() != ConnState::Connected { + continue; + } + } + let mut buf = [0u8; 4096]; let data = if let Some(stream) = self.streams.get_mut(&key) { match stream.read(&mut buf) { @@ -471,6 +534,9 @@ impl VirtioDeviceBackend for VirtioVsock { } fn poll(&mut self, queues: &mut [Virtqueue], mem: &dyn GuestMemoryAccessor) -> bool { + // Accept new TCP connections and initiate vsock handshakes. + self.poll_tcp_listeners(); + // Poll TCP streams for incoming data. self.poll_tcp_streams(); @@ -1414,4 +1480,253 @@ mod tests { assert_eq!(dev.connection_count(), 1); assert_eq!(dev.streams.len(), 1); } + + // --- Host-initiated connections (poll_tcp_listeners) --- + + #[test] + fn test_poll_tcp_listeners_accepts_and_sends_request() { + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); + let port = dev + .listeners + .values() + .next() + .unwrap() + .local_addr() + .unwrap() + .port() as u32; + let listener = dev.listeners.remove(&0).unwrap(); + dev.listeners.insert(port, listener); + + // Host TCP client connects BEFORE any guest action. + let _client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + // poll_tcp_listeners should accept and generate a REQUEST. + dev.poll_tcp_listeners(); + + assert_eq!(dev.rx_pending.len(), 1); + assert_eq!(dev.rx_pending[0].0.op, VSOCK_OP_REQUEST); + assert_eq!(dev.rx_pending[0].0.src_cid, VSOCK_CID_HOST); + assert_eq!(dev.rx_pending[0].0.dst_cid, 3); // guest CID + assert_eq!(dev.rx_pending[0].0.dst_port, port); // guest vsock port + assert!(dev.rx_pending[0].0.src_port >= EPHEMERAL_PORT_START); + assert_eq!(dev.connection_count(), 1); + assert_eq!(dev.streams.len(), 1); + } + + #[test] + fn test_poll_tcp_listeners_no_pending_is_noop() { + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); + + // No TCP client connected. + dev.poll_tcp_listeners(); + + assert!(dev.rx_pending.is_empty()); + assert_eq!(dev.connection_count(), 0); + } + + #[test] + fn test_host_initiated_full_lifecycle() { + use std::io::Write as IoWrite; + + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); + let port = dev + .listeners + .values() + .next() + .unwrap() + .local_addr() + .unwrap() + .port() as u32; + let listener = dev.listeners.remove(&0).unwrap(); + dev.listeners.insert(port, listener); + + // Step 1: Host TCP client connects. + let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Step 2: VMM accepts and sends REQUEST to guest. + dev.poll_tcp_listeners(); + assert_eq!(dev.rx_pending.len(), 1); + let req = &dev.rx_pending[0].0; + assert_eq!(req.op, VSOCK_OP_REQUEST); + let host_ephemeral = req.src_port; + let key = (port, host_ephemeral); + dev.rx_pending.clear(); + + // Step 3: Guest sends RESPONSE (simulated via handle_guest_packet). + let resp = VsockHeader { + src_cid: 3, + dst_cid: VSOCK_CID_HOST, + src_port: port, + dst_port: host_ephemeral, + len: 0, + type_: 1, + op: packet::VSOCK_OP_RESPONSE, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + dev.handle_guest_packet(&resp, &[]); + + // Connection should now be Connected. + assert_eq!( + dev.connections.get(&key).unwrap().state(), + ConnState::Connected + ); + + // Step 4: Host sends data via TCP -> forwarded to guest via vsock. + client.write_all(b"hello from host").unwrap(); + client.flush().unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + dev.poll_tcp_streams(); + let conn = dev.connections.get(&key).unwrap(); + assert!(conn.tx_buf_len() > 0); + } + + #[test] + fn test_host_initiated_skips_data_during_handshake() { + use std::io::Write as IoWrite; + + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); + let port = dev + .listeners + .values() + .next() + .unwrap() + .local_addr() + .unwrap() + .port() as u32; + let listener = dev.listeners.remove(&0).unwrap(); + dev.listeners.insert(port, listener); + + // Host connects and sends data before handshake completes. + let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + dev.poll_tcp_listeners(); + let host_ephemeral = dev.rx_pending[0].0.src_port; + let key = (port, host_ephemeral); + dev.rx_pending.clear(); + + // Connection is in Connecting state. + assert_eq!( + dev.connections.get(&key).unwrap().state(), + ConnState::Connecting + ); + + // Host sends data while still Connecting. + client.write_all(b"premature data").unwrap(); + client.flush().unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + // poll_tcp_streams should SKIP this stream (not Connected yet). + dev.poll_tcp_streams(); + // Data stays in kernel buffer, connection tx_buf is empty. + assert_eq!(dev.connections.get(&key).unwrap().tx_buf_len(), 0); + + // Now complete the handshake. + let resp = VsockHeader { + src_cid: 3, + dst_cid: VSOCK_CID_HOST, + src_port: port, + dst_port: host_ephemeral, + len: 0, + type_: 1, + op: packet::VSOCK_OP_RESPONSE, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + dev.handle_guest_packet(&resp, &[]); + assert_eq!( + dev.connections.get(&key).unwrap().state(), + ConnState::Connected + ); + + // NOW poll_tcp_streams reads the data. + dev.poll_tcp_streams(); + assert!(dev.connections.get(&key).unwrap().tx_buf_len() > 0); + } + + #[test] + fn test_ephemeral_port_allocation() { + let mut dev = VirtioVsock::new(3); + let p1 = dev.alloc_host_port(); + let p2 = dev.alloc_host_port(); + let p3 = dev.alloc_host_port(); + assert_eq!(p1, EPHEMERAL_PORT_START); + assert_eq!(p2, EPHEMERAL_PORT_START + 1); + assert_eq!(p3, EPHEMERAL_PORT_START + 2); + } + + #[test] + fn test_host_initiated_guest_data_to_host_tcp() { + use std::io::Read as IoRead; + + let mut dev = VirtioVsock::new(3); + dev.listen(0).unwrap(); + let port = dev + .listeners + .values() + .next() + .unwrap() + .local_addr() + .unwrap() + .port() as u32; + let listener = dev.listeners.remove(&0).unwrap(); + dev.listeners.insert(port, listener); + + // Host connects. + let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + client.set_nonblocking(true).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + // Accept + REQUEST. + dev.poll_tcp_listeners(); + let host_ephemeral = dev.rx_pending[0].0.src_port; + let key = (port, host_ephemeral); + dev.rx_pending.clear(); + + // Guest RESPONSE. + let resp = VsockHeader { + src_cid: 3, + dst_cid: VSOCK_CID_HOST, + src_port: port, + dst_port: host_ephemeral, + len: 0, + type_: 1, + op: packet::VSOCK_OP_RESPONSE, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + dev.handle_guest_packet(&resp, &[]); + + // Guest sends data (RW) → should be forwarded to host TCP stream. + let rw_hdr = VsockHeader { + src_cid: 3, + dst_cid: VSOCK_CID_HOST, + src_port: port, + dst_port: host_ephemeral, + len: 11, + type_: 1, + op: packet::VSOCK_OP_RW, + flags: 0, + buf_alloc: 32768, + fwd_cnt: 0, + }; + dev.handle_guest_packet(&rw_hdr, b"hello guest"); + + // Read from TCP client. + std::thread::sleep(std::time::Duration::from_millis(50)); + let mut buf = [0u8; 128]; + let n = client.read(&mut buf).unwrap(); + assert_eq!(&buf[..n], b"hello guest"); + } } diff --git a/src/vmm/src/windows/devices/virtio/vsock/packet.rs b/src/vmm/src/windows/devices/virtio/vsock/packet.rs index 740450aeb..4913612cd 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/packet.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/packet.rs @@ -119,6 +119,29 @@ impl VsockHeader { buf } + /// Create a REQUEST header (host -> guest) for a host-initiated connection. + pub fn new_request( + src_cid: u64, + src_port: u32, + dst_cid: u64, + dst_port: u32, + buf_alloc: u32, + fwd_cnt: u32, + ) -> Self { + VsockHeader { + src_cid, + dst_cid, + src_port, + dst_port, + len: 0, + type_: VIRTIO_VSOCK_TYPE_STREAM, + op: VSOCK_OP_REQUEST, + flags: 0, + buf_alloc, + fwd_cnt, + } + } + /// Create a RESPONSE header (host -> guest) for a given REQUEST. pub fn new_response( src_cid: u64, @@ -370,6 +393,20 @@ mod tests { assert_eq!(read_back, hdr); } + #[test] + fn test_new_request() { + let hdr = VsockHeader::new_request(2, 49152, 3, 2695, 65536, 0); + assert_eq!(hdr.src_cid, 2); + assert_eq!(hdr.dst_cid, 3); + assert_eq!(hdr.src_port, 49152); + assert_eq!(hdr.dst_port, 2695); + assert_eq!(hdr.len, 0); + assert_eq!(hdr.type_, VIRTIO_VSOCK_TYPE_STREAM); + assert_eq!(hdr.op, VSOCK_OP_REQUEST); + assert_eq!(hdr.buf_alloc, 65536); + assert_eq!(hdr.fwd_cnt, 0); + } + #[test] fn test_new_response() { let hdr = VsockHeader::new_response(2, 2695, 3, 5000, 65536, 0); From 66b6f41145eabc2ceac84bb738a6941175563c6f Mon Sep 17 00:00:00 2001 From: lile Date: Wed, 22 Apr 2026 11:58:09 +0800 Subject: [PATCH 14/33] style(vmm): apply rustfmt to ACPI tables and boot loader Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/boot/acpi.rs | 21 +++++++++------------ src/vmm/src/windows/boot/loader.rs | 10 +++++++--- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/vmm/src/windows/boot/acpi.rs b/src/vmm/src/windows/boot/acpi.rs index a743935c2..cee46ccbd 100644 --- a/src/vmm/src/windows/boot/acpi.rs +++ b/src/vmm/src/windows/boot/acpi.rs @@ -57,7 +57,7 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { // ---- RSDP (20 bytes at offset 0x00) ---- let rsdp = &mut region[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE]; rsdp[0..8].copy_from_slice(b"RSD PTR "); // Signature - // rsdp[8] = checksum (computed below) + // rsdp[8] = checksum (computed below) rsdp[9..15].copy_from_slice(b"BOXLTE"); // OEMID rsdp[15] = 0; // Revision: ACPI 1.0 rsdp[16..20].copy_from_slice(&(rsdt_addr as u32).to_le_bytes()); // RsdtAddress @@ -68,13 +68,13 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { rsdt[0..4].copy_from_slice(b"RSDT"); // Signature rsdt[4..8].copy_from_slice(&(RSDT_SIZE as u32).to_le_bytes()); // Length rsdt[8] = 1; // Revision - // rsdt[9] = checksum (computed below) + // rsdt[9] = checksum (computed below) rsdt[10..16].copy_from_slice(b"BOXLTE"); // OEMID rsdt[16..24].copy_from_slice(b"BOXLITEV"); // OEM Table ID rsdt[24..28].copy_from_slice(&1u32.to_le_bytes()); // OEM Revision rsdt[28..32].copy_from_slice(b"BXLT"); // Creator ID rsdt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision - // Entry[0]: pointer to FADT + // Entry[0]: pointer to FADT rsdt[36..40].copy_from_slice(&(fadt_addr as u32).to_le_bytes()); acpi_checksum(&mut region[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE], 9); @@ -83,14 +83,14 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { fadt[0..4].copy_from_slice(b"FACP"); // Signature (NOT "FADT") fadt[4..8].copy_from_slice(&(FADT_SIZE as u32).to_le_bytes()); // Length fadt[8] = 1; // Revision - // fadt[9] = checksum (computed below) + // fadt[9] = checksum (computed below) fadt[10..16].copy_from_slice(b"BOXLTE"); // OEMID fadt[16..24].copy_from_slice(b"BOXLITEV"); // OEM Table ID fadt[24..28].copy_from_slice(&1u32.to_le_bytes()); // OEM Revision fadt[28..32].copy_from_slice(b"BXLT"); // Creator ID fadt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision - // FACS pointer (offset 36) — 0, not needed for shutdown. - // DSDT pointer (offset 40). + // FACS pointer (offset 36) — 0, not needed for shutdown. + // DSDT pointer (offset 40). fadt[40..44].copy_from_slice(&(dsdt_addr as u32).to_le_bytes()); // SCI_INT (offset 46) — interrupt for ACPI System Control. fadt[46..48].copy_from_slice(&SCI_INT.to_le_bytes()); @@ -110,18 +110,15 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { dsdt[0..4].copy_from_slice(b"DSDT"); // Signature dsdt[4..8].copy_from_slice(&(dsdt_size as u32).to_le_bytes()); // Length dsdt[8] = 1; // Revision - // dsdt[9] = checksum (computed below) + // dsdt[9] = checksum (computed below) dsdt[10..16].copy_from_slice(b"BOXLTE"); // OEMID dsdt[16..24].copy_from_slice(b"BOXLITEV"); // OEM Table ID dsdt[24..28].copy_from_slice(&1u32.to_le_bytes()); // OEM Revision dsdt[28..32].copy_from_slice(b"BXLT"); // Creator ID dsdt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision - // AML body: \_S5_ package. + // AML body: \_S5_ package. dsdt[DSDT_HEADER_SIZE..DSDT_HEADER_SIZE + S5_AML.len()].copy_from_slice(S5_AML); - acpi_checksum( - &mut region[DSDT_OFFSET..DSDT_OFFSET + dsdt_size], - 9, - ); + acpi_checksum(&mut region[DSDT_OFFSET..DSDT_OFFSET + dsdt_size], 9); region } diff --git a/src/vmm/src/windows/boot/loader.rs b/src/vmm/src/windows/boot/loader.rs index cf213ea12..b0da1dc1b 100644 --- a/src/vmm/src/windows/boot/loader.rs +++ b/src/vmm/src/windows/boot/loader.rs @@ -18,10 +18,10 @@ use super::super::memory::{ PDPT_START, PD_START, PML4_START, ZERO_PAGE_START, }; #[cfg(target_os = "windows")] -use super::acpi; -#[cfg(target_os = "windows")] use super::super::types::{SpecialRegisters, StandardRegisters}; #[cfg(target_os = "windows")] +use super::acpi; +#[cfg(target_os = "windows")] use super::params::BootParams; #[cfg(target_os = "windows")] use super::setup::{build_gdt, build_page_tables, configure_boot_registers, gdt_bytes}; @@ -504,7 +504,11 @@ mod tests { fn test_build_e820_map_1mb_no_high_memory() { // With only 1MB of RAM, high memory region should be empty (1MB - 1MB = 0). let map = build_e820_map(1, TEST_ACPI_BASE, TEST_ACPI_SIZE); - assert_eq!(map.len(), 3, "1MB RAM should only have low + reserved + ACPI"); + assert_eq!( + map.len(), + 3, + "1MB RAM should only have low + reserved + ACPI" + ); } #[test] From 3b174eeee7af2dac481f364f32c3ddd3b19c692f Mon Sep 17 00:00:00 2001 From: lile Date: Fri, 24 Apr 2026 23:33:51 +0800 Subject: [PATCH 15/33] =?UTF-8?q?fix(windows):=20WHPX=20reliability=20?= =?UTF-8?q?=E2=80=94=20PIC=20priority=20masking,=20HLT=20clear=5Fhalt,=20d?= =?UTF-8?q?iagnostics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix three root causes of WHPX flakiness, achieving 100% pass rate on both Win10 and Win11 (previously ~40%): PIC 8259A priority masking (40% → 80%): - Fix pending_irq() to implement proper priority blocking: when an IRQ is in-service, all equal-or-lower-priority IRQs are blocked - Previously only blocked re-delivery of the same IRQ (irr & !imr & !isr) - Root cause: PIT (IRQ 0) and vsock (IRQ 6) both stuck in-service (ISR=0x41), causing kernel handler deadlock HLT clear_halt — QEMU-style active wake (90% → 100%): - On HLT exit, poll devices before sleeping; if PIC has pending interrupt, clear halt suspend via WHvRegisterInternalActivityState - Prevents lost wakeups where guest sleeps through pending vsock data - Matches QEMU's whpx-all.c HLT handling strategy Safety and diagnostics: - Increase MAX_HALTS from 50 to 50,000 (guest HLTs normally in idle loop) - Add vcpu.run() error handling with exit reason tracking - Add MMIO decode and HRESULT error context logging - Add vsock TCP nodelay, write retry loop, connection state logging - Add PIC master_state() diagnostic accessor - Track and report exit reason (ACPI_SHUTDOWN, HALT_STOP_REQUESTED, etc.) Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/cmdline.rs | 3 + src/vmm/src/windows/devices/manager.rs | 1 + src/vmm/src/windows/devices/pic.rs | 130 ++++++++++++++++-- src/vmm/src/windows/devices/virtio/mmio.rs | 1 + .../devices/virtio/vsock/connection.rs | 19 +++ .../src/windows/devices/virtio/vsock/mod.rs | 99 +++++++++++-- src/vmm/src/windows/runner.rs | 122 ++++++++++------ src/vmm/src/windows/whpx.rs | 60 +++++++- 8 files changed, 361 insertions(+), 74 deletions(-) diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index 60fee6430..fe5c00a94 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -14,8 +14,11 @@ pub const FIRST_MMIO_IRQ: u8 = 5; /// eliminating ~36K VM exits per boot (~26K serial + ~10K i8042). This reduces /// WHPX boot time from ~5s to ~1-2s. /// +/// - `console=ttyS0`: Route kernel console to serial port (required — no VGA). /// - `quiet loglevel=1`: Suppress kernel printk to console. /// - `i8042.nokbd i8042.noaux`: Skip PS/2 keyboard/mouse probe (10K+ exits). +/// - `noapic nolapic nosmp`: Use legacy PIC mode (no APIC/SMP). Required for +/// our 8259 PIC emulation. /// - `nohyperv`: Disable Hyper-V guest enlightenments. WHPX exposes Hyper-V /// CPUID leaves but doesn't fully support synthetic timers/SynIC, causing /// clock stalls if the kernel tries to use them. diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index fa428eb27..d5483f600 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -490,6 +490,7 @@ impl DeviceManager { // Poll vsock for host-initiated data. if self.virtio_vsock.poll(mem) { + log::debug!("vsock poll raised IRQ {}", irq_for_slot(1)); self.pic.raise_irq(irq_for_slot(1)); } diff --git a/src/vmm/src/windows/devices/pic.rs b/src/vmm/src/windows/devices/pic.rs index 68f57db53..b0b6ce0f7 100644 --- a/src/vmm/src/windows/devices/pic.rs +++ b/src/vmm/src/windows/devices/pic.rs @@ -164,14 +164,31 @@ impl PicChip { self.irr &= !(1 << (irq & 7)); } - /// Get the highest-priority pending (unmasked, not in-service) IRQ, if any. + /// Get the highest-priority pending (unmasked, deliverable) IRQ, if any. + /// + /// Implements proper 8259A priority masking: when an interrupt is + /// in-service, all equal-or-lower-priority interrupts are blocked. + /// IRQ 0 has highest priority, IRQ 7 has lowest (default fixed + /// priority mode). fn pending_irq(&self) -> Option { - let pending = self.irr & !self.imr & !self.isr; - if pending == 0 { + let requested = self.irr & !self.imr; + if requested == 0 { return None; } - // Lowest bit number = highest priority. - (0..8u8).find(|&i| pending & (1 << i) != 0) + // Find the highest-priority (lowest-numbered) in-service IRQ. + // All IRQs at that level or lower are blocked. + let priority_ceiling = (0..8u8).find(|&i| self.isr & (1 << i) != 0); + match priority_ceiling { + Some(ceil) => { + // Only IRQs with higher priority (lower number) than the + // in-service IRQ can be delivered. + (0..ceil).find(|&i| requested & (1 << i) != 0) + } + None => { + // No interrupt in-service — deliver highest priority pending. + (0..8u8).find(|&i| requested & (1 << i) != 0) + } + } } /// Acknowledge the highest-priority pending interrupt. @@ -268,6 +285,16 @@ impl Pic { } } + /// Get master PIC state for diagnostics: (IRR, ISR, IMR, vector_base). + pub fn master_state(&self) -> (u8, u8, u8, u8) { + ( + self.master.irr, + self.master.isr, + self.master.imr, + self.master.vector_base, + ) + } + /// Check if the given I/O port belongs to either PIC. pub fn handles_port(&self, port: u16) -> bool { matches!( @@ -377,7 +404,12 @@ mod tests { let vector = chip.acknowledge(); assert_eq!(vector, Some(0x21)); // 0x20 + 1 - // IRQ 3 still pending. + + // IRQ 3 is blocked while IRQ 1 is in-service (lower priority). + assert_eq!(chip.pending_irq(), None, "IRQ 3 blocked while IRQ 1 in-service"); + + // After EOI for IRQ 1, IRQ 3 becomes deliverable. + chip.write_command(0x61); // Specific EOI for IRQ 1. assert_eq!(chip.pending_irq(), Some(3)); } @@ -390,12 +422,16 @@ mod tests { chip.raise_irq(0); chip.acknowledge(); // IRQ 0 now in ISR. - // Raise IRQ 1 — it's pending but IRQ 0 is in-service. - // Since ISR bit 0 is set, IRQ 0 blocks IRQ 0 but not IRQ 1. - // Actually, pending = irr & !imr & !isr. IRQ 1 is not in ISR, - // so it should be deliverable. + // Raise IRQ 1 — lower priority than IRQ 0. + // With proper 8259A priority masking, IRQ 1 is blocked while + // IRQ 0 is in-service (all equal-or-lower priority blocked). chip.raise_irq(1); - assert_eq!(chip.pending_irq(), Some(1)); + assert_eq!(chip.pending_irq(), None, "IRQ 1 must be blocked while IRQ 0 is in-service"); + + // After EOI for IRQ 0, IRQ 1 becomes deliverable. + chip.write_command(0x60); // Specific EOI for IRQ 0. + assert_eq!(chip.isr, 0, "ISR cleared after specific EOI"); + assert_eq!(chip.pending_irq(), Some(1), "IRQ 1 deliverable after EOI"); } #[test] @@ -419,15 +455,24 @@ mod tests { chip.imr = 0; chip.vector_base = 0x20; + // Acknowledge IRQ 0, then EOI it, then acknowledge IRQ 2. chip.raise_irq(0); chip.raise_irq(2); - chip.acknowledge(); // IRQ 0 acknowledged. - chip.acknowledge(); // IRQ 2 acknowledged. - assert_eq!(chip.isr, 0x05); // bits 0 and 2. + chip.acknowledge(); // IRQ 0 acknowledged → ISR bit 0. + assert_eq!(chip.isr, 0x01); + + // IRQ 2 is blocked while IRQ 0 in-service (priority masking). + assert_eq!(chip.pending_irq(), None); + + // EOI IRQ 0, then IRQ 2 becomes deliverable. + chip.write_command(0x60); // Specific EOI for IRQ 0. + assert_eq!(chip.isr, 0x00); + chip.acknowledge(); // IRQ 2 acknowledged → ISR bit 2. + assert_eq!(chip.isr, 0x04); // Specific EOI for IRQ 2 (OCW2: 0x60 | 2 = 0x62). chip.write_command(0x62); - assert_eq!(chip.isr, 0x01, "only IRQ 0 should remain in ISR"); + assert_eq!(chip.isr, 0x00, "ISR should be clear after both EOIs"); } #[test] @@ -521,6 +566,61 @@ mod tests { assert_eq!(chip.isr, 0, "ISR should not be set in auto-EOI mode"); } + /// Validates the fix for the WHPX flakiness root cause: PIT (IRQ 0) + /// in-service must block vsock (IRQ 6) delivery. Without this fix, + /// both interrupts end up in ISR simultaneously (ISR=0x41), causing + /// a deadlock where the kernel can't service either handler. + #[test] + fn test_pic_chip_pit_blocks_vsock_priority() { + let mut chip = PicChip::new(); + chip.imr = 0; + chip.vector_base = 0x30; // Linux programs master PIC to base 0x30. + + // PIT fires (IRQ 0) and gets acknowledged. + chip.raise_irq(0); + assert_eq!(chip.acknowledge(), Some(0x30)); + assert_eq!(chip.isr, 0x01); // PIT in-service. + + // While PIT handler runs, vsock (IRQ 6) fires. + chip.raise_irq(6); + + // IRQ 6 must NOT be deliverable (lower priority than IRQ 0). + assert_eq!( + chip.pending_irq(), + None, + "vsock IRQ 6 must be blocked while PIT IRQ 0 is in-service" + ); + + // Kernel sends specific EOI for PIT (0x60 | 0 = 0x60). + chip.write_command(0x60); + assert_eq!(chip.isr, 0x00); + + // Now vsock IRQ 6 is deliverable. + assert_eq!(chip.pending_irq(), Some(6)); + assert_eq!(chip.acknowledge(), Some(0x36)); + assert_eq!(chip.isr, 0x40); // Only vsock in-service, NOT 0x41. + } + + #[test] + fn test_pic_chip_higher_priority_preempts() { + let mut chip = PicChip::new(); + chip.imr = 0; + chip.vector_base = 0x30; + + // IRQ 6 (vsock) in-service. + chip.raise_irq(6); + chip.acknowledge(); + assert_eq!(chip.isr, 0x40); + + // IRQ 0 (PIT) fires — higher priority, should preempt. + chip.raise_irq(0); + assert_eq!( + chip.pending_irq(), + Some(0), + "higher-priority IRQ 0 should preempt IRQ 6" + ); + } + #[test] fn test_pic_chip_clear_irq() { let mut chip = PicChip::new(); diff --git a/src/vmm/src/windows/devices/virtio/mmio.rs b/src/vmm/src/windows/devices/virtio/mmio.rs index 946eb04ff..32af8735d 100644 --- a/src/vmm/src/windows/devices/virtio/mmio.rs +++ b/src/vmm/src/windows/devices/virtio/mmio.rs @@ -97,6 +97,7 @@ pub trait VirtioDeviceBackend { fn poll(&mut self, _queues: &mut [Virtqueue], _mem: &dyn GuestMemoryAccessor) -> bool { false } + } /// Virtio-MMIO device wrapping a backend. diff --git a/src/vmm/src/windows/devices/virtio/vsock/connection.rs b/src/vmm/src/windows/devices/virtio/vsock/connection.rs index 15673b714..5f22fe880 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/connection.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/connection.rs @@ -147,6 +147,10 @@ impl VsockConnection { self.peer_buf_alloc = hdr.buf_alloc; self.peer_fwd_cnt = hdr.fwd_cnt; self.state = ConnState::Connected; + log::debug!( + "vsock conn ({},{}) → Connected (guest REQUEST, buf_alloc={})", + self.local_port, self.peer_port, hdr.buf_alloc + ); Some(VsockHeader::new_response( self.local_cid, @@ -184,6 +188,7 @@ impl VsockConnection { /// Handle a SHUTDOWN from the guest. pub fn handle_shutdown(&mut self, flags: u32) { + let old_state = self.state; match self.state { ConnState::Connected => { if flags & (VSOCK_SHUTDOWN_SEND | VSOCK_SHUTDOWN_RECV) @@ -199,10 +204,20 @@ impl VsockConnection { } _ => {} } + if self.state != old_state { + log::debug!( + "vsock conn ({},{}) → {:?} (SHUTDOWN flags=0x{:x})", + self.local_port, self.peer_port, self.state, flags + ); + } } /// Handle a RST from the guest. pub fn handle_rst(&mut self) { + log::debug!( + "vsock conn ({},{}) → Closed (RST)", + self.local_port, self.peer_port + ); self.state = ConnState::Closed; } @@ -324,6 +339,10 @@ impl VsockConnection { self.peer_buf_alloc = hdr.buf_alloc; self.peer_fwd_cnt = hdr.fwd_cnt; self.state = ConnState::Connected; + log::debug!( + "vsock conn ({},{}) → Connected (guest RESPONSE, buf_alloc={})", + self.local_port, self.peer_port, hdr.buf_alloc + ); } (None, None) } diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index 10b06b0a2..4b6a1a3ca 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -191,9 +191,34 @@ impl VirtioVsock { let (resp_hdr, fwd_data) = conn.dispatch(hdr, payload); // Forward data to host TCP socket. + // Use retry loop for non-blocking sockets (write_all fails on WouldBlock). if let Some(data) = fwd_data { if let Some(stream) = self.streams.get_mut(&key) { - let _ = stream.write_all(&data); + let mut written = 0; + let mut retries = 0; + while written < data.len() { + match stream.write(&data[written..]) { + Ok(n) => written += n, + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { + retries += 1; + if retries > 1000 { + log::warn!( + "vsock write stuck: {}/{} bytes after {} retries, key=({},{})", + written, data.len(), retries, key.0, key.1 + ); + break; + } + std::thread::yield_now(); + } + Err(e) => { + log::warn!( + "vsock write failed: {}/{} bytes, err={}, key=({},{})", + written, data.len(), e, key.0, key.1 + ); + break; + } + } + } } } @@ -221,12 +246,20 @@ impl VirtioVsock { // Try outbound connection first (guest-initiated → host TCP target). if let Some(addr) = self.connect_targets.get(&hdr.dst_port).cloned() { + log::debug!("guest-initiated CONNECT: port={} → {}", hdr.dst_port, addr); let stream = match TcpStream::connect(&addr) { Ok(stream) => { - let _ = stream.set_nonblocking(true); + if let Err(e) = stream.set_nonblocking(true) { + log::warn!("guest-connect: set_nonblocking failed: {}", e); + } + if let Err(e) = stream.set_nodelay(true) { + log::warn!("guest-connect: set_nodelay failed: {}", e); + } + log::debug!("TCP connect OK to {}", addr); stream } - Err(_) => { + Err(ref e) => { + log::warn!("TCP connect FAILED to {}: {}", addr, e); let rst = VsockHeader::new_rst( VSOCK_CID_HOST, hdr.dst_port, @@ -333,8 +366,19 @@ impl VirtioVsock { for vsock_port in vsock_ports { let stream = if let Some(listener) = self.listeners.get(&vsock_port) { match listener.accept() { - Ok((stream, _addr)) => { - let _ = stream.set_nonblocking(true); + Ok((stream, addr)) => { + if let Err(e) = stream.set_nonblocking(true) { + log::warn!( + "vsock set_nonblocking failed: {} (addr={:?})", + e, addr + ); + } + if let Err(e) = stream.set_nodelay(true) { + log::warn!( + "vsock set_nodelay failed: {} (addr={:?})", + e, addr + ); + } stream } Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => continue, @@ -351,6 +395,10 @@ impl VirtioVsock { VsockConnection::new(VSOCK_CID_HOST, host_port, self.guest_cid, vsock_port); if let Some(req) = conn.initiate_connect() { + log::debug!( + "host-initiated CONNECT: vsock_port={}, host_port={}, queuing REQUEST", + vsock_port, host_port + ); self.rx_pending.push((req, Vec::new())); self.connections.insert(key, conn); self.streams.insert(key, stream); @@ -377,6 +425,7 @@ impl VirtioVsock { match stream.read(&mut buf) { Ok(0) => { // TCP connection closed. Send SHUTDOWN to guest. + log::debug!("TCP EOF, key=({},{})", key.0, key.1); if let Some(conn) = self.connections.get(&key) { let hdr = VsockHeader::new_shutdown( conn.local_cid, @@ -395,10 +444,19 @@ impl VirtioVsock { } continue; } - Ok(n) => Some(buf[..n].to_vec()), - Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => None, - Err(_) => { + Ok(n) => { + log::trace!("TCP read {} bytes, key=({},{})", n, key.0, key.1); + Some(buf[..n].to_vec()) + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { + None + } + Err(ref e) => { // I/O error on TCP stream. RST the vsock connection. + log::warn!( + "vsock TCP read error: {} (raw={:?}), key=({},{})", + e, e.raw_os_error(), key.0, key.1 + ); if let Some(conn) = self.connections.get(&key) { let rst = conn.make_rst(); self.rx_pending.push((rst, Vec::new())); @@ -415,7 +473,13 @@ impl VirtioVsock { // Enqueue data from TCP into the connection's TX buffer. if let Some(data) = data { if let Some(conn) = self.connections.get_mut(&key) { - conn.enqueue_tx(&data); + let enqueued = conn.enqueue_tx(&data); + if enqueued < data.len() { + log::debug!( + "vsock enqueue_tx partial: {}/{} bytes, credit={}, key=({},{})", + enqueued, data.len(), conn.peer_credit(), key.0, key.1 + ); + } } } } @@ -447,7 +511,13 @@ impl VirtioVsock { while !self.rx_pending.is_empty() { let head = match rx_queue.pop_avail(mem) { Ok(Some(h)) => h, - _ => break, // No available RX buffers. + _ => { + log::debug!( + "vsock inject_rx: no available RX buffers, {} packets pending", + self.rx_pending.len() + ); + break; + } }; let chain = match rx_queue.read_desc_chain(head, mem) { @@ -542,7 +612,14 @@ impl VirtioDeviceBackend for VirtioVsock { // Inject any pending data into the RX queue. if queues.len() > RX_QUEUE { - self.inject_rx(&mut queues[RX_QUEUE], mem) + let injected = self.inject_rx(&mut queues[RX_QUEUE], mem); + if injected { + log::debug!( + "vsock poll: injected data into RX queue, conns={}", + self.connections.len() + ); + } + injected } else { false } diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 0023f7075..47e6383f7 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -44,8 +44,13 @@ mod imp { /// /// With ACPI tables, `poweroff` is detected instantly via PM1a_CNT. /// MAX_HALTS is a safety fallback for non-ACPI shutdown paths. - /// At 1ms per tick, 50 = ~50ms timeout. - const MAX_HALTS: u64 = 50; + /// At 1ms per tick, 50000 = ~50 second timeout. + /// + /// Must be high enough to tolerate normal guest idle periods (e.g. + /// waiting for gRPC data after boot). The guest HLTs in its idle + /// loop whenever there are no interrupts; this is normal and does + /// NOT indicate the VM is stuck. + const MAX_HALTS: u64 = 50_000; /// Handle for a running VM, stored in `RUNNING_VMS`. struct VmHandle { @@ -235,12 +240,8 @@ mod imp { let mut halt_count: u64 = 0; let start_time = Instant::now(); let mut last_progress = Instant::now(); - // IO/MMIO access counters for debugging boot stalls. - let mut io_read_counts: HashMap = HashMap::new(); - let mut io_write_counts: HashMap = HashMap::new(); let mut mmio_count: u64 = 0; - let mut msr_count: u64 = 0; - let mut cpuid_count: u64 = 0; + let mut last_exit_reason = "none"; let exit_code; loop { @@ -252,7 +253,7 @@ mod imp { match vcpu.interrupts_enabled() { Ok(true) => { if let Some(vector) = devices.pic.acknowledge() { - log::trace!("Injecting interrupt vector {:#X}", vector); + log::debug!("Injecting interrupt vector {:#X}", vector); vcpu.inject_interrupt(vector)?; devices.set_window_requested(false); } @@ -263,20 +264,38 @@ mod imp { devices.set_window_requested(true); } } - Err(_) => {} + Err(ref e) => { + log::warn!("interrupts_enabled() error: {:?}", e); + } } } - let exit = vcpu.run()?; + let exit = match vcpu.run() { + Ok(exit) => exit, + Err(e) => { + log::error!( + "vcpu.run() FAILED after {} exits: {:?}", + exit_count, + e + ); + eprintln!( + "[WHPX] vcpu.run() FAILED after {} exits: {:?}", + exit_count, e + ); + last_exit_reason = "VCPU_RUN_ERROR"; + exit_code = 1; + break; + } + }; exit_count += 1; match exit { VcpuExit::IoOut { port, size, data } => { halt_count = 0; - *io_write_counts.entry(port).or_insert(0) += 1; devices.handle_io_out(port, size, data); if devices.shutdown_requested() { log::info!("ACPI shutdown detected after {} exits", exit_count); + last_exit_reason = "ACPI_SHUTDOWN"; exit_code = 0; break; } @@ -284,7 +303,6 @@ mod imp { } VcpuExit::IoIn { port, size } => { halt_count = 0; - *io_read_counts.entry(port).or_insert(0) += 1; let data = devices.handle_io_in(port, size); vcpu.complete_io_in(data, size)?; } @@ -305,14 +323,40 @@ mod imp { vcpu.skip_instruction()?; } VcpuExit::InterruptWindow => { + halt_count = 0; devices.set_window_requested(false); } VcpuExit::Halt => { if !run_config.should_run() { log::info!("VM stop requested, exiting on Halt"); + last_exit_reason = "HALT_STOP_REQUESTED"; exit_code = 0; break; } + + // Active HLT wake: poll devices before sleeping. + // If an interrupt arrived (e.g. vsock data) while the guest + // was halted, clear the HLT suspend state so the vCPU resumes + // immediately. This prevents lost wakeups where the guest + // sleeps through pending interrupts. Matches QEMU's WHPX + // HLT handling strategy. + devices.tick_and_poll(&mem_adapter); + + if devices.pic.has_pending() { + log::debug!( + "HLT with pending interrupt, clearing halt (exits={})", + exit_count + ); + if let Err(e) = vcpu.clear_halt() { + log::warn!("clear_halt failed: {:?}", e); + // Fall through to sleep (graceful degradation). + } else { + halt_count = 0; + continue; + } + } + + // No pending interrupts — guest is genuinely idle. halt_count += 1; if halt_count > MAX_HALTS { log::warn!( @@ -320,6 +364,7 @@ mod imp { halt_count, exit_count ); + last_exit_reason = "HALT_MAX_REACHED"; exit_code = 0; break; } @@ -327,6 +372,7 @@ mod imp { } VcpuExit::Shutdown => { log::info!("VM shutdown after {} exits", exit_count); + last_exit_reason = "VM_SHUTDOWN"; exit_code = 0; break; } @@ -334,6 +380,7 @@ mod imp { // Timer thread or stop() cancelled vCPU. Check if we should exit. if !run_config.should_run() { log::info!("VM stop requested, exiting on Cancelled"); + last_exit_reason = "CANCELLED_STOP"; exit_code = 0; break; } @@ -344,36 +391,8 @@ mod imp { let console_len = devices::get_console_output(ctx_id) .map(|b| b.len()) .unwrap_or(0); - // Sort IO ports by frequency (descending), show all. - let mut reads: Vec<_> = io_read_counts.iter().collect(); - reads.sort_by(|a, b| b.1.cmp(a.1)); - let top_reads: Vec = reads - .iter() - .map(|(p, c)| format!("{:#X}:{}", p, c)) - .collect(); - let mut writes: Vec<_> = io_write_counts.iter().collect(); - writes.sort_by(|a, b| b.1.cmp(a.1)); - let top_writes: Vec = writes - .iter() - .map(|(p, c)| format!("{:#X}:{}", p, c)) - .collect(); log::info!( - "Progress @ {:.1}s: exits={} RIP={:#X} RSP={:#X} RFLAGS={:#X} console={}B \ - mmio={} msr={} cpuid={}", - start_time.elapsed().as_secs_f64(), - exit_count, - regs.rip, - regs.rsp, - regs.rflags, - console_len, - mmio_count, - msr_count, - cpuid_count, - ); - log::info!(" IO_reads=[{}]", top_reads.join(", ")); - log::info!(" IO_writes=[{}]", top_writes.join(", ")); - eprintln!( - "[WHPX] {:.1}s: exits={} RIP={:#X} console={}B mmio={}", + "Progress @ {:.1}s: exits={} RIP={:#X} console={}B mmio={}", start_time.elapsed().as_secs_f64(), exit_count, regs.rip, @@ -390,7 +409,6 @@ mod imp { rdx, } => { halt_count = 0; - msr_count += 1; if is_write { log::trace!( "MSR write: 0x{:08X} <- 0x{:016X}", @@ -412,7 +430,6 @@ mod imp { default_rdx, } => { halt_count = 0; - cpuid_count += 1; let leaf = rax as u32; // Mask hypervisor-related CPUID leaves to prevent the Linux // guest from detecting Hyper-V and trying to use enlightenments @@ -456,6 +473,7 @@ mod imp { exit_count, regs.as_ref().map_or(0, |r| r.rip), ); + last_exit_reason = "TRIPLE_FAULT"; exit_code = -1; break; } @@ -465,6 +483,7 @@ mod imp { reason, exit_count ); + last_exit_reason = "UNKNOWN_EXIT"; exit_code = -1; break; } @@ -472,6 +491,7 @@ mod imp { if exit_count >= MAX_EXITS { log::warn!("Reached {} exit limit", MAX_EXITS); + last_exit_reason = "MAX_EXITS"; exit_code = -1; break; } @@ -481,8 +501,20 @@ mod imp { run_config.request_stop(); let _ = timer_thread.join(); - log::info!("VM exited with code {} ({} exits)", exit_code, exit_count); - eprintln!("[WHPX] VM exited, code={} exits={}", exit_code, exit_count); + log::info!( + "VM exited with code {} ({} exits), reason={}", + exit_code, + exit_count, + last_exit_reason + ); + eprintln!( + "[WHPX] VM exited, code={} exits={} reason={} elapsed={:.1}s", + exit_code, + exit_count, + last_exit_reason, + start_time.elapsed().as_secs_f64(), + ); + Ok(exit_code) } diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs index 806d358db..1e1a45d02 100644 --- a/src/vmm/src/windows/whpx.rs +++ b/src/vmm/src/windows/whpx.rs @@ -560,7 +560,14 @@ mod imp { std::mem::size_of::() as u32, ) }; - check_hresult("WHvRunVirtualProcessor", hr)?; + check_hresult("WHvRunVirtualProcessor", hr).map_err(|e| { + log::error!( + "WHvRunVirtualProcessor FAILED: {:?} (HRESULT=0x{:08X})", + e, + hr as u32 + ); + e + })?; // Cache RIP from the VP context for skip_instruction/complete_io_in. self.exit_rip.set(exit_context.VpContext.Rip); @@ -603,8 +610,31 @@ mod imp { // Decode the faulting instruction to get access size and write data. let byte_count = mem_ctx.InstructionByteCount as usize; let insn_bytes = &mem_ctx.InstructionBytes[..byte_count.min(16)]; - let regs = self.get_registers()?; - let insn = super::super::insn::decode_mmio_insn(insn_bytes, ®s)?; + let regs = self.get_registers().map_err(|e| { + log::error!( + "MMIO get_registers FAILED at GPA 0x{:x}: {:?}", + address, + e + ); + e + })?; + let insn = match super::super::insn::decode_mmio_insn(insn_bytes, ®s) { + Ok(insn) => insn, + Err(e) => { + log::error!( + "MMIO decode FAILED at GPA 0x{:x}: {:?}, bytes: {:02x?}, is_write={}", + address, + e, + insn_bytes, + is_write + ); + eprintln!( + "[WHPX] MMIO decode FAILED at GPA 0x{:x}, bytes: {:02x?}", + address, insn_bytes + ); + return Err(e); + } + }; self.exit_instruction_len.set(insn.len); self.exit_mmio_gpr_index.set(insn.gpr_index); @@ -905,6 +935,30 @@ mod imp { check_hresult("WHvSetVirtualProcessorRegisters(interrupt_window)", hr) } + /// Clear the HLT suspend state, waking the vCPU from halt. + /// + /// Uses `WHvRegisterInternalActivityState` to zero out suspend bits, + /// allowing the vCPU to resume execution (e.g., to process a pending + /// interrupt that arrived while halted). Matches QEMU's WHPX HLT + /// handling strategy. + pub fn clear_halt(&self) -> Result<()> { + // WHvRegisterInternalActivityState = 0x00040004 + // Bits: 0=StartupSuspend, 1=HaltSuspend, 2=IdleSuspend + // Write 0 to clear all suspend states. + let names: [i32; 1] = [0x00040004_i32]; + let values: Vec = vec![reg64(0)]; + let hr = unsafe { + WHvSetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 1, + values.as_ptr(), + ) + }; + check_hresult("WHvSetVirtualProcessorRegisters(clear_halt)", hr) + } + /// Cancel a running vCPU (causes it to exit with Cancelled). pub fn cancel(&self) -> Result<()> { let hr = unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, self.index, 0) }; From a2afcfbde2aca84359bf1cb5dceebe6aeff660cd Mon Sep 17 00:00:00 2001 From: lile Date: Sun, 26 Apr 2026 10:08:05 +0800 Subject: [PATCH 16/33] fix(windows): CMOS RTC BCD encoding and MMIO config space byte reads - CMOS RTC: use host UTC time with proper BCD encoding (to_bcd). Previous hardcoded decimal values caused guest to read year 2019 instead of 2026, breaking SSL certificate validation. - Virtio-MMIO: handle byte/word config space reads by aligning to 4-byte boundary and extracting the correct byte. Linux's virtio-mmio driver reads MAC address one byte at a time via readb(); unaligned reads previously returned 0, causing wrong MAC and NO-CARRIER. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/manager.rs | 123 +++++++++++++++++++-- src/vmm/src/windows/devices/virtio/mmio.rs | 36 +++++- 2 files changed, 147 insertions(+), 12 deletions(-) diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index d5483f600..0acc5f4f9 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -82,16 +82,92 @@ const PM1A_CNT_BLK: u16 = 0x604; /// Default vsock listen ports (BoxLite: 2695=gRPC, 2696=ready signal). const DEFAULT_VSOCK_PORTS: &[u32] = &[2695, 2696]; -/// CMOS register read values (static, read-only clock). +/// Convert a value to BCD (Binary-Coded Decimal). +/// E.g. 26 → 0x26, 59 → 0x59. +fn to_bcd(val: u8) -> u8 { + ((val / 10) << 4) | (val % 10) +} + +/// Snapshot of host UTC time, captured once at VM start and stored as +/// BCD values for CMOS register reads. +struct CmosTime { + seconds: u8, + minutes: u8, + hours: u8, + day_of_week: u8, + day_of_month: u8, + month: u8, + year: u8, // Two-digit year in BCD (e.g. 0x26 for 2026) + century: u8, // Century in BCD (e.g. 0x20) +} + +impl CmosTime { + /// Capture the current host UTC time. + fn now() -> Self { + use std::time::{SystemTime, UNIX_EPOCH}; + + let secs = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + // Break Unix timestamp into calendar components. + // Algorithm from Howard Hinnant's chrono-compatible date library. + let days = (secs / 86400) as i64; + let time_of_day = secs % 86400; + + // Civil date from days since epoch (March-based, then adjusted). + let z = days + 719468; + let era = if z >= 0 { z } else { z - 146096 } / 146097; + let doe = (z - era * 146097) as u64; // day of era [0, 146096] + let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; + let y = yoe as i64 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = doy - (153 * mp + 2) / 5 + 1; + let m = if mp < 10 { mp + 3 } else { mp - 9 }; + let y = if m <= 2 { y + 1 } else { y }; + + let hour = (time_of_day / 3600) as u8; + let minute = ((time_of_day % 3600) / 60) as u8; + let second = (time_of_day % 60) as u8; + + // Day of week: 1970-01-01 was Thursday (4). 1=Sun..7=Sat for CMOS. + let dow_zero = ((days % 7) + 4) % 7; // 0=Sun..6=Sat + let dow = dow_zero as u8 + 1; // 1=Sun..7=Sat + + let year_full = y as u16; + let century = (year_full / 100) as u8; + let year_2digit = (year_full % 100) as u8; + + Self { + seconds: to_bcd(second), + minutes: to_bcd(minute), + hours: to_bcd(hour), + day_of_week: to_bcd(dow), + day_of_month: to_bcd(d as u8), + month: to_bcd(m as u8), + year: to_bcd(year_2digit), + century: to_bcd(century), + } + } +} + +/// Host time snapshot, captured once at process start. +static CMOS_TIME: LazyLock = LazyLock::new(CmosTime::now); + +/// CMOS register read values. Time fields use the host-UTC snapshot; +/// everything else is static hardware description. fn cmos_read(addr: u8) -> u8 { + let t = &*CMOS_TIME; match addr { - 0x00 => 0, // Seconds - 0x02 => 0, // Minutes - 0x04 => 12, // Hours (12 noon) - 0x06 => 3, // Day of week (Wednesday) - 0x07 => 1, // Day of month - 0x08 => 1, // Month (January) - 0x09 => 25, // Year (2025) + 0x00 => t.seconds, + 0x02 => t.minutes, + 0x04 => t.hours, + 0x06 => t.day_of_week, + 0x07 => t.day_of_month, + 0x08 => t.month, + 0x09 => t.year, 0x0A => 0x26, // Status A: no update in progress, 32.768 kHz 0x0B => 0x02, // Status B: 24-hour, BCD mode 0x0C => 0x00, // Status C: no interrupt source @@ -104,7 +180,7 @@ fn cmos_read(addr: u8) -> u8 { 0x16 => 0x02, // Base memory high byte 0x17 => 0x00, // Extended memory low (kernel uses E820) 0x18 => 0x00, // Extended memory high - 0x32 => 0x20, // Century (20xx) + 0x32 => t.century, _ => 0x00, } } @@ -671,14 +747,39 @@ mod tests { assert_eq!(dm.handle_io_in(0x999, 1), 0xFF); } + #[test] + fn test_to_bcd() { + assert_eq!(to_bcd(0), 0x00); + assert_eq!(to_bcd(9), 0x09); + assert_eq!(to_bcd(10), 0x10); + assert_eq!(to_bcd(26), 0x26); + assert_eq!(to_bcd(59), 0x59); + assert_eq!(to_bcd(99), 0x99); + } + + #[test] + fn test_cmos_time_now_is_reasonable() { + let t = CmosTime::now(); + // Year should be 2025–2099 in BCD (0x25..0x99). + assert!(t.year >= 0x25, "year BCD too low: {:#04x}", t.year); + // Month 1..12 in BCD (0x01..0x12). + assert!(t.month >= 0x01 && t.month <= 0x12, "month: {:#04x}", t.month); + // Day 1..31 in BCD. + assert!(t.day_of_month >= 0x01 && t.day_of_month <= 0x31); + // Hours 0..23 in BCD. + assert!(t.hours <= 0x23); + // Century should be 0x20. + assert_eq!(t.century, 0x20); + } + #[test] fn test_cmos_read_via_io() { let mut dm = make_test_devices(); // Select CMOS register 0x09 (year). dm.handle_io_out(0x70, 1, 0x09); - // Read CMOS data. let year = dm.handle_io_in(0x71, 1); - assert_eq!(year, 25); // 2025. + // Year must be valid BCD (>= 0x25 for 2025+). + assert!(year >= 0x25, "year BCD: {:#04x}", year); } #[test] diff --git a/src/vmm/src/windows/devices/virtio/mmio.rs b/src/vmm/src/windows/devices/virtio/mmio.rs index 32af8735d..2591d2bdf 100644 --- a/src/vmm/src/windows/devices/virtio/mmio.rs +++ b/src/vmm/src/windows/devices/virtio/mmio.rs @@ -184,7 +184,20 @@ impl VirtioMmioDevice { INTERRUPT_STATUS => self.interrupt_status, STATUS => self.status, CONFIG_GENERATION => 0, // Config doesn't change dynamically. - off if off >= CONFIG_SPACE => self.backend.read_config(off - CONFIG_SPACE), + off if off >= CONFIG_SPACE => { + let config_offset = off - CONFIG_SPACE; + let aligned_offset = config_offset & !3; + let word = self.backend.read_config(aligned_offset); + if size == 4 { + word + } else { + // Byte/word access: extract the correct portion. + // Config space is little-endian; byte N within the u32 is + // at bits (N*8)..(N*8+8). + let byte_index = (config_offset & 3) as u32; + (word >> (byte_index * 8)) & ((1u32 << (size as u32 * 8)) - 1) + } + } _ => 0, } } @@ -574,6 +587,27 @@ mod tests { assert_eq!(dev.read(CONFIG_SPACE, 4), 1024); } + #[test] + fn test_config_space_byte_reads() { + // Simulates how the Linux virtio-mmio driver reads the MAC: one byte at a time. + // TestBackend returns 1024 (= 0x00000400) at config offset 0. + let dev = VirtioMmioDevice::new(TestBackend::new()); + // 1024 as LE bytes: [0x00, 0x04, 0x00, 0x00] + assert_eq!(dev.read(CONFIG_SPACE + 0, 1), 0x00); // byte 0 + assert_eq!(dev.read(CONFIG_SPACE + 1, 1), 0x04); // byte 1 + assert_eq!(dev.read(CONFIG_SPACE + 2, 1), 0x00); // byte 2 + assert_eq!(dev.read(CONFIG_SPACE + 3, 1), 0x00); // byte 3 + } + + #[test] + fn test_config_space_word_reads() { + let dev = VirtioMmioDevice::new(TestBackend::new()); + // 1024 = 0x0400. Two-byte read at offset 0 should give 0x0400. + assert_eq!(dev.read(CONFIG_SPACE + 0, 2), 0x0400); + // Two-byte read at offset 2 should give 0x0000. + assert_eq!(dev.read(CONFIG_SPACE + 2, 2), 0x0000); + } + // --- Non-32-bit access --- #[test] From 3f855779614049df72c647bc1efe205e78f03d73 Mon Sep 17 00:00:00 2001 From: lile Date: Mon, 27 Apr 2026 18:54:52 +0800 Subject: [PATCH 17/33] =?UTF-8?q?perf(windows/vsock):=20increase=20TCP=20r?= =?UTF-8?q?ead=20buffer=204KB=20=E2=86=92=2064KB=20+=20add=20trace=20loggi?= =?UTF-8?q?ng?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase the per-poll TCP read buffer from 4096 to 65536 bytes. This improves vsock throughput for large data transfers (e.g. container image layers, HTTP downloads) by reducing the number of poll cycles needed per transfer. Also add trace-level logging for guest→host packets and TCP poll activity to aid future debugging without impacting performance at info level. Co-Authored-By: Claude Opus 4.6 --- .../src/windows/devices/virtio/vsock/mod.rs | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index 4b6a1a3ca..e0d1a1cdd 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -181,6 +181,15 @@ impl VirtioVsock { /// Handle a packet from the guest. fn handle_guest_packet(&mut self, hdr: &VsockHeader, payload: &[u8]) { let key = (hdr.src_port, hdr.dst_port); + if !payload.is_empty() { + log::trace!( + "vsock TX: guest→host {} bytes, op={}, key=({},{})", + payload.len(), + hdr.op, + key.0, + key.1 + ); + } if hdr.op == VSOCK_OP_REQUEST { self.handle_connect_request(hdr); @@ -420,7 +429,7 @@ impl VirtioVsock { } } - let mut buf = [0u8; 4096]; + let mut buf = [0u8; 65536]; let data = if let Some(stream) = self.streams.get_mut(&key) { match stream.read(&mut buf) { Ok(0) => { @@ -608,7 +617,16 @@ impl VirtioDeviceBackend for VirtioVsock { self.poll_tcp_listeners(); // Poll TCP streams for incoming data. + let pending_before = self.rx_pending.len(); self.poll_tcp_streams(); + let new_data = self.rx_pending.len() - pending_before; + if new_data > 0 { + log::trace!( + "vsock poll: TCP produced {} new packets, total pending={}", + new_data, + self.rx_pending.len() + ); + } // Inject any pending data into the RX queue. if queues.len() > RX_QUEUE { From e98ee48f069ce003eef63571c8b9085ae8b71279 Mon Sep 17 00:00:00 2001 From: lile Date: Tue, 28 Apr 2026 19:05:35 +0800 Subject: [PATCH 18/33] feat(windows): add async virtio-blk worker (disabled) + HLT/interrupt fixes Add async block I/O worker infrastructure that moves disk reads/writes to a dedicated thread via mpsc channels. This prevents heavy disk I/O from starving vsock/net devices in the vCPU loop. However, the async worker is DISABLED on Windows/WHPX because the worker thread's raw pointer writes to guest memory conflict with WHPX memory tracking, causing ~60% boot failure rate. Sync disk I/O remains the default and provides 100% boot reliability (verified 20/20 on Win10). Other changes in this commit: - HLT handler: replace clear_halt() with inject_interrupt() for more reliable interrupt delivery on WHPX - Add APIC emulation guard comment (broken on Win10 MBP 2014) - Add diagnostic HLT counters and periodic logging - GuestMemory implements GuestMemoryAccessor directly (Arc-compatible) - whpx: add request_interrupt() method and warn-level logging - mmio: add poll_backend() for async completion draining - disk: add Send bound to DiskBackend trait Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/manager.rs | 41 ++ src/vmm/src/windows/devices/virtio/block.rs | 334 +++++++++- .../windows/devices/virtio/block_worker.rs | 575 ++++++++++++++++++ src/vmm/src/windows/devices/virtio/disk.rs | 2 +- src/vmm/src/windows/devices/virtio/mmio.rs | 24 + src/vmm/src/windows/devices/virtio/mod.rs | 1 + src/vmm/src/windows/runner.rs | 109 ++-- src/vmm/src/windows/whpx.rs | 70 ++- 8 files changed, 1070 insertions(+), 86 deletions(-) create mode 100644 src/vmm/src/windows/devices/virtio/block_worker.rs diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 0acc5f4f9..52f98a6d8 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -548,6 +548,35 @@ impl DeviceManager { } } + /// Start async block I/O workers for virtio-blk devices. + /// + /// Must be called after the guest memory is set up, before the vCPU loop. + pub fn start_blk_workers( + &mut self, + guest_mem: Arc, + ) { + if let Some(ref mut dev) = self.virtio_blk { + dev.backend_mut() + .start_worker(Arc::clone(&guest_mem), "blk-worker-0"); + } + if let Some(ref mut dev) = self.virtio_blk2 { + dev.backend_mut() + .start_worker(Arc::clone(&guest_mem), "blk-worker-1"); + } + } + + /// Stop async block I/O workers. + /// + /// Called during shutdown. Also called by Drop if not explicitly called. + pub fn stop_blk_workers(&mut self) { + if let Some(ref mut dev) = self.virtio_blk { + dev.backend_mut().stop_worker(); + } + if let Some(ref mut dev) = self.virtio_blk2 { + dev.backend_mut().stop_worker(); + } + } + /// Tick the PIT timer based on wall clock time and poll devices. /// /// Call this at the top of each vCPU run loop iteration. @@ -564,6 +593,18 @@ impl DeviceManager { } } + // Drain async block I/O completions. + if let Some(ref mut dev) = self.virtio_blk { + if dev.poll_backend(mem) { + self.pic.raise_irq(irq_for_slot(0)); + } + } + if let Some(ref mut dev) = self.virtio_blk2 { + if dev.poll_backend(mem) { + self.pic.raise_irq(irq_for_slot(4)); + } + } + // Poll vsock for host-initiated data. if self.virtio_vsock.poll(mem) { log::debug!("vsock poll raised IRQ {}", irq_for_slot(1)); diff --git a/src/vmm/src/windows/devices/virtio/block.rs b/src/vmm/src/windows/devices/virtio/block.rs index 3a7b1e907..16d70604b 100644 --- a/src/vmm/src/windows/devices/virtio/block.rs +++ b/src/vmm/src/windows/devices/virtio/block.rs @@ -2,7 +2,16 @@ //! //! Provides a file-backed block device that processes read, write, //! and flush requests through the virtqueue. +//! +//! When a worker thread is started via `start_worker()`, disk I/O is +//! dispatched asynchronously to avoid blocking the vCPU loop. Without +//! a worker, requests are processed synchronously (fallback mode). + +use std::sync::mpsc; +use std::sync::Arc; +use std::thread; +use super::block_worker::{BlockCompletion, BlockRequest, BlockWorker, BufferDesc, RequestType}; use super::disk::DiskBackend; use super::mmio::VirtioDeviceBackend; use super::queue::{Descriptor, GuestMemoryAccessor, Virtqueue}; @@ -36,10 +45,23 @@ const VIRTIO_BLK_S_IOERR: u8 = 1; const VIRTIO_BLK_S_UNSUPP: u8 = 2; /// Virtio-blk device backed by a `DiskBackend`. +/// +/// Supports two modes: +/// - **Sync** (default): disk I/O in the vCPU thread (simple, but blocks) +/// - **Async** (after `start_worker()`): disk I/O in a dedicated thread pub struct VirtioBlock { - disk: Box, + /// Disk backend — owned here in sync mode, moved to worker in async mode. + disk: Option>, capacity: u64, // In sectors. read_only: bool, + + // --- Async worker fields (None until start_worker is called) --- + /// Channel to send requests to the worker thread. + request_tx: Option>, + /// Channel to receive completions from the worker thread. + completion_rx: Option>, + /// Worker thread join handle. + worker_handle: Option>, } impl VirtioBlock { @@ -49,9 +71,12 @@ impl VirtioBlock { pub fn new(disk: Box, read_only: bool) -> Self { let capacity = disk.capacity_bytes() / SECTOR_SIZE; VirtioBlock { - disk, + disk: Some(disk), capacity, read_only, + request_tx: None, + completion_rx: None, + worker_handle: None, } } @@ -60,6 +85,146 @@ impl VirtioBlock { self.capacity } + /// Whether the async worker is active. + pub fn has_worker(&self) -> bool { + self.request_tx.is_some() + } + + /// Start the async block I/O worker thread. + /// + /// Moves the disk backend to the worker. After this call, `queue_notify` + /// dispatches requests asynchronously instead of blocking. + pub fn start_worker( + &mut self, + guest_mem: Arc, + name: &str, + ) { + let disk = match self.disk.take() { + Some(d) => d, + None => { + log::warn!("start_worker called but disk already moved to worker"); + return; + } + }; + + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let worker = BlockWorker::new(req_rx, comp_tx, disk, guest_mem, self.read_only); + let handle = worker.run(name); + + self.request_tx = Some(req_tx); + self.completion_rx = Some(comp_rx); + self.worker_handle = Some(handle); + + log::info!("block worker '{}' started", name); + } + + /// Stop the worker thread and reclaim resources. + /// + /// Drops the request channel (worker exits on recv error), then joins. + pub fn stop_worker(&mut self) { + // Drop the sender to signal the worker to exit. + self.request_tx.take(); + self.completion_rx.take(); + + if let Some(handle) = self.worker_handle.take() { + let _ = handle.join(); + log::info!("block worker stopped"); + } + } + + /// Drain pending completions from the worker and update the used ring. + /// + /// Called from `tick_and_poll()` in the vCPU loop. Returns `true` if + /// any completions were processed (interrupt should be raised). + pub fn drain_completions( + &mut self, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + let rx = match self.completion_rx { + Some(ref rx) => rx, + None => return false, + }; + + let mut drained = false; + while let Ok(comp) = rx.try_recv() { + let _ = queue.add_used(comp.head_index, comp.bytes_written, mem); + drained = true; + } + drained + } + + /// Parse a descriptor chain header and build a BlockRequest. + /// + /// Returns None if the chain is malformed. + fn parse_request( + chain: &[Descriptor], + head_index: u16, + mem: &dyn GuestMemoryAccessor, + ) -> Option { + if chain.len() < 2 { + log::debug!("BLK: short chain len={}", chain.len()); + return None; + } + + let header_desc = &chain[0]; + if header_desc.len < 16 { + log::debug!("BLK: short header len={}", header_desc.len); + return None; + } + + let mut header_buf = [0u8; 16]; + if mem.read_at(header_desc.addr, &mut header_buf).is_err() { + log::debug!("BLK: header read failed addr=0x{:X}", header_desc.addr); + return None; + } + + let raw_type = + u32::from_le_bytes([header_buf[0], header_buf[1], header_buf[2], header_buf[3]]); + let sector = u64::from_le_bytes([ + header_buf[8], + header_buf[9], + header_buf[10], + header_buf[11], + header_buf[12], + header_buf[13], + header_buf[14], + header_buf[15], + ]); + + let req_type = match raw_type { + VIRTIO_BLK_T_IN => RequestType::Read, + VIRTIO_BLK_T_OUT => RequestType::Write, + VIRTIO_BLK_T_FLUSH => RequestType::Flush, + _ => RequestType::Unsupported, + }; + + // Middle descriptors: data buffers. Last descriptor: status byte. + let data_descs = &chain[1..chain.len() - 1]; + let status_desc = chain.last().unwrap(); + + let data_buffers: Vec = data_descs + .iter() + .map(|d| BufferDesc { + addr: d.addr, + len: d.len, + is_write: d.is_write(), + }) + .collect(); + + Some(BlockRequest { + head_index, + req_type, + sector, + data_buffers, + status_addr: status_desc.addr, + }) + } + + // --- Synchronous fallback (used when no worker is active) --- + /// Process a single virtio-blk request from a descriptor chain. fn process_request(&mut self, chain: &[Descriptor], mem: &dyn GuestMemoryAccessor) -> u8 { // Minimum: header + status (flush has no data descriptor). @@ -122,6 +287,10 @@ impl VirtioBlock { data_descs: &[Descriptor], mem: &dyn GuestMemoryAccessor, ) -> u8 { + let disk = match self.disk { + Some(ref mut d) => d, + None => return VIRTIO_BLK_S_IOERR, + }; let mut offset = sector * SECTOR_SIZE; for (i, desc) in data_descs.iter().enumerate() { @@ -134,7 +303,7 @@ impl VirtioBlock { return VIRTIO_BLK_S_IOERR; } let mut buf = vec![0u8; desc.len as usize]; - if let Err(e) = self.disk.read_at(offset, &mut buf) { + if let Err(e) = disk.read_at(offset, &mut buf) { log::debug!( "BLK READ: disk.read_at(0x{:X}, {}) failed: {}", offset, @@ -167,6 +336,10 @@ impl VirtioBlock { return VIRTIO_BLK_S_IOERR; } + let disk = match self.disk { + Some(ref mut d) => d, + None => return VIRTIO_BLK_S_IOERR, + }; let mut offset = sector * SECTOR_SIZE; for desc in data_descs { @@ -177,7 +350,7 @@ impl VirtioBlock { if mem.read_at(desc.addr, &mut buf).is_err() { return VIRTIO_BLK_S_IOERR; } - if self.disk.write_at(offset, &buf).is_err() { + if disk.write_at(offset, &buf).is_err() { return VIRTIO_BLK_S_IOERR; } offset += desc.len as u64; @@ -186,7 +359,11 @@ impl VirtioBlock { } fn handle_flush(&mut self) -> u8 { - if self.disk.flush().is_err() { + let disk = match self.disk { + Some(ref mut d) => d, + None => return VIRTIO_BLK_S_IOERR, + }; + if disk.flush().is_err() { VIRTIO_BLK_S_IOERR } else { VIRTIO_BLK_S_OK @@ -194,6 +371,12 @@ impl VirtioBlock { } } +impl Drop for VirtioBlock { + fn drop(&mut self) { + self.stop_worker(); + } +} + impl VirtioDeviceBackend for VirtioBlock { fn device_id(&self) -> u32 { VIRTIO_BLK_ID @@ -228,34 +411,85 @@ impl VirtioDeviceBackend for VirtioBlock { queue: &mut Virtqueue, mem: &dyn GuestMemoryAccessor, ) -> bool { - let mut processed = false; - - while let Ok(Some(head)) = queue.pop_avail(mem) { - let chain = match queue.read_desc_chain(head, mem) { - Ok(c) => c, - Err(_) => { - // Write error status and continue. - let _ = queue.add_used(head, 0, mem); - processed = true; - continue; + // Async mode: parse descriptors and dispatch to worker. + if let Some(ref tx) = self.request_tx { + let mut dispatched = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + let _ = queue.add_used(head, 0, mem); + dispatched = true; + continue; + } + }; + + match Self::parse_request(&chain, head, mem) { + Some(req) => { + if tx.send(req).is_err() { + // Worker died — fall through with error status. + if let Some(status_desc) = chain.last() { + let _ = mem.write_at(status_desc.addr, &[VIRTIO_BLK_S_IOERR]); + } + let total_written: u32 = + chain.iter().filter(|d| d.is_write()).map(|d| d.len).sum(); + let _ = queue.add_used(head, total_written, mem); + dispatched = true; + } + } + None => { + // Malformed chain — write error status directly. + let _ = queue.add_used(head, 0, mem); + dispatched = true; + } } - }; + } - let status = self.process_request(&chain, mem); + // In async mode, don't raise interrupt here — completions + // arrive via drain_completions() during tick_and_poll(). + // Return dispatched for malformed chains that were handled inline. + dispatched + } else { + // Sync fallback: process requests directly (original behavior). + let mut processed = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(_) => { + let _ = queue.add_used(head, 0, mem); + processed = true; + continue; + } + }; + + let status = self.process_request(&chain, mem); + + if let Some(status_desc) = chain.last() { + let _ = mem.write_at(status_desc.addr, &[status]); + } - // Write status byte to last descriptor. - if let Some(status_desc) = chain.last() { - let _ = mem.write_at(status_desc.addr, &[status]); + let total_written: u32 = + chain.iter().filter(|d| d.is_write()).map(|d| d.len).sum(); + let _ = queue.add_used(head, total_written, mem); + processed = true; } - // Calculate total bytes written to device-writable descriptors. - let total_written: u32 = chain.iter().filter(|d| d.is_write()).map(|d| d.len).sum(); - - let _ = queue.add_used(head, total_written, mem); - processed = true; + processed } + } - processed + fn drain_completions( + &mut self, + queues: &mut [Virtqueue], + mem: &dyn GuestMemoryAccessor, + ) -> bool { + if let Some(queue) = queues.first_mut() { + self.drain_completions(queue, mem) + } else { + false + } } fn num_queues(&self) -> usize { @@ -358,6 +592,7 @@ mod tests { let blk = VirtioBlock::new(backend, false); assert_eq!(blk.capacity(), 8); assert_eq!(blk.device_id(), VIRTIO_BLK_ID); + assert!(!blk.has_worker()); } #[test] @@ -387,7 +622,7 @@ mod tests { assert_eq!(blk.read_config(4), 0); // High. } - // --- Request processing (direct) --- + // --- Request processing (direct/sync) --- #[test] fn test_read_request() { @@ -652,4 +887,49 @@ mod tests { let blk = VirtioBlock::new(backend, false); assert_eq!(blk.queue_max_size(0), 256); } + + // --- parse_request --- + + #[test] + fn test_parse_request_read() { + let mem = MockMem::new(0x10000); + + let mut header = [0u8; 16]; + header[0..4].copy_from_slice(&VIRTIO_BLK_T_IN.to_le_bytes()); + header[8..16].copy_from_slice(&5u64.to_le_bytes()); + mem.write_bytes(0x1000, &header); + + let chain = vec![ + Descriptor { addr: 0x1000, len: 16, flags: 0, next: 0 }, + Descriptor { addr: 0x2000, len: 512, flags: 2, next: 0 }, + Descriptor { addr: 0x3000, len: 1, flags: 2, next: 0 }, + ]; + + let req = VirtioBlock::parse_request(&chain, 10, &mem).unwrap(); + assert_eq!(req.head_index, 10); + assert_eq!(req.req_type, RequestType::Read); + assert_eq!(req.sector, 5); + assert_eq!(req.data_buffers.len(), 1); + assert!(req.data_buffers[0].is_write); + assert_eq!(req.status_addr, 0x3000); + } + + #[test] + fn test_parse_request_short_chain_returns_none() { + let mem = MockMem::new(0x10000); + let chain = vec![ + Descriptor { addr: 0x1000, len: 16, flags: 0, next: 0 }, + ]; + assert!(VirtioBlock::parse_request(&chain, 0, &mem).is_none()); + } + + // --- stop_worker --- + + #[test] + fn test_stop_worker_without_start_is_noop() { + let tmp = create_test_disk(4); + let backend = open_raw_backend(&tmp, false); + let mut blk = VirtioBlock::new(backend, false); + blk.stop_worker(); // Should not panic. + } } diff --git a/src/vmm/src/windows/devices/virtio/block_worker.rs b/src/vmm/src/windows/devices/virtio/block_worker.rs new file mode 100644 index 000000000..04021934e --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/block_worker.rs @@ -0,0 +1,575 @@ +//! Async block I/O worker thread for virtio-blk. +//! +//! Moves disk I/O off the vCPU loop into a dedicated thread so that +//! long-running reads/writes don't starve vsock or net devices. +//! +//! The vCPU thread sends `BlockRequest`s (parsed descriptor chains) +//! via an mpsc channel. The worker performs disk I/O, writes data and +//! status bytes to guest memory, and sends `BlockCompletion`s back. +//! The vCPU thread drains completions during `tick_and_poll()` and +//! updates the used ring. + +use std::sync::mpsc; +use std::sync::Arc; +use std::thread; + +use super::disk::DiskBackend; +use super::queue::GuestMemoryAccessor; + +/// Block size in bytes (standard sector size). +const SECTOR_SIZE: u64 = 512; + +// Virtio-blk status values. +const VIRTIO_BLK_S_OK: u8 = 0; +const VIRTIO_BLK_S_IOERR: u8 = 1; +const VIRTIO_BLK_S_UNSUPP: u8 = 2; + +/// Type of block request. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RequestType { + Read, + Write, + Flush, + Unsupported, +} + +/// A single buffer descriptor from the virtqueue chain. +#[derive(Debug, Clone)] +pub struct BufferDesc { + /// Guest physical address. + pub addr: u64, + /// Length in bytes. + pub len: u32, + /// Whether this buffer is device-writable (guest reads from it). + pub is_write: bool, +} + +/// A block request dispatched from the vCPU thread to the worker. +#[derive(Debug)] +pub struct BlockRequest { + /// Descriptor chain head index (for add_used later). + pub head_index: u16, + /// Request type (read/write/flush). + pub req_type: RequestType, + /// Starting sector (for read/write). + pub sector: u64, + /// Data buffer descriptors (between header and status). + pub data_buffers: Vec, + /// Guest address of the status byte (last descriptor). + pub status_addr: u64, +} + +/// Completion sent from the worker back to the vCPU thread. +#[derive(Debug)] +pub struct BlockCompletion { + /// Descriptor chain head index. + pub head_index: u16, + /// Total bytes written to device-writable descriptors (for used ring). + pub bytes_written: u32, +} + +/// Worker thread that processes block I/O requests. +pub struct BlockWorker { + request_rx: mpsc::Receiver, + completion_tx: mpsc::Sender, + disk: Box, + guest_mem: Arc, + read_only: bool, +} + +impl BlockWorker { + /// Create a new block worker. + pub fn new( + request_rx: mpsc::Receiver, + completion_tx: mpsc::Sender, + disk: Box, + guest_mem: Arc, + read_only: bool, + ) -> Self { + BlockWorker { + request_rx, + completion_tx, + disk, + guest_mem, + read_only, + } + } + + /// Spawn the worker on a named thread. Returns the join handle. + pub fn run(self, name: &str) -> thread::JoinHandle<()> { + let thread_name = name.to_string(); + thread::Builder::new() + .name(thread_name) + .spawn(move || self.work()) + .expect("failed to spawn block worker thread") + } + + /// Blocking recv loop: process requests until the channel closes. + fn work(mut self) { + log::info!("block worker started"); + + while let Ok(req) = self.request_rx.recv() { + let (status, bytes_written) = self.process_request(&req); + + // Write status byte to guest memory. + let _ = self.guest_mem.write_at(req.status_addr, &[status]); + + let completion = BlockCompletion { + head_index: req.head_index, + bytes_written, + }; + + // If the vCPU thread dropped its receiver, the VM is shutting down. + if self.completion_tx.send(completion).is_err() { + break; + } + } + + log::info!("block worker exiting"); + } + + /// Process a single block request. Returns (status, bytes_written). + fn process_request(&mut self, req: &BlockRequest) -> (u8, u32) { + match req.req_type { + RequestType::Read => self.handle_read(req.sector, &req.data_buffers), + RequestType::Write => self.handle_write(req.sector, &req.data_buffers), + RequestType::Flush => (self.handle_flush(), 0), + RequestType::Unsupported => (VIRTIO_BLK_S_UNSUPP, 0), + } + } + + fn handle_read(&mut self, sector: u64, data_buffers: &[BufferDesc]) -> (u8, u32) { + let mut offset = sector * SECTOR_SIZE; + let mut bytes_written: u32 = 0; + + for buf in data_buffers { + if !buf.is_write { + log::debug!("BLK worker READ: buffer not device-writable"); + return (VIRTIO_BLK_S_IOERR, bytes_written); + } + let mut data = vec![0u8; buf.len as usize]; + if let Err(e) = self.disk.read_at(offset, &mut data) { + log::debug!("BLK worker READ: disk.read_at failed: {}", e); + return (VIRTIO_BLK_S_IOERR, bytes_written); + } + if let Err(e) = self.guest_mem.write_at(buf.addr, &data) { + log::debug!("BLK worker READ: mem.write_at failed: {}", e); + return (VIRTIO_BLK_S_IOERR, bytes_written); + } + offset += buf.len as u64; + bytes_written += buf.len; + } + + // +1 for the status byte (also device-writable). + (VIRTIO_BLK_S_OK, bytes_written + 1) + } + + fn handle_write(&mut self, sector: u64, data_buffers: &[BufferDesc]) -> (u8, u32) { + if self.read_only { + return (VIRTIO_BLK_S_IOERR, 0); + } + + let mut offset = sector * SECTOR_SIZE; + + for buf in data_buffers { + if buf.is_write { + // Data for write must be device-readable (not device-writable). + return (VIRTIO_BLK_S_IOERR, 0); + } + let mut data = vec![0u8; buf.len as usize]; + if self.guest_mem.read_at(buf.addr, &mut data).is_err() { + return (VIRTIO_BLK_S_IOERR, 0); + } + if self.disk.write_at(offset, &data).is_err() { + return (VIRTIO_BLK_S_IOERR, 0); + } + offset += buf.len as u64; + } + + // Only status byte is device-writable for writes. + (VIRTIO_BLK_S_OK, 1) + } + + fn handle_flush(&mut self) -> u8 { + if self.disk.flush().is_err() { + VIRTIO_BLK_S_IOERR + } else { + VIRTIO_BLK_S_OK + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::cell::RefCell; + use std::sync::mpsc; + + /// In-memory disk backend for testing. + struct MemDisk { + data: Vec, + read_only: bool, + } + + impl MemDisk { + fn new(size: usize) -> Self { + MemDisk { + data: vec![0u8; size], + read_only: false, + } + } + + fn with_pattern(sectors: u64) -> Self { + let size = (sectors * SECTOR_SIZE) as usize; + let mut data = vec![0u8; size]; + for sector in 0..sectors { + let start = (sector * SECTOR_SIZE) as usize; + let end = start + SECTOR_SIZE as usize; + data[start..end].fill((sector & 0xFF) as u8); + } + MemDisk { + data, + read_only: false, + } + } + } + + // Safety: MemDisk only uses Vec which is Send. + unsafe impl Send for MemDisk {} + + impl DiskBackend for MemDisk { + fn read_at(&mut self, offset: u64, buf: &mut [u8]) -> super::super::super::super::error::Result<()> { + let start = offset as usize; + let end = start + buf.len(); + if end > self.data.len() { + return Err(super::super::super::super::error::WkrunError::Device( + "read out of bounds".into(), + )); + } + buf.copy_from_slice(&self.data[start..end]); + Ok(()) + } + + fn write_at(&mut self, offset: u64, buf: &[u8]) -> super::super::super::super::error::Result<()> { + if self.read_only { + return Err(super::super::super::super::error::WkrunError::Device( + "read-only disk".into(), + )); + } + let start = offset as usize; + let end = start + buf.len(); + if end > self.data.len() { + return Err(super::super::super::super::error::WkrunError::Device( + "write out of bounds".into(), + )); + } + self.data[start..end].copy_from_slice(buf); + Ok(()) + } + + fn flush(&mut self) -> super::super::super::super::error::Result<()> { + Ok(()) + } + + fn capacity_bytes(&self) -> u64 { + self.data.len() as u64 + } + } + + /// Thread-safe mock guest memory for testing the worker. + struct MockMem { + data: std::sync::Mutex>, + } + + impl MockMem { + fn new(size: usize) -> Self { + MockMem { + data: std::sync::Mutex::new(vec![0u8; size]), + } + } + + fn write_bytes(&self, addr: u64, bytes: &[u8]) { + let a = addr as usize; + let mut data = self.data.lock().unwrap(); + data[a..a + bytes.len()].copy_from_slice(bytes); + } + + fn read_bytes(&self, addr: u64, len: usize) -> Vec { + let a = addr as usize; + let data = self.data.lock().unwrap(); + data[a..a + len].to_vec() + } + } + + impl GuestMemoryAccessor for MockMem { + fn read_at(&self, addr: u64, buf: &mut [u8]) -> super::super::super::super::error::Result<()> { + let a = addr as usize; + let data = self.data.lock().unwrap(); + if a + buf.len() > data.len() { + return Err(super::super::super::super::error::WkrunError::Memory( + "out of bounds".into(), + )); + } + buf.copy_from_slice(&data[a..a + buf.len()]); + Ok(()) + } + fn write_at(&self, addr: u64, data: &[u8]) -> super::super::super::super::error::Result<()> { + let a = addr as usize; + let mut mem = self.data.lock().unwrap(); + if a + data.len() > mem.len() { + return Err(super::super::super::super::error::WkrunError::Memory( + "out of bounds".into(), + )); + } + mem[a..a + data.len()].copy_from_slice(data); + Ok(()) + } + } + + #[test] + fn test_worker_read_request() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let disk = MemDisk::with_pattern(4); + let mem = Arc::new(MockMem::new(0x10000)); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let handle = worker.run("test-blk-read"); + + // Send a read request for sector 2 (pattern = 0x02). + req_tx + .send(BlockRequest { + head_index: 42, + req_type: RequestType::Read, + sector: 2, + data_buffers: vec![BufferDesc { + addr: 0x2000, + len: 512, + is_write: true, + }], + status_addr: 0x3000, + }) + .unwrap(); + + // Close the channel to let the worker exit. + drop(req_tx); + handle.join().unwrap(); + + // Check completion. + let comp = comp_rx.recv().unwrap(); + assert_eq!(comp.head_index, 42); + assert_eq!(comp.bytes_written, 513); // 512 data + 1 status + + // Verify data written to guest memory. + let data = mem.read_bytes(0x2000, 512); + assert!(data.iter().all(|&b| b == 0x02)); + + // Verify status byte. + let status = mem.read_bytes(0x3000, 1); + assert_eq!(status[0], VIRTIO_BLK_S_OK); + } + + #[test] + fn test_worker_write_request() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let disk = MemDisk::new(2048); + let mem = Arc::new(MockMem::new(0x10000)); + + // Write data to guest memory that the worker will read. + mem.write_bytes(0x2000, &vec![0xAB; 512]); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let handle = worker.run("test-blk-write"); + + req_tx + .send(BlockRequest { + head_index: 7, + req_type: RequestType::Write, + sector: 1, + data_buffers: vec![BufferDesc { + addr: 0x2000, + len: 512, + is_write: false, // Device-readable for writes. + }], + status_addr: 0x3000, + }) + .unwrap(); + + drop(req_tx); + handle.join().unwrap(); + + let comp = comp_rx.recv().unwrap(); + assert_eq!(comp.head_index, 7); + assert_eq!(comp.bytes_written, 1); // Only status byte is writable. + + let status = mem.read_bytes(0x3000, 1); + assert_eq!(status[0], VIRTIO_BLK_S_OK); + } + + #[test] + fn test_worker_flush_request() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let disk = MemDisk::new(1024); + let mem = Arc::new(MockMem::new(0x10000)); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let handle = worker.run("test-blk-flush"); + + req_tx + .send(BlockRequest { + head_index: 3, + req_type: RequestType::Flush, + sector: 0, + data_buffers: vec![], + status_addr: 0x3000, + }) + .unwrap(); + + drop(req_tx); + handle.join().unwrap(); + + let comp = comp_rx.recv().unwrap(); + assert_eq!(comp.head_index, 3); + assert_eq!(comp.bytes_written, 0); + + let status = mem.read_bytes(0x3000, 1); + assert_eq!(status[0], VIRTIO_BLK_S_OK); + } + + #[test] + fn test_worker_unsupported_request() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let disk = MemDisk::new(1024); + let mem = Arc::new(MockMem::new(0x10000)); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let handle = worker.run("test-blk-unsupp"); + + req_tx + .send(BlockRequest { + head_index: 5, + req_type: RequestType::Unsupported, + sector: 0, + data_buffers: vec![], + status_addr: 0x3000, + }) + .unwrap(); + + drop(req_tx); + handle.join().unwrap(); + + let comp = comp_rx.recv().unwrap(); + assert_eq!(comp.head_index, 5); + assert_eq!(comp.bytes_written, 0); + + let status = mem.read_bytes(0x3000, 1); + assert_eq!(status[0], VIRTIO_BLK_S_UNSUPP); + } + + #[test] + fn test_worker_multiple_requests() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let disk = MemDisk::with_pattern(8); + let mem = Arc::new(MockMem::new(0x10000)); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let handle = worker.run("test-blk-multi"); + + // Send 3 read requests for sectors 0, 1, 2. + for i in 0..3u16 { + req_tx + .send(BlockRequest { + head_index: i, + req_type: RequestType::Read, + sector: i as u64, + data_buffers: vec![BufferDesc { + addr: 0x2000 + (i as u64) * 0x1000, + len: 512, + is_write: true, + }], + status_addr: 0x8000 + i as u64, + }) + .unwrap(); + } + + drop(req_tx); + handle.join().unwrap(); + + // All 3 completions should arrive. + let mut completions: Vec = Vec::new(); + while let Ok(c) = comp_rx.try_recv() { + completions.push(c); + } + assert_eq!(completions.len(), 3); + + // Verify each sector's data. + for i in 0..3u16 { + let data = mem.read_bytes(0x2000 + (i as u64) * 0x1000, 512); + assert!( + data.iter().all(|&b| b == i as u8), + "sector {} data mismatch", + i + ); + } + } + + #[test] + fn test_worker_read_only_rejects_write() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let disk = MemDisk::new(1024); + let mem = Arc::new(MockMem::new(0x10000)); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), true); + let handle = worker.run("test-blk-ro"); + + req_tx + .send(BlockRequest { + head_index: 1, + req_type: RequestType::Write, + sector: 0, + data_buffers: vec![BufferDesc { + addr: 0x2000, + len: 512, + is_write: false, + }], + status_addr: 0x3000, + }) + .unwrap(); + + drop(req_tx); + handle.join().unwrap(); + + let comp = comp_rx.recv().unwrap(); + assert_eq!(comp.bytes_written, 0); + + let status = mem.read_bytes(0x3000, 1); + assert_eq!(status[0], VIRTIO_BLK_S_IOERR); + } + + #[test] + fn test_worker_graceful_shutdown_on_channel_close() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, _comp_rx) = mpsc::channel(); + + let disk = MemDisk::new(1024); + let mem = Arc::new(MockMem::new(0x1000)); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem, false); + let handle = worker.run("test-blk-shutdown"); + + // Drop the sender — worker should exit gracefully. + drop(req_tx); + handle.join().unwrap(); // Should not hang or panic. + } +} diff --git a/src/vmm/src/windows/devices/virtio/disk.rs b/src/vmm/src/windows/devices/virtio/disk.rs index f98f3d4eb..87586e142 100644 --- a/src/vmm/src/windows/devices/virtio/disk.rs +++ b/src/vmm/src/windows/devices/virtio/disk.rs @@ -17,7 +17,7 @@ pub const DISK_FORMAT_QCOW2: u32 = 1; /// /// Backends translate guest sector reads/writes to the underlying /// storage format (raw file, qcow2 image, etc.). -pub trait DiskBackend { +pub trait DiskBackend: Send { /// Read `buf.len()` bytes starting at `offset` into `buf`. fn read_at(&mut self, offset: u64, buf: &mut [u8]) -> Result<()>; diff --git a/src/vmm/src/windows/devices/virtio/mmio.rs b/src/vmm/src/windows/devices/virtio/mmio.rs index 2591d2bdf..d30eaee01 100644 --- a/src/vmm/src/windows/devices/virtio/mmio.rs +++ b/src/vmm/src/windows/devices/virtio/mmio.rs @@ -98,6 +98,19 @@ pub trait VirtioDeviceBackend { false } + /// Drain async I/O completions from a worker thread. + /// + /// Called from the vCPU run loop for devices with async backends + /// (e.g., virtio-blk with a worker thread). Returns `true` if + /// completions were processed and an interrupt should be raised. + /// Default: no async completions. + fn drain_completions( + &mut self, + _queues: &mut [Virtqueue], + _mem: &dyn GuestMemoryAccessor, + ) -> bool { + false + } } /// Virtio-MMIO device wrapping a backend. @@ -294,6 +307,17 @@ impl VirtioMmioDevice { raised } + /// Drain async I/O completions from the backend's worker thread. + /// + /// Returns `true` if completions were processed (interrupt should be raised). + pub fn poll_backend(&mut self, mem: &dyn GuestMemoryAccessor) -> bool { + let raised = self.backend.drain_completions(&mut self.queues, mem); + if raised { + self.interrupt_status |= INTERRUPT_USED_RING; + } + raised + } + fn current_queue(&self) -> Option<&Virtqueue> { self.queues.get(self.queue_sel as usize) } diff --git a/src/vmm/src/windows/devices/virtio/mod.rs b/src/vmm/src/windows/devices/virtio/mod.rs index b3be92879..89c4a9e25 100644 --- a/src/vmm/src/windows/devices/virtio/mod.rs +++ b/src/vmm/src/windows/devices/virtio/mod.rs @@ -8,6 +8,7 @@ //! - virtio-net: network device (userspace proxy via passt/gvproxy) pub mod block; +pub mod block_worker; pub mod disk; pub mod mmio; pub mod net; diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 47e6383f7..8f28cab92 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -25,15 +25,16 @@ mod imp { use super::super::vcpu::VcpuRunConfig; use super::super::whpx::{VcpuCanceller, WhpxPartition, WhpxVcpu}; - /// Adapter to implement GuestMemoryAccessor for GuestMemory. - struct GuestMemoryAdapter<'a>(&'a GuestMemory); - - impl GuestMemoryAccessor for GuestMemoryAdapter<'_> { + /// Implement GuestMemoryAccessor directly on GuestMemory. + /// + /// This allows `Arc` to be passed to block worker threads + /// (GuestMemory is Send+Sync since its regions are Send+Sync). + impl GuestMemoryAccessor for GuestMemory { fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { - self.0.read_at_addr(addr, buf) + self.read_at_addr(addr, buf) } fn write_at(&self, addr: u64, data: &[u8]) -> Result<()> { - self.0.write_at_addr(addr, data) + self.write_at_addr(addr, data) } } @@ -166,14 +167,20 @@ mod imp { )); } - // Create partition (no APIC emulation — avoids crash on some Win10 hardware). + // Create partition. let partition = WhpxPartition::new()?; partition.set_processor_count(ctx.num_vcpus as u32)?; partition.set_extended_vm_exits(true, true)?; + + // NOTE: Do NOT enable APIC emulation here. On Win10 MBP 2014, + // set_local_apic_emulation(true) returns success but then the APIC + // doesn't function — no interrupts get delivered and the kernel hangs + // before producing any console output. Software PIC is required. + partition.setup()?; - // Allocate and map guest memory. - let guest_mem = GuestMemory::new(ctx.ram_mib)?; + // Allocate and map guest memory (Arc for sharing with block worker threads). + let guest_mem = Arc::new(GuestMemory::new(ctx.ram_mib)?); guest_mem.map_to_partition(&partition)?; // Create devices from context. @@ -182,6 +189,13 @@ mod imp { devices::store_console_buffer(ctx_id, setup.console_buffer); let mut devices = setup.devices; + // NOTE: Async block workers are NOT started on Windows/WHPX. + // The worker thread writes to guest memory from a non-vCPU thread, + // which conflicts with WHPX's memory tracking and causes ~60% boot + // failure rate on Win10. Sync disk I/O is reliable (100% pass rate) + // and sufficient for typical workloads. + // devices.start_blk_workers(guest_mem.clone()); + // Build kernel command line. let cmdline = build_kernel_cmdline( ctx.kernel_cmdline.as_deref(), @@ -234,10 +248,12 @@ mod imp { } }); - // vCPU run loop. - let mem_adapter = GuestMemoryAdapter(&guest_mem); + // vCPU run loop — GuestMemory implements GuestMemoryAccessor directly. + let mem_ref: &GuestMemory = &guest_mem; let mut exit_count: u64 = 0; let mut halt_count: u64 = 0; + let mut total_halt_exits: u64 = 0; + let mut halt_with_irq: u64 = 0; let start_time = Instant::now(); let mut last_progress = Instant::now(); let mut mmio_count: u64 = 0; @@ -246,7 +262,7 @@ mod imp { loop { // Tick PIT and poll devices. - devices.tick_and_poll(&mem_adapter); + devices.tick_and_poll(mem_ref); // Try to inject pending interrupt. if devices.pic.has_pending() { @@ -319,7 +335,7 @@ mod imp { } => { halt_count = 0; mmio_count += 1; - devices.handle_mmio_write(address, size, data, &mem_adapter); + devices.handle_mmio_write(address, size, data, mem_ref); vcpu.skip_instruction()?; } VcpuExit::InterruptWindow => { @@ -327,6 +343,7 @@ mod imp { devices.set_window_requested(false); } VcpuExit::Halt => { + total_halt_exits += 1; if !run_config.should_run() { log::info!("VM stop requested, exiting on Halt"); last_exit_reason = "HALT_STOP_REQUESTED"; @@ -334,35 +351,58 @@ mod imp { break; } - // Active HLT wake: poll devices before sleeping. - // If an interrupt arrived (e.g. vsock data) while the guest - // was halted, clear the HLT suspend state so the vCPU resumes - // immediately. This prevents lost wakeups where the guest - // sleeps through pending interrupts. Matches QEMU's WHPX - // HLT handling strategy. - devices.tick_and_poll(&mem_adapter); + // Poll devices before sleeping — a pending interrupt may + // have arrived (e.g. PIT tick, vsock data) while the guest + // was halted. + devices.tick_and_poll(mem_ref); if devices.pic.has_pending() { - log::debug!( - "HLT with pending interrupt, clearing halt (exits={})", - exit_count - ); - if let Err(e) = vcpu.clear_halt() { - log::warn!("clear_halt failed: {:?}", e); - // Fall through to sleep (graceful degradation). - } else { - halt_count = 0; - continue; + if let Some(vector) = devices.pic.acknowledge() { + vcpu.inject_interrupt(vector)?; + devices.set_window_requested(false); } + halt_with_irq += 1; + halt_count = 0; + continue; } // No pending interrupts — guest is genuinely idle. halt_count += 1; + + // Log diagnostic info every 1000 consecutive halts + if halt_count % 1000 == 0 { + if let Ok(regs) = vcpu.get_registers() { + let console_len = devices::get_console_output(ctx_id) + .map(|b| b.len()) + .unwrap_or(0); + let if_flag = vcpu.interrupts_enabled().unwrap_or(false); + eprintln!( + "[WHPX] HLT stuck: consecutive={} total_halt={} halt_with_irq={} \ + exits={} RIP={:#X} RFLAGS={:#X} IF={} console={}B mmio={}", + halt_count, total_halt_exits, halt_with_irq, + exit_count, regs.rip, regs.rflags, + if_flag, console_len, mmio_count + ); + } + } + if halt_count > MAX_HALTS { + if let Ok(regs) = vcpu.get_registers() { + let console_len = devices::get_console_output(ctx_id) + .map(|b| b.len()) + .unwrap_or(0); + eprintln!( + "[WHPX] HALT_MAX: consecutive={} total_halt={} halt_with_irq={} \ + exits={} RIP={:#X} console={}B mmio={}", + halt_count, total_halt_exits, halt_with_irq, + exit_count, regs.rip, console_len, mmio_count + ); + } log::warn!( - "vCPU halted {} times consecutively after {} exits", - halt_count, - exit_count + "vCPU halted {} times consecutively after {} exits \ + (total_halt={}, halt_with_irq={})", + halt_count, exit_count, + total_halt_exits, halt_with_irq ); last_exit_reason = "HALT_MAX_REACHED"; exit_code = 0; @@ -497,8 +537,9 @@ mod imp { } } - // Stop the timer thread. + // Stop the timer thread and block I/O workers. run_config.request_stop(); + devices.stop_blk_workers(); let _ = timer_thread.join(); log::info!( diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs index 1e1a45d02..f3821f5ed 100644 --- a/src/vmm/src/windows/whpx.rs +++ b/src/vmm/src/windows/whpx.rs @@ -904,6 +904,52 @@ mod imp { check_hresult("WHvSetVirtualProcessorRegisters(inject_interrupt)", hr) } + /// Deliver an interrupt via the partition-level WHvRequestInterrupt API. + /// + /// Unlike [`inject_interrupt`] (which sets WHvRegisterPendingInterruption), + /// this API delivers the interrupt at the partition level and — critically — + /// resets the vCPU's HLT suspend state on platforms where + /// WHvRegisterInternalActivityState is inaccessible (Win10). + /// + /// Uses Fixed delivery, edge-triggered, physical destination mode. + /// Returns Ok(true) if the interrupt was delivered, Ok(false) if the + /// API returned an error (caller should fall back to inject_interrupt). + pub fn request_interrupt(&self, vector: u8) -> Result { + // WHV_INTERRUPT_CONTROL layout (from Hyper-V TLFS / Windows SDK): + // _bitfield (u64): + // bits 0-31: InterruptType (u32) — 0 = Fixed + // bit 32: LevelTriggered — 0 = edge + // bit 33: LogicalDestinationMode — 0 = physical + // bits 34-63: Reserved (0) + // Destination (u32): target vCPU index + // Vector (u32): interrupt vector + let interrupt = WHV_INTERRUPT_CONTROL { + _bitfield: 0, // Fixed=0, edge-triggered=0, physical=0 + Destination: self.index, + Vector: vector as u32, + }; + let hr = unsafe { + WHvRequestInterrupt( + self.partition_handle, + &interrupt, + std::mem::size_of::() as u32, + ) + }; + if hr == 0 { + Ok(true) + } else { + // Log at warn level (not debug) so it's visible at RUST_LOG=info. + // This is a critical diagnostic — if WHvRequestInterrupt fails, + // the vCPU may not wake from HLT on Win10. + log::warn!( + "WHvRequestInterrupt failed: HRESULT=0x{:08X}, vector={}", + hr as u32, + vector + ); + Ok(false) + } + } + /// Check if the guest has interrupts enabled (RFLAGS.IF = 1). pub fn interrupts_enabled(&self) -> Result { let regs = self.get_registers()?; @@ -935,30 +981,6 @@ mod imp { check_hresult("WHvSetVirtualProcessorRegisters(interrupt_window)", hr) } - /// Clear the HLT suspend state, waking the vCPU from halt. - /// - /// Uses `WHvRegisterInternalActivityState` to zero out suspend bits, - /// allowing the vCPU to resume execution (e.g., to process a pending - /// interrupt that arrived while halted). Matches QEMU's WHPX HLT - /// handling strategy. - pub fn clear_halt(&self) -> Result<()> { - // WHvRegisterInternalActivityState = 0x00040004 - // Bits: 0=StartupSuspend, 1=HaltSuspend, 2=IdleSuspend - // Write 0 to clear all suspend states. - let names: [i32; 1] = [0x00040004_i32]; - let values: Vec = vec![reg64(0)]; - let hr = unsafe { - WHvSetVirtualProcessorRegisters( - self.partition_handle, - self.index, - names.as_ptr(), - 1, - values.as_ptr(), - ) - }; - check_hresult("WHvSetVirtualProcessorRegisters(clear_halt)", hr) - } - /// Cancel a running vCPU (causes it to exit with Cancelled). pub fn cancel(&self) -> Result<()> { let hr = unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, self.index, 0) }; From fa43545ffc3a39b02547a0d2c945fdac45b34373 Mon Sep 17 00:00:00 2001 From: lile Date: Wed, 29 Apr 2026 08:28:18 +0800 Subject: [PATCH 19/33] feat(windows): enable async virtio-blk worker + fix interrupt injection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable async virtio-blk I/O on WHPX with two critical fixes: 1. Pending interruption guard: Read WHvRegisterPendingInterruption before injecting new interrupts. Without this, a Cancelled vCPU exit could leave a previous injection undelivered, and the next injection would silently overwrite it — permanently sticking the PIC ISR bit (guest never sends EOI for a lost interrupt). 2. Spurious cascade guard: In Pic::acknowledge(), only call master.acknowledge() when the slave has a real deliverable IRQ. Previously, spurious cascades (slave all-masked) would set ISR bit 2 with no corresponding interrupt delivery. Also: Plan B worker architecture (worker thread never touches guest memory — all guest mem reads/writes on vCPU thread), slave PIC diagnostics, flush bytes_written fix. Win10 validated: 5/5 sync + 5/5 async PASS. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/manager.rs | 38 +- src/vmm/src/windows/devices/pic.rs | 28 +- src/vmm/src/windows/devices/virtio/block.rs | 60 ++- .../windows/devices/virtio/block_worker.rs | 368 +++++++++++------- src/vmm/src/windows/runner.rs | 154 ++++++-- src/vmm/src/windows/whpx.rs | 24 ++ 6 files changed, 465 insertions(+), 207 deletions(-) diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 52f98a6d8..09436e7a2 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -215,6 +215,11 @@ pub struct DeviceManager { /// Second virtio-blk device (slot 4) — optional, for guest rootfs. virtio_blk2: Option>, + /// Diagnostic: count QUEUE_NOTIFY writes to blk devices. + blk_queue_notify_count: u64, + /// Diagnostic: count block I/O completions drained. + blk_completion_count: u64, + /// Track whether we've requested an interrupt window. window_requested: bool, /// Last PIT tick timestamp. @@ -364,6 +369,8 @@ impl DeviceManager { virtio_9p, virtio_net, virtio_blk2, + blk_queue_notify_count: 0, + blk_completion_count: 0, window_requested: false, last_tick: Instant::now(), port61_toggle: false, @@ -515,6 +522,9 @@ impl DeviceManager { let blk2_offset = address.wrapping_sub(mmio_base_for_slot(4)); if blk_offset < MMIO_SLOT_SIZE { + if blk_offset == 0x050 { + self.blk_queue_notify_count += 1; + } if let Some(ref mut dev) = self.virtio_blk { if dev.write(blk_offset, data as u32, size, mem) { self.pic.raise_irq(irq_for_slot(0)); @@ -540,6 +550,9 @@ impl DeviceManager { } } } else if blk2_offset < MMIO_SLOT_SIZE { + if blk2_offset == 0x050 { + self.blk_queue_notify_count += 1; + } if let Some(ref mut dev) = self.virtio_blk2 { if dev.write(blk2_offset, data as u32, size, mem) { self.pic.raise_irq(irq_for_slot(4)); @@ -548,20 +561,16 @@ impl DeviceManager { } } - /// Start async block I/O workers for virtio-blk devices. + /// Start async block I/O workers for virtio-blk devices (Plan B: WHPX-safe). /// - /// Must be called after the guest memory is set up, before the vCPU loop. - pub fn start_blk_workers( - &mut self, - guest_mem: Arc, - ) { + /// Workers never access guest memory — all guest memory I/O happens + /// on the vCPU thread (in queue_notify and drain_completions). + pub fn start_blk_workers(&mut self) { if let Some(ref mut dev) = self.virtio_blk { - dev.backend_mut() - .start_worker(Arc::clone(&guest_mem), "blk-worker-0"); + dev.backend_mut().start_worker("blk-worker-0"); } if let Some(ref mut dev) = self.virtio_blk2 { - dev.backend_mut() - .start_worker(Arc::clone(&guest_mem), "blk-worker-1"); + dev.backend_mut().start_worker("blk-worker-1"); } } @@ -596,11 +605,13 @@ impl DeviceManager { // Drain async block I/O completions. if let Some(ref mut dev) = self.virtio_blk { if dev.poll_backend(mem) { + self.blk_completion_count += 1; self.pic.raise_irq(irq_for_slot(0)); } } if let Some(ref mut dev) = self.virtio_blk2 { if dev.poll_backend(mem) { + self.blk_completion_count += 1; self.pic.raise_irq(irq_for_slot(4)); } } @@ -653,6 +664,11 @@ impl DeviceManager { } } + /// Return block I/O diagnostic counters: (queue_notify_count, completion_count). + pub fn blk_stats(&self) -> (u64, u64) { + (self.blk_queue_notify_count, self.blk_completion_count) + } + /// Whether an ACPI S5 shutdown was detected. pub fn shutdown_requested(&self) -> bool { self.shutdown_requested @@ -682,6 +698,8 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { virtio_9p: None, virtio_net: None, virtio_blk2: None, + blk_queue_notify_count: 0, + blk_completion_count: 0, window_requested: false, last_tick: Instant::now(), port61_toggle: false, diff --git a/src/vmm/src/windows/devices/pic.rs b/src/vmm/src/windows/devices/pic.rs index b0b6ce0f7..834f2503a 100644 --- a/src/vmm/src/windows/devices/pic.rs +++ b/src/vmm/src/windows/devices/pic.rs @@ -268,12 +268,20 @@ impl Pic { pub fn acknowledge(&mut self) -> Option { if let Some(master_irq) = self.master.pending_irq() { if master_irq == CASCADE_IRQ { - // Cascade: acknowledge slave first. + // Cascade: try to acknowledge slave first. let vector = self.slave.acknowledge(); - // Acknowledge cascade on master. - self.master.acknowledge(); - // If no more slave IRQs, clear cascade. - if self.slave.pending_irq().is_none() { + if vector.is_some() { + // Slave had a real IRQ — acknowledge cascade on master. + self.master.acknowledge(); + // If no more slave IRQs pending, clear cascade IRR. + if self.slave.pending_irq().is_none() { + self.master.clear_irq(CASCADE_IRQ); + } + } else { + // Spurious cascade: slave has no deliverable IRQ. + // Clear cascade IRR without setting ISR — otherwise + // ISR bit 2 would be permanently stuck (guest never + // sends EOI for an interrupt it didn't receive). self.master.clear_irq(CASCADE_IRQ); } vector @@ -295,6 +303,16 @@ impl Pic { ) } + /// Get slave PIC state for diagnostics: (IRR, ISR, IMR, vector_base). + pub fn slave_state(&self) -> (u8, u8, u8, u8) { + ( + self.slave.irr, + self.slave.isr, + self.slave.imr, + self.slave.vector_base, + ) + } + /// Check if the given I/O port belongs to either PIC. pub fn handles_port(&self, port: u16) -> bool { matches!( diff --git a/src/vmm/src/windows/devices/virtio/block.rs b/src/vmm/src/windows/devices/virtio/block.rs index 16d70604b..ad649cf71 100644 --- a/src/vmm/src/windows/devices/virtio/block.rs +++ b/src/vmm/src/windows/devices/virtio/block.rs @@ -8,7 +8,6 @@ //! a worker, requests are processed synchronously (fallback mode). use std::sync::mpsc; -use std::sync::Arc; use std::thread; use super::block_worker::{BlockCompletion, BlockRequest, BlockWorker, BufferDesc, RequestType}; @@ -90,15 +89,15 @@ impl VirtioBlock { self.request_tx.is_some() } - /// Start the async block I/O worker thread. + /// Start the async block I/O worker thread (Plan B: WHPX-safe). /// /// Moves the disk backend to the worker. After this call, `queue_notify` /// dispatches requests asynchronously instead of blocking. - pub fn start_worker( - &mut self, - guest_mem: Arc, - name: &str, - ) { + /// + /// The worker thread never accesses guest memory — all guest memory + /// reads/writes happen on the vCPU thread (in queue_notify and + /// drain_completions). + pub fn start_worker(&mut self, name: &str) { let disk = match self.disk.take() { Some(d) => d, None => { @@ -110,14 +109,14 @@ impl VirtioBlock { let (req_tx, req_rx) = mpsc::channel(); let (comp_tx, comp_rx) = mpsc::channel(); - let worker = BlockWorker::new(req_rx, comp_tx, disk, guest_mem, self.read_only); + let worker = BlockWorker::new(req_rx, comp_tx, disk, self.read_only); let handle = worker.run(name); self.request_tx = Some(req_tx); self.completion_rx = Some(comp_rx); self.worker_handle = Some(handle); - log::info!("block worker '{}' started", name); + log::info!("block worker '{}' started (Plan B)", name); } /// Stop the worker thread and reclaim resources. @@ -138,6 +137,9 @@ impl VirtioBlock { /// /// Called from `tick_and_poll()` in the vCPU loop. Returns `true` if /// any completions were processed (interrupt should be raised). + /// + /// **Plan B**: This method writes read data and status bytes to guest + /// memory on the vCPU thread, which is safe for WHPX. pub fn drain_completions( &mut self, queue: &mut Virtqueue, @@ -150,6 +152,21 @@ impl VirtioBlock { let mut drained = false; while let Ok(comp) = rx.try_recv() { + // Write read data to guest memory (scatter to original buffer locations). + if let Some(ref read_data) = comp.read_data { + let mut data_offset: usize = 0; + for target in &comp.read_targets { + let end = data_offset + target.len as usize; + if end <= read_data.len() { + let _ = mem.write_at(target.addr, &read_data[data_offset..end]); + } + data_offset = end; + } + } + + // Write status byte to guest memory. + let _ = mem.write_at(comp.status_addr, &[comp.status]); + let _ = queue.add_used(comp.head_index, comp.bytes_written, mem); drained = true; } @@ -158,6 +175,10 @@ impl VirtioBlock { /// Parse a descriptor chain header and build a BlockRequest. /// + /// For write requests (Plan B), pre-reads data from guest memory + /// into the request's `write_data` field so the worker thread + /// never needs to access guest memory. + /// /// Returns None if the chain is malformed. fn parse_request( chain: &[Descriptor], @@ -214,12 +235,33 @@ impl VirtioBlock { }) .collect(); + // Plan B: For write requests, pre-read data from guest memory + // so the worker thread never needs guest memory access. + let write_data = if req_type == RequestType::Write { + let mut all_data = Vec::new(); + for desc in data_descs { + if !desc.is_write() { + // Device-readable buffer: contains data to write to disk. + let mut buf = vec![0u8; desc.len as usize]; + if mem.read_at(desc.addr, &mut buf).is_err() { + log::debug!("BLK: pre-read write data failed addr=0x{:X}", desc.addr); + return None; + } + all_data.extend_from_slice(&buf); + } + } + Some(all_data) + } else { + None + }; + Some(BlockRequest { head_index, req_type, sector, data_buffers, status_addr: status_desc.addr, + write_data, }) } diff --git a/src/vmm/src/windows/devices/virtio/block_worker.rs b/src/vmm/src/windows/devices/virtio/block_worker.rs index 04021934e..65ef5884a 100644 --- a/src/vmm/src/windows/devices/virtio/block_worker.rs +++ b/src/vmm/src/windows/devices/virtio/block_worker.rs @@ -3,18 +3,20 @@ //! Moves disk I/O off the vCPU loop into a dedicated thread so that //! long-running reads/writes don't starve vsock or net devices. //! -//! The vCPU thread sends `BlockRequest`s (parsed descriptor chains) -//! via an mpsc channel. The worker performs disk I/O, writes data and -//! status bytes to guest memory, and sends `BlockCompletion`s back. -//! The vCPU thread drains completions during `tick_and_poll()` and -//! updates the used ring. +//! **Plan B (WHPX-safe)**: The worker thread NEVER accesses guest memory. +//! - For reads: worker reads disk → Vec, sends Vec in completion. +//! The vCPU thread writes the data to guest memory. +//! - For writes: vCPU thread pre-reads data from guest memory → Vec, +//! sends Vec in request. Worker writes Vec to disk. +//! - Status byte is always written by the vCPU thread. +//! +//! This avoids WHPX memory coherence issues where non-vCPU thread +//! writes to guest memory cause ~60% boot failure on Win10. use std::sync::mpsc; -use std::sync::Arc; use std::thread; use super::disk::DiskBackend; -use super::queue::GuestMemoryAccessor; /// Block size in bytes (standard sector size). const SECTOR_SIZE: u64 = 512; @@ -34,6 +36,8 @@ pub enum RequestType { } /// A single buffer descriptor from the virtqueue chain. +/// +/// Used in completions to tell the vCPU thread where to write read data. #[derive(Debug, Clone)] pub struct BufferDesc { /// Guest physical address. @@ -54,9 +58,14 @@ pub struct BlockRequest { /// Starting sector (for read/write). pub sector: u64, /// Data buffer descriptors (between header and status). + /// For reads: describes where vCPU thread should write the returned data. + /// For writes: only used for metadata (the actual data is in write_data). pub data_buffers: Vec, /// Guest address of the status byte (last descriptor). pub status_addr: u64, + /// Pre-read write data from guest memory (only for Write requests). + /// The vCPU thread reads this from guest memory before sending. + pub write_data: Option>, } /// Completion sent from the worker back to the vCPU thread. @@ -66,31 +75,42 @@ pub struct BlockCompletion { pub head_index: u16, /// Total bytes written to device-writable descriptors (for used ring). pub bytes_written: u32, + /// Virtio-blk status byte (OK/IOERR/UNSUPP). + pub status: u8, + /// Guest address where status byte should be written. + pub status_addr: u64, + /// Data read from disk (only for Read requests). + /// The vCPU thread writes this to guest memory at the addresses + /// specified in read_targets. + pub read_data: Option>, + /// Guest memory targets for read data (addr, len pairs from data_buffers). + /// The vCPU thread iterates these to scatter read_data into guest memory. + pub read_targets: Vec, } /// Worker thread that processes block I/O requests. -pub struct BlockWorker { +/// +/// The worker NEVER accesses guest memory. All guest memory reads/writes +/// are done by the vCPU thread (Plan B for WHPX safety). +pub struct BlockWorker { request_rx: mpsc::Receiver, completion_tx: mpsc::Sender, disk: Box, - guest_mem: Arc, read_only: bool, } -impl BlockWorker { +impl BlockWorker { /// Create a new block worker. pub fn new( request_rx: mpsc::Receiver, completion_tx: mpsc::Sender, disk: Box, - guest_mem: Arc, read_only: bool, ) -> Self { BlockWorker { request_rx, completion_tx, disk, - guest_mem, read_only, } } @@ -106,18 +126,10 @@ impl BlockWorker { /// Blocking recv loop: process requests until the channel closes. fn work(mut self) { - log::info!("block worker started"); + log::info!("block worker started (Plan B: no guest memory access)"); while let Ok(req) = self.request_rx.recv() { - let (status, bytes_written) = self.process_request(&req); - - // Write status byte to guest memory. - let _ = self.guest_mem.write_at(req.status_addr, &[status]); - - let completion = BlockCompletion { - head_index: req.head_index, - bytes_written, - }; + let completion = self.process_request(req); // If the vCPU thread dropped its receiver, the VM is shutting down. if self.completion_tx.send(completion).is_err() { @@ -128,73 +140,162 @@ impl BlockWorker { log::info!("block worker exiting"); } - /// Process a single block request. Returns (status, bytes_written). - fn process_request(&mut self, req: &BlockRequest) -> (u8, u32) { + /// Process a single block request. Returns a completion with data/status. + fn process_request(&mut self, req: BlockRequest) -> BlockCompletion { match req.req_type { - RequestType::Read => self.handle_read(req.sector, &req.data_buffers), - RequestType::Write => self.handle_write(req.sector, &req.data_buffers), - RequestType::Flush => (self.handle_flush(), 0), - RequestType::Unsupported => (VIRTIO_BLK_S_UNSUPP, 0), + RequestType::Read => self.handle_read(req), + RequestType::Write => self.handle_write(req), + RequestType::Flush => self.handle_flush(req), + RequestType::Unsupported => BlockCompletion { + head_index: req.head_index, + bytes_written: 0, + status: VIRTIO_BLK_S_UNSUPP, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }, } } - fn handle_read(&mut self, sector: u64, data_buffers: &[BufferDesc]) -> (u8, u32) { - let mut offset = sector * SECTOR_SIZE; + fn handle_read(&mut self, req: BlockRequest) -> BlockCompletion { + let mut offset = req.sector * SECTOR_SIZE; + let mut all_data = Vec::new(); let mut bytes_written: u32 = 0; - for buf in data_buffers { + for buf in &req.data_buffers { if !buf.is_write { log::debug!("BLK worker READ: buffer not device-writable"); - return (VIRTIO_BLK_S_IOERR, bytes_written); + return BlockCompletion { + head_index: req.head_index, + bytes_written, + status: VIRTIO_BLK_S_IOERR, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }; } let mut data = vec![0u8; buf.len as usize]; if let Err(e) = self.disk.read_at(offset, &mut data) { log::debug!("BLK worker READ: disk.read_at failed: {}", e); - return (VIRTIO_BLK_S_IOERR, bytes_written); - } - if let Err(e) = self.guest_mem.write_at(buf.addr, &data) { - log::debug!("BLK worker READ: mem.write_at failed: {}", e); - return (VIRTIO_BLK_S_IOERR, bytes_written); + return BlockCompletion { + head_index: req.head_index, + bytes_written, + status: VIRTIO_BLK_S_IOERR, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }; } + all_data.extend_from_slice(&data); offset += buf.len as u64; bytes_written += buf.len; } // +1 for the status byte (also device-writable). - (VIRTIO_BLK_S_OK, bytes_written + 1) + BlockCompletion { + head_index: req.head_index, + bytes_written: bytes_written + 1, + status: VIRTIO_BLK_S_OK, + status_addr: req.status_addr, + read_data: Some(all_data), + read_targets: req.data_buffers, + } } - fn handle_write(&mut self, sector: u64, data_buffers: &[BufferDesc]) -> (u8, u32) { + fn handle_write(&mut self, req: BlockRequest) -> BlockCompletion { if self.read_only { - return (VIRTIO_BLK_S_IOERR, 0); + return BlockCompletion { + head_index: req.head_index, + bytes_written: 0, + status: VIRTIO_BLK_S_IOERR, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }; } - let mut offset = sector * SECTOR_SIZE; + let write_data = match req.write_data { + Some(ref data) => data, + None => { + log::debug!("BLK worker WRITE: no write_data provided"); + return BlockCompletion { + head_index: req.head_index, + bytes_written: 0, + status: VIRTIO_BLK_S_IOERR, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }; + } + }; - for buf in data_buffers { + let mut offset = req.sector * SECTOR_SIZE; + let mut data_offset: usize = 0; + + for buf in &req.data_buffers { if buf.is_write { // Data for write must be device-readable (not device-writable). - return (VIRTIO_BLK_S_IOERR, 0); + return BlockCompletion { + head_index: req.head_index, + bytes_written: 0, + status: VIRTIO_BLK_S_IOERR, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }; } - let mut data = vec![0u8; buf.len as usize]; - if self.guest_mem.read_at(buf.addr, &mut data).is_err() { - return (VIRTIO_BLK_S_IOERR, 0); + let end = data_offset + buf.len as usize; + if end > write_data.len() { + log::debug!("BLK worker WRITE: write_data too short"); + return BlockCompletion { + head_index: req.head_index, + bytes_written: 0, + status: VIRTIO_BLK_S_IOERR, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }; } - if self.disk.write_at(offset, &data).is_err() { - return (VIRTIO_BLK_S_IOERR, 0); + if self.disk.write_at(offset, &write_data[data_offset..end]).is_err() { + return BlockCompletion { + head_index: req.head_index, + bytes_written: 0, + status: VIRTIO_BLK_S_IOERR, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + }; } offset += buf.len as u64; + data_offset = end; } // Only status byte is device-writable for writes. - (VIRTIO_BLK_S_OK, 1) + BlockCompletion { + head_index: req.head_index, + bytes_written: 1, + status: VIRTIO_BLK_S_OK, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], + } } - fn handle_flush(&mut self) -> u8 { - if self.disk.flush().is_err() { + fn handle_flush(&mut self, req: BlockRequest) -> BlockCompletion { + let status = if self.disk.flush().is_err() { VIRTIO_BLK_S_IOERR } else { VIRTIO_BLK_S_OK + }; + // bytes_written=1 for the status byte (device-writable), + // matching the sync path in VirtioBlock::queue_notify. + BlockCompletion { + head_index: req.head_index, + bytes_written: 1, + status, + status_addr: req.status_addr, + read_data: None, + read_targets: vec![], } } } @@ -202,7 +303,6 @@ impl BlockWorker { #[cfg(test)] mod tests { use super::*; - use std::cell::RefCell; use std::sync::mpsc; /// In-memory disk backend for testing. @@ -276,65 +376,14 @@ mod tests { } } - /// Thread-safe mock guest memory for testing the worker. - struct MockMem { - data: std::sync::Mutex>, - } - - impl MockMem { - fn new(size: usize) -> Self { - MockMem { - data: std::sync::Mutex::new(vec![0u8; size]), - } - } - - fn write_bytes(&self, addr: u64, bytes: &[u8]) { - let a = addr as usize; - let mut data = self.data.lock().unwrap(); - data[a..a + bytes.len()].copy_from_slice(bytes); - } - - fn read_bytes(&self, addr: u64, len: usize) -> Vec { - let a = addr as usize; - let data = self.data.lock().unwrap(); - data[a..a + len].to_vec() - } - } - - impl GuestMemoryAccessor for MockMem { - fn read_at(&self, addr: u64, buf: &mut [u8]) -> super::super::super::super::error::Result<()> { - let a = addr as usize; - let data = self.data.lock().unwrap(); - if a + buf.len() > data.len() { - return Err(super::super::super::super::error::WkrunError::Memory( - "out of bounds".into(), - )); - } - buf.copy_from_slice(&data[a..a + buf.len()]); - Ok(()) - } - fn write_at(&self, addr: u64, data: &[u8]) -> super::super::super::super::error::Result<()> { - let a = addr as usize; - let mut mem = self.data.lock().unwrap(); - if a + data.len() > mem.len() { - return Err(super::super::super::super::error::WkrunError::Memory( - "out of bounds".into(), - )); - } - mem[a..a + data.len()].copy_from_slice(data); - Ok(()) - } - } - #[test] fn test_worker_read_request() { let (req_tx, req_rx) = mpsc::channel(); let (comp_tx, comp_rx) = mpsc::channel(); let disk = MemDisk::with_pattern(4); - let mem = Arc::new(MockMem::new(0x10000)); - let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), false); let handle = worker.run("test-blk-read"); // Send a read request for sector 2 (pattern = 0x02). @@ -349,6 +398,7 @@ mod tests { is_write: true, }], status_addr: 0x3000, + write_data: None, }) .unwrap(); @@ -360,14 +410,18 @@ mod tests { let comp = comp_rx.recv().unwrap(); assert_eq!(comp.head_index, 42); assert_eq!(comp.bytes_written, 513); // 512 data + 1 status - - // Verify data written to guest memory. - let data = mem.read_bytes(0x2000, 512); - assert!(data.iter().all(|&b| b == 0x02)); - - // Verify status byte. - let status = mem.read_bytes(0x3000, 1); - assert_eq!(status[0], VIRTIO_BLK_S_OK); + assert_eq!(comp.status, VIRTIO_BLK_S_OK); + assert_eq!(comp.status_addr, 0x3000); + + // Verify read data is returned in completion (not written to guest mem). + let read_data = comp.read_data.unwrap(); + assert_eq!(read_data.len(), 512); + assert!(read_data.iter().all(|&b| b == 0x02)); + + // Verify read targets match the original buffers. + assert_eq!(comp.read_targets.len(), 1); + assert_eq!(comp.read_targets[0].addr, 0x2000); + assert_eq!(comp.read_targets[0].len, 512); } #[test] @@ -376,12 +430,11 @@ mod tests { let (comp_tx, comp_rx) = mpsc::channel(); let disk = MemDisk::new(2048); - let mem = Arc::new(MockMem::new(0x10000)); - // Write data to guest memory that the worker will read. - mem.write_bytes(0x2000, &vec![0xAB; 512]); + // Write data is pre-read from guest memory by the vCPU thread. + let write_data = vec![0xAB; 512]; - let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), false); let handle = worker.run("test-blk-write"); req_tx @@ -395,6 +448,7 @@ mod tests { is_write: false, // Device-readable for writes. }], status_addr: 0x3000, + write_data: Some(write_data), }) .unwrap(); @@ -404,9 +458,8 @@ mod tests { let comp = comp_rx.recv().unwrap(); assert_eq!(comp.head_index, 7); assert_eq!(comp.bytes_written, 1); // Only status byte is writable. - - let status = mem.read_bytes(0x3000, 1); - assert_eq!(status[0], VIRTIO_BLK_S_OK); + assert_eq!(comp.status, VIRTIO_BLK_S_OK); + assert!(comp.read_data.is_none()); } #[test] @@ -415,9 +468,8 @@ mod tests { let (comp_tx, comp_rx) = mpsc::channel(); let disk = MemDisk::new(1024); - let mem = Arc::new(MockMem::new(0x10000)); - let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), false); let handle = worker.run("test-blk-flush"); req_tx @@ -427,6 +479,7 @@ mod tests { sector: 0, data_buffers: vec![], status_addr: 0x3000, + write_data: None, }) .unwrap(); @@ -435,10 +488,8 @@ mod tests { let comp = comp_rx.recv().unwrap(); assert_eq!(comp.head_index, 3); - assert_eq!(comp.bytes_written, 0); - - let status = mem.read_bytes(0x3000, 1); - assert_eq!(status[0], VIRTIO_BLK_S_OK); + assert_eq!(comp.bytes_written, 1); // status byte + assert_eq!(comp.status, VIRTIO_BLK_S_OK); } #[test] @@ -447,9 +498,8 @@ mod tests { let (comp_tx, comp_rx) = mpsc::channel(); let disk = MemDisk::new(1024); - let mem = Arc::new(MockMem::new(0x10000)); - let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), false); let handle = worker.run("test-blk-unsupp"); req_tx @@ -459,6 +509,7 @@ mod tests { sector: 0, data_buffers: vec![], status_addr: 0x3000, + write_data: None, }) .unwrap(); @@ -468,9 +519,7 @@ mod tests { let comp = comp_rx.recv().unwrap(); assert_eq!(comp.head_index, 5); assert_eq!(comp.bytes_written, 0); - - let status = mem.read_bytes(0x3000, 1); - assert_eq!(status[0], VIRTIO_BLK_S_UNSUPP); + assert_eq!(comp.status, VIRTIO_BLK_S_UNSUPP); } #[test] @@ -479,9 +528,8 @@ mod tests { let (comp_tx, comp_rx) = mpsc::channel(); let disk = MemDisk::with_pattern(8); - let mem = Arc::new(MockMem::new(0x10000)); - let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), false); + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), false); let handle = worker.run("test-blk-multi"); // Send 3 read requests for sectors 0, 1, 2. @@ -497,6 +545,7 @@ mod tests { is_write: true, }], status_addr: 0x8000 + i as u64, + write_data: None, }) .unwrap(); } @@ -511,13 +560,14 @@ mod tests { } assert_eq!(completions.len(), 3); - // Verify each sector's data. - for i in 0..3u16 { - let data = mem.read_bytes(0x2000 + (i as u64) * 0x1000, 512); + // Verify each sector's data is in the completion. + for (idx, comp) in completions.iter().enumerate() { + let data = comp.read_data.as_ref().unwrap(); + assert_eq!(data.len(), 512); assert!( - data.iter().all(|&b| b == i as u8), + data.iter().all(|&b| b == idx as u8), "sector {} data mismatch", - i + idx ); } } @@ -528,9 +578,8 @@ mod tests { let (comp_tx, comp_rx) = mpsc::channel(); let disk = MemDisk::new(1024); - let mem = Arc::new(MockMem::new(0x10000)); - let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem.clone(), true); + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), true); let handle = worker.run("test-blk-ro"); req_tx @@ -544,6 +593,7 @@ mod tests { is_write: false, }], status_addr: 0x3000, + write_data: Some(vec![0xAB; 512]), }) .unwrap(); @@ -552,9 +602,7 @@ mod tests { let comp = comp_rx.recv().unwrap(); assert_eq!(comp.bytes_written, 0); - - let status = mem.read_bytes(0x3000, 1); - assert_eq!(status[0], VIRTIO_BLK_S_IOERR); + assert_eq!(comp.status, VIRTIO_BLK_S_IOERR); } #[test] @@ -563,13 +611,45 @@ mod tests { let (comp_tx, _comp_rx) = mpsc::channel(); let disk = MemDisk::new(1024); - let mem = Arc::new(MockMem::new(0x1000)); - let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), mem, false); + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), false); let handle = worker.run("test-blk-shutdown"); // Drop the sender — worker should exit gracefully. drop(req_tx); handle.join().unwrap(); // Should not hang or panic. } + + #[test] + fn test_worker_write_missing_data_returns_error() { + let (req_tx, req_rx) = mpsc::channel(); + let (comp_tx, comp_rx) = mpsc::channel(); + + let disk = MemDisk::new(2048); + + let worker = BlockWorker::new(req_rx, comp_tx, Box::new(disk), false); + let handle = worker.run("test-blk-write-nodata"); + + // Write request without write_data should fail. + req_tx + .send(BlockRequest { + head_index: 1, + req_type: RequestType::Write, + sector: 0, + data_buffers: vec![BufferDesc { + addr: 0x2000, + len: 512, + is_write: false, + }], + status_addr: 0x3000, + write_data: None, // Missing! + }) + .unwrap(); + + drop(req_tx); + handle.join().unwrap(); + + let comp = comp_rx.recv().unwrap(); + assert_eq!(comp.status, VIRTIO_BLK_S_IOERR); + } } diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 8f28cab92..c00d6ca96 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -10,6 +10,7 @@ #[cfg(target_os = "windows")] mod imp { use std::collections::HashMap; + use std::io::Write as IoWrite; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; @@ -27,8 +28,8 @@ mod imp { /// Implement GuestMemoryAccessor directly on GuestMemory. /// - /// This allows `Arc` to be passed to block worker threads - /// (GuestMemory is Send+Sync since its regions are Send+Sync). + /// This allows GuestMemory to be used via the GuestMemoryAccessor trait + /// in device handling code (virtio queues, block I/O, etc.). impl GuestMemoryAccessor for GuestMemory { fn read_at(&self, addr: u64, buf: &mut [u8]) -> Result<()> { self.read_at_addr(addr, buf) @@ -137,6 +138,27 @@ mod imp { run_config: VcpuRunConfig, canceller_slot: Arc>>, ) -> Result { + // Open a diagnostic log file for debugging boot failures. + // Written to a fixed path that persists across box lifecycle. + let mut diag_log: Option = None; + if let Ok(f) = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(r"C:\ws-boxlite\whpx-diag.log") + { + diag_log = Some(f); + } + + macro_rules! diag { + ($($arg:tt)*) => { + if let Some(ref mut f) = diag_log { + let _ = writeln!(f, $($arg)*); + let _ = f.flush(); + } + }; + } + diag!("\n=== VM START ctx_id={} ===", ctx.id); + // Validate required fields. let kernel_path = ctx .kernel_path @@ -179,7 +201,7 @@ mod imp { partition.setup()?; - // Allocate and map guest memory (Arc for sharing with block worker threads). + // Allocate and map guest memory. let guest_mem = Arc::new(GuestMemory::new(ctx.ram_mib)?); guest_mem.map_to_partition(&partition)?; @@ -189,12 +211,11 @@ mod imp { devices::store_console_buffer(ctx_id, setup.console_buffer); let mut devices = setup.devices; - // NOTE: Async block workers are NOT started on Windows/WHPX. - // The worker thread writes to guest memory from a non-vCPU thread, - // which conflicts with WHPX's memory tracking and causes ~60% boot - // failure rate on Win10. Sync disk I/O is reliable (100% pass rate) - // and sufficient for typical workloads. - // devices.start_blk_workers(guest_mem.clone()); + // NOTE: Block I/O workers are started lazily (deferred start) inside + // the vCPU loop, on the first MMIO write. Starting them here (before + // the vCPU runs) causes ~80% boot failure on WHPX — the worker + // thread creation appears to interfere with WHPX partition state + // during early boot. // Build kernel command line. let cmdline = build_kernel_cmdline( @@ -227,6 +248,7 @@ mod imp { ctx.ram_mib as u64 * 1024 * 1024 > crate::windows::memory::VIRTIO_MMIO_BASE, cmdline.len() ); + diag!("Kernel loaded, RIP={:#X}, ram={}MB", regs.rip, ctx.ram_mib); // Create vCPU and set registers. let vcpu = WhpxVcpu::new(&partition, 0)?; @@ -257,6 +279,12 @@ mod imp { let start_time = Instant::now(); let mut last_progress = Instant::now(); let mut mmio_count: u64 = 0; + let mut blk_workers_started = false; + let sync_block = std::env::var("BOXLITE_SYNC_BLOCK").is_ok(); + let mut serial_out_count: u64 = 0; + let mut io_out_count: u64 = 0; + let mut io_in_count: u64 = 0; + let mut inject_count: u64 = 0; let mut last_exit_reason = "none"; let exit_code; @@ -265,23 +293,35 @@ mod imp { devices.tick_and_poll(mem_ref); // Try to inject pending interrupt. + // CRITICAL: Do NOT acknowledge a new PIC interrupt if a previous + // injection is still pending in WHPX. Overwriting the pending + // interruption register would lose the old interrupt and leave + // its PIC ISR bit permanently stuck (guest never sends EOI). if devices.pic.has_pending() { - match vcpu.interrupts_enabled() { - Ok(true) => { - if let Some(vector) = devices.pic.acknowledge() { - log::debug!("Injecting interrupt vector {:#X}", vector); - vcpu.inject_interrupt(vector)?; - devices.set_window_requested(false); + let already_pending = vcpu + .has_pending_interruption() + .unwrap_or(false); + if already_pending { + // Previous interrupt not yet delivered — skip this cycle. + } else { + match vcpu.interrupts_enabled() { + Ok(true) => { + if let Some(vector) = devices.pic.acknowledge() { + log::debug!("Injecting interrupt vector {:#X}", vector); + vcpu.inject_interrupt(vector)?; + devices.set_window_requested(false); + inject_count += 1; + } } - } - Ok(false) => { - if !devices.window_requested() { - vcpu.request_interrupt_window()?; - devices.set_window_requested(true); + Ok(false) => { + if !devices.window_requested() { + vcpu.request_interrupt_window()?; + devices.set_window_requested(true); + } + } + Err(ref e) => { + log::warn!("interrupts_enabled() error: {:?}", e); } - } - Err(ref e) => { - log::warn!("interrupts_enabled() error: {:?}", e); } } } @@ -308,6 +348,10 @@ mod imp { match exit { VcpuExit::IoOut { port, size, data } => { halt_count = 0; + io_out_count += 1; + if port == 0x3F8 { + serial_out_count += 1; + } devices.handle_io_out(port, size, data); if devices.shutdown_requested() { log::info!("ACPI shutdown detected after {} exits", exit_count); @@ -319,6 +363,7 @@ mod imp { } VcpuExit::IoIn { port, size } => { halt_count = 0; + io_in_count += 1; let data = devices.handle_io_in(port, size); vcpu.complete_io_in(data, size)?; } @@ -335,6 +380,19 @@ mod imp { } => { halt_count = 0; mmio_count += 1; + // Deferred start: spawn block I/O workers on first MMIO + // write (after vCPU is running). If BOXLITE_SYNC_BLOCK is + // set, skip workers entirely (sync disk I/O for A/B testing). + if !blk_workers_started && !sync_block { + devices.start_blk_workers(); + blk_workers_started = true; + let msg = format!( + "Block workers started at exit={} mmio={} elapsed={:.1}ms", + exit_count, mmio_count, start_time.elapsed().as_secs_f64() * 1000.0 + ); + eprintln!("[WHPX] {}", msg); + diag!("{}", msg); + } devices.handle_mmio_write(address, size, data, mem_ref); vcpu.skip_instruction()?; } @@ -357,9 +415,15 @@ mod imp { devices.tick_and_poll(mem_ref); if devices.pic.has_pending() { - if let Some(vector) = devices.pic.acknowledge() { - vcpu.inject_interrupt(vector)?; - devices.set_window_requested(false); + let already_pending = vcpu + .has_pending_interruption() + .unwrap_or(false); + if !already_pending { + if let Some(vector) = devices.pic.acknowledge() { + vcpu.inject_interrupt(vector)?; + devices.set_window_requested(false); + inject_count += 1; + } } halt_with_irq += 1; halt_count = 0; @@ -424,21 +488,31 @@ mod imp { exit_code = 0; break; } - // Wall-clock progress report every 5 seconds. - if last_progress.elapsed() >= Duration::from_secs(5) { + // Wall-clock progress report every 2 seconds. + if last_progress.elapsed() >= Duration::from_secs(2) { last_progress = Instant::now(); if let Ok(regs) = vcpu.get_registers() { let console_len = devices::get_console_output(ctx_id) .map(|b| b.len()) .unwrap_or(0); - log::info!( - "Progress @ {:.1}s: exits={} RIP={:#X} console={}B mmio={}", + let (qn, bc) = devices.blk_stats(); + let (irr, isr, imr, vbase) = devices.pic.master_state(); + let (s_irr, s_isr, s_imr, s_vbase) = devices.pic.slave_state(); + let msg = format!( + "Progress @ {:.1}s: exits={} RIP={:#X} console={}B io_out={} serial={} mmio={} blk_qn={} blk_comp={} halt={}/{} halt_w_irq={} inj={} pic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} spic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} mode={}", start_time.elapsed().as_secs_f64(), - exit_count, - regs.rip, - console_len, - mmio_count, + exit_count, regs.rip, console_len, + io_out_count, serial_out_count, + mmio_count, qn, bc, + halt_count, total_halt_exits, + halt_with_irq, inject_count, + irr, isr, imr, vbase, + s_irr, s_isr, s_imr, s_vbase, + if sync_block { "sync" } else if blk_workers_started { "async" } else { "pending" }, ); + log::info!("{}", msg); + eprintln!("[WHPX] {}", msg); + diag!("{}", msg); } } } @@ -548,13 +622,15 @@ mod imp { exit_count, last_exit_reason ); - eprintln!( - "[WHPX] VM exited, code={} exits={} reason={} elapsed={:.1}s", - exit_code, - exit_count, - last_exit_reason, + let exit_msg = format!( + "VM exited, code={} exits={} reason={} io_out={} serial={} io_in={} mmio={} halt={}/{} elapsed={:.1}s", + exit_code, exit_count, last_exit_reason, + io_out_count, serial_out_count, io_in_count, + mmio_count, total_halt_exits, halt_with_irq, start_time.elapsed().as_secs_f64(), ); + eprintln!("[WHPX] {}", exit_msg); + diag!("{}", exit_msg); Ok(exit_code) } diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs index f3821f5ed..1bafde4f5 100644 --- a/src/vmm/src/windows/whpx.rs +++ b/src/vmm/src/windows/whpx.rs @@ -956,6 +956,30 @@ mod imp { Ok(regs.rflags & (1 << 9) != 0) } + /// Check if there is a pending interruption that hasn't been delivered yet. + /// + /// Returns `true` if `WHvRegisterPendingInterruption` bit 0 + /// (InterruptionPending) is set. A new injection must NOT be + /// attempted while a previous one is still pending — doing so + /// would overwrite the old interrupt, leaving its PIC ISR bit + /// permanently stuck. + pub fn has_pending_interruption(&self) -> Result { + let names = [WHvRegisterPendingInterruption]; + let mut values: Vec = vec![zeroed_reg_value(); 1]; + let hr = unsafe { + WHvGetVirtualProcessorRegisters( + self.partition_handle, + self.index, + names.as_ptr(), + 1, + values.as_mut_ptr(), + ) + }; + check_hresult("WHvGetVirtualProcessorRegisters(pending_interruption)", hr)?; + let pending = unsafe { values[0].Reg64 }; + Ok(pending & 1 != 0) + } + /// Request an interrupt window exit. /// /// The next `run()` call will exit with [`VcpuExit::InterruptWindow`] From 23be56a3692c96e03929a29fd1b6cb80280c8899 Mon Sep 17 00:00:00 2001 From: lile Date: Wed, 29 Apr 2026 18:30:49 +0800 Subject: [PATCH 20/33] feat(windows): add userspace IOAPIC + LAPIC interrupt architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace PIC-only interrupt routing with a full IOAPIC + LAPIC + IrqChip architecture. This enables level-triggered interrupts, 24 IRQ pins, and lays the groundwork for multi-vCPU support (Iter 3). New files: - ioapic.rs: 24-pin I/O APIC with redirection table and MMIO interface - lapic.rs: Local APIC with IRR/ISR priority logic and timer support - irq_chip.rs: Coordinator wiring PIC + IOAPIC + LAPIC with auto PIC→APIC mode transition Key changes: - MADT ACPI table added for Linux kernel APIC discovery - Remove noapic/nolapic from kernel cmdline to enable APIC mode - Manager routes MMIO reads/writes to IOAPIC (0xFEC00000) and LAPIC (0xFEE00000) address ranges - Runner uses IrqChip for interrupt injection with LAPIC priority checks - IRQ 0→GSI 2 remapping per standard x86 convention (MADT ISO entry) - PIC retained for legacy boot compatibility; IrqChip auto-switches when guest enables LAPIC SVR + IOAPIC has unmasked entries Verified: Win10 vm-bench 8/8 PASS, Win11 vm-bench 8/8 PASS, net-test 8/8 PASS on both machines. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/boot/acpi.rs | 229 ++++++++- src/vmm/src/windows/cmdline.rs | 36 +- src/vmm/src/windows/devices/ioapic.rs | 494 +++++++++++++++++++ src/vmm/src/windows/devices/irq_chip.rs | 396 +++++++++++++++ src/vmm/src/windows/devices/lapic.rs | 624 ++++++++++++++++++++++++ src/vmm/src/windows/devices/manager.rs | 80 ++- src/vmm/src/windows/devices/mod.rs | 3 + src/vmm/src/windows/memory.rs | 12 + src/vmm/src/windows/runner.rs | 28 +- 9 files changed, 1855 insertions(+), 47 deletions(-) create mode 100644 src/vmm/src/windows/devices/ioapic.rs create mode 100644 src/vmm/src/windows/devices/irq_chip.rs create mode 100644 src/vmm/src/windows/devices/lapic.rs diff --git a/src/vmm/src/windows/boot/acpi.rs b/src/vmm/src/windows/boot/acpi.rs index cee46ccbd..265de49d8 100644 --- a/src/vmm/src/windows/boot/acpi.rs +++ b/src/vmm/src/windows/boot/acpi.rs @@ -1,24 +1,35 @@ //! Minimal ACPI table generation for WHPX guest boot. //! -//! Generates RSDP, RSDT, FADT, and DSDT tables so the Linux kernel can -//! discover the PM1a_CNT register and perform clean ACPI S5 shutdown -//! instead of falling back to an HLT loop. +//! Generates RSDP, RSDT, FADT, DSDT, and MADT tables so the Linux kernel can: +//! - Discover the PM1a_CNT register for clean ACPI S5 shutdown +//! - Discover the IOAPIC and LAPIC for APIC-mode interrupt routing /// Total size of the ACPI region in guest memory. -pub const ACPI_REGION_SIZE: u64 = 0x200; // 512 bytes +pub const ACPI_REGION_SIZE: u64 = 0x400; // 1024 bytes // Table offsets within the ACPI region. const RSDP_OFFSET: usize = 0x00; const RSDT_OFFSET: usize = 0x20; const FADT_OFFSET: usize = 0x60; const DSDT_OFFSET: usize = 0x100; +const MADT_OFFSET: usize = 0x140; // Table sizes. const RSDP_SIZE: usize = 20; -const RSDT_SIZE: usize = 40; // 36-byte header + 4-byte entry +const RSDT_HEADER_SIZE: usize = 36; +const RSDT_ENTRIES: usize = 2; // FADT + MADT +const RSDT_SIZE: usize = RSDT_HEADER_SIZE + RSDT_ENTRIES * 4; // 36 + 8 = 44 const FADT_SIZE: usize = 116; const DSDT_HEADER_SIZE: usize = 36; +/// MADT structure sizes. +const MADT_HEADER_SIZE: usize = 44; // 36-byte ACPI header + 4-byte Local APIC Address + 4-byte Flags +const MADT_LAPIC_ENTRY_SIZE: usize = 8; // Type 0: Processor Local APIC +const MADT_IOAPIC_ENTRY_SIZE: usize = 12; // Type 1: I/O APIC +const MADT_ISO_ENTRY_SIZE: usize = 10; // Type 2: Interrupt Source Override +const MADT_SIZE: usize = + MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE + MADT_IOAPIC_ENTRY_SIZE + MADT_ISO_ENTRY_SIZE; + // ACPI PM1a I/O port addresses (must match manager.rs constants). const PM1A_EVT_BLK: u32 = 0x600; const PM1A_CNT_BLK: u32 = 0x604; @@ -29,6 +40,12 @@ const PM1A_CNT_BLK: u32 = 0x604; /// virtio-MMIO devices (IRQ 5-9). IRQ 11 is unused. const SCI_INT: u16 = 11; +/// IOAPIC base address (must match memory.rs). +const IOAPIC_BASE: u32 = 0xFEC0_0000; + +/// LAPIC base address (must match memory.rs). +const LAPIC_BASE: u32 = 0xFEE0_0000; + /// AML bytecode for the `\_S5_` sleep package. /// /// Encodes: `Name(\_S5_, Package(4) { 5, 5, 0, 0 })` @@ -43,7 +60,7 @@ const S5_AML: &[u8] = &[ 0x08, 0x5C, 0x5F, 0x53, 0x35, 0x5F, 0x12, 0x0A, 0x04, 0x0A, 0x05, 0x0A, 0x05, 0x00, 0x00, ]; -/// Build ACPI tables (RSDP, RSDT, FADT, DSDT) for the given base address. +/// Build ACPI tables (RSDP, RSDT, FADT, DSDT, MADT) for the given base address. /// /// Returns a `Vec` of exactly `ACPI_REGION_SIZE` bytes. The caller /// writes this to guest memory at `acpi_base`. @@ -53,6 +70,7 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { let rsdt_addr = acpi_base + RSDT_OFFSET as u64; let fadt_addr = acpi_base + FADT_OFFSET as u64; let dsdt_addr = acpi_base + DSDT_OFFSET as u64; + let madt_addr = acpi_base + MADT_OFFSET as u64; // ---- RSDP (20 bytes at offset 0x00) ---- let rsdp = &mut region[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE]; @@ -63,7 +81,7 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { rsdp[16..20].copy_from_slice(&(rsdt_addr as u32).to_le_bytes()); // RsdtAddress acpi_checksum(&mut region[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE], 8); - // ---- RSDT (40 bytes at offset 0x20) ---- + // ---- RSDT (44 bytes at offset 0x20) ---- let rsdt = &mut region[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE]; rsdt[0..4].copy_from_slice(b"RSDT"); // Signature rsdt[4..8].copy_from_slice(&(RSDT_SIZE as u32).to_le_bytes()); // Length @@ -76,6 +94,8 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { rsdt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision // Entry[0]: pointer to FADT rsdt[36..40].copy_from_slice(&(fadt_addr as u32).to_le_bytes()); + // Entry[1]: pointer to MADT + rsdt[40..44].copy_from_slice(&(madt_addr as u32).to_le_bytes()); acpi_checksum(&mut region[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE], 9); // ---- FADT (116 bytes at offset 0x60) ---- @@ -120,9 +140,69 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { dsdt[DSDT_HEADER_SIZE..DSDT_HEADER_SIZE + S5_AML.len()].copy_from_slice(S5_AML); acpi_checksum(&mut region[DSDT_OFFSET..DSDT_OFFSET + dsdt_size], 9); + // ---- MADT (Multiple APIC Description Table) at offset 0x140 ---- + build_madt(&mut region[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]); + region } +/// Build the MADT (Multiple APIC Description Table). +/// +/// Tells the Linux kernel about the Local APIC and I/O APIC. +/// +/// Structure: +/// - Header (44 bytes): standard ACPI header + LAPIC address + flags +/// - Entry 0 - Local APIC (type 0, 8 bytes): Processor 0, APIC ID 0 +/// - Entry 1 - I/O APIC (type 1, 12 bytes): IOAPIC ID 0, base 0xFEC00000 +/// - Entry 2 - Interrupt Source Override (type 2, 10 bytes): IRQ 0 → GSI 2 +fn build_madt(madt: &mut [u8]) { + // ACPI header. + madt[0..4].copy_from_slice(b"APIC"); // Signature + madt[4..8].copy_from_slice(&(MADT_SIZE as u32).to_le_bytes()); // Length + madt[8] = 1; // Revision + // madt[9] = checksum (computed below) + madt[10..16].copy_from_slice(b"BOXLTE"); // OEMID + madt[16..24].copy_from_slice(b"BOXLITEV"); // OEM Table ID + madt[24..28].copy_from_slice(&1u32.to_le_bytes()); // OEM Revision + madt[28..32].copy_from_slice(b"BXLT"); // Creator ID + madt[32..36].copy_from_slice(&1u32.to_le_bytes()); // Creator Revision + + // Local APIC Address (offset 36, 4 bytes). + madt[36..40].copy_from_slice(&LAPIC_BASE.to_le_bytes()); + + // Flags (offset 40, 4 bytes): PCAT_COMPAT = 1 (dual 8259 PICs present). + madt[40..44].copy_from_slice(&1u32.to_le_bytes()); + + // --- Entry 0: Processor Local APIC (type 0, 8 bytes) --- + let mut off = MADT_HEADER_SIZE; + madt[off] = 0; // Entry type: Processor Local APIC + madt[off + 1] = MADT_LAPIC_ENTRY_SIZE as u8; // Length + madt[off + 2] = 0; // ACPI Processor ID + madt[off + 3] = 0; // APIC ID + madt[off + 4..off + 8].copy_from_slice(&1u32.to_le_bytes()); // Flags: enabled + off += MADT_LAPIC_ENTRY_SIZE; + + // --- Entry 1: I/O APIC (type 1, 12 bytes) --- + madt[off] = 1; // Entry type: I/O APIC + madt[off + 1] = MADT_IOAPIC_ENTRY_SIZE as u8; // Length + madt[off + 2] = 0; // I/O APIC ID + madt[off + 3] = 0; // Reserved + madt[off + 4..off + 8].copy_from_slice(&IOAPIC_BASE.to_le_bytes()); // I/O APIC Address + madt[off + 8..off + 12].copy_from_slice(&0u32.to_le_bytes()); // Global System Interrupt Base + off += MADT_IOAPIC_ENTRY_SIZE; + + // --- Entry 2: Interrupt Source Override (type 2, 10 bytes) --- + // Standard x86 convention: PIT timer (IRQ 0) routes to IOAPIC pin 2. + madt[off] = 2; // Entry type: Interrupt Source Override + madt[off + 1] = MADT_ISO_ENTRY_SIZE as u8; // Length + madt[off + 2] = 0; // Bus: ISA + madt[off + 3] = 0; // Source: IRQ 0 (PIT timer) + madt[off + 4..off + 8].copy_from_slice(&2u32.to_le_bytes()); // Global System Interrupt: 2 + madt[off + 8..off + 10].copy_from_slice(&0u16.to_le_bytes()); // Flags: conforming + + acpi_checksum(madt, 9); +} + /// Compute ACPI checksum and store it at `checksum_offset`. /// /// The checksum byte is set so that the sum of all bytes in the table @@ -163,6 +243,20 @@ mod tests { assert_eq!(sum, 0, "RSDT checksum must be zero"); } + #[test] + fn test_rsdt_has_two_entries() { + let region = build_acpi_tables(TEST_BASE); + let rsdt = ®ion[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE]; + + // Entry[0]: FADT pointer. + let fadt_ptr = u32::from_le_bytes(rsdt[36..40].try_into().unwrap()); + assert_eq!(fadt_ptr, (TEST_BASE + FADT_OFFSET as u64) as u32); + + // Entry[1]: MADT pointer. + let madt_ptr = u32::from_le_bytes(rsdt[40..44].try_into().unwrap()); + assert_eq!(madt_ptr, (TEST_BASE + MADT_OFFSET as u64) as u32); + } + #[test] fn test_fadt_signature_and_pm1a_cnt() { let region = build_acpi_tables(TEST_BASE); @@ -228,4 +322,125 @@ mod tests { let dsdt_addr = u32::from_le_bytes(fadt[40..44].try_into().unwrap()); assert_eq!(dsdt_addr, (TEST_BASE + DSDT_OFFSET as u64) as u32); } + + // ---- MADT tests ---- + + #[test] + fn test_madt_signature_and_checksum() { + let region = build_acpi_tables(TEST_BASE); + let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]; + + assert_eq!(&madt[0..4], b"APIC"); + + let length = u32::from_le_bytes(madt[4..8].try_into().unwrap()); + assert_eq!(length, MADT_SIZE as u32); + + let sum: u8 = madt.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "MADT checksum must be zero"); + } + + #[test] + fn test_madt_lapic_address() { + let region = build_acpi_tables(TEST_BASE); + let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]; + + let lapic_addr = u32::from_le_bytes(madt[36..40].try_into().unwrap()); + assert_eq!(lapic_addr, LAPIC_BASE); + } + + #[test] + fn test_madt_pcat_compat_flag() { + let region = build_acpi_tables(TEST_BASE); + let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]; + + let flags = u32::from_le_bytes(madt[40..44].try_into().unwrap()); + assert_eq!(flags, 1, "PCAT_COMPAT flag must be set"); + } + + #[test] + fn test_madt_lapic_entry() { + let region = build_acpi_tables(TEST_BASE); + let off = MADT_OFFSET + MADT_HEADER_SIZE; + + assert_eq!(region[off], 0, "entry type: Local APIC"); + assert_eq!(region[off + 1], 8, "entry length"); + assert_eq!(region[off + 2], 0, "ACPI Processor ID"); + assert_eq!(region[off + 3], 0, "APIC ID"); + let flags = u32::from_le_bytes(region[off + 4..off + 8].try_into().unwrap()); + assert_eq!(flags, 1, "enabled flag"); + } + + #[test] + fn test_madt_ioapic_entry() { + let region = build_acpi_tables(TEST_BASE); + let off = MADT_OFFSET + MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE; + + assert_eq!(region[off], 1, "entry type: I/O APIC"); + assert_eq!(region[off + 1], 12, "entry length"); + assert_eq!(region[off + 2], 0, "I/O APIC ID"); + let ioapic_addr = u32::from_le_bytes(region[off + 4..off + 8].try_into().unwrap()); + assert_eq!(ioapic_addr, IOAPIC_BASE); + let gsi_base = u32::from_le_bytes(region[off + 8..off + 12].try_into().unwrap()); + assert_eq!(gsi_base, 0, "GSI base must be 0"); + } + + #[test] + fn test_madt_interrupt_source_override() { + let region = build_acpi_tables(TEST_BASE); + let off = + MADT_OFFSET + MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE + MADT_IOAPIC_ENTRY_SIZE; + + assert_eq!(region[off], 2, "entry type: Interrupt Source Override"); + assert_eq!(region[off + 1], 10, "entry length"); + assert_eq!(region[off + 2], 0, "bus: ISA"); + assert_eq!(region[off + 3], 0, "source: IRQ 0"); + let gsi = u32::from_le_bytes(region[off + 4..off + 8].try_into().unwrap()); + assert_eq!(gsi, 2, "GSI: IRQ 0 → pin 2"); + let flags = u16::from_le_bytes(region[off + 8..off + 10].try_into().unwrap()); + assert_eq!(flags, 0, "conforming polarity/trigger"); + } + + #[test] + fn test_tables_do_not_overlap() { + // Verify no ACPI tables overlap each other. + let dsdt_size = DSDT_HEADER_SIZE + S5_AML.len(); + let tables = [ + ("RSDP", RSDP_OFFSET, RSDP_OFFSET + RSDP_SIZE), + ("RSDT", RSDT_OFFSET, RSDT_OFFSET + RSDT_SIZE), + ("FADT", FADT_OFFSET, FADT_OFFSET + FADT_SIZE), + ("DSDT", DSDT_OFFSET, DSDT_OFFSET + dsdt_size), + ("MADT", MADT_OFFSET, MADT_OFFSET + MADT_SIZE), + ]; + + for i in 0..tables.len() { + for j in (i + 1)..tables.len() { + let (name_a, start_a, end_a) = tables[i]; + let (name_b, start_b, end_b) = tables[j]; + assert!( + end_a <= start_b || end_b <= start_a, + "{} [{:#X}..{:#X}) overlaps {} [{:#X}..{:#X})", + name_a, + start_a, + end_a, + name_b, + start_b, + end_b + ); + } + } + } + + #[test] + fn test_all_tables_fit_in_region() { + let dsdt_size = DSDT_HEADER_SIZE + S5_AML.len(); + let last_table_end = MADT_OFFSET + MADT_SIZE; + assert!( + last_table_end <= ACPI_REGION_SIZE as usize, + "tables extend beyond region: {} > {}", + last_table_end, + ACPI_REGION_SIZE + ); + // Also verify DSDT doesn't extend into MADT. + assert!(DSDT_OFFSET + dsdt_size <= MADT_OFFSET); + } } diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index fe5c00a94..7cfd26b29 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -17,8 +17,7 @@ pub const FIRST_MMIO_IRQ: u8 = 5; /// - `console=ttyS0`: Route kernel console to serial port (required — no VGA). /// - `quiet loglevel=1`: Suppress kernel printk to console. /// - `i8042.nokbd i8042.noaux`: Skip PS/2 keyboard/mouse probe (10K+ exits). -/// - `noapic nolapic nosmp`: Use legacy PIC mode (no APIC/SMP). Required for -/// our 8259 PIC emulation. +/// - `nosmp`: Single vCPU mode (multi-vCPU deferred to Iter 3). /// - `nohyperv`: Disable Hyper-V guest enlightenments. WHPX exposes Hyper-V /// CPUID leaves but doesn't fully support synthetic timers/SynIC, causing /// clock stalls if the kernel tries to use them. @@ -26,8 +25,11 @@ pub const FIRST_MMIO_IRQ: u8 = 5; /// depends on a reliable timer source. /// - `nokaslr`: Disable kernel address space randomization for deterministic /// boot in our controlled single-vCPU environment. +/// +/// Note: `noapic` and `nolapic` are NOT present — the MADT table in ACPI +/// tells the kernel about the IOAPIC and LAPIC for APIC-mode interrupt routing. const BASE_CMDLINE: &str = - "console=ttyS0 quiet loglevel=1 i8042.nokbd i8042.noaux noapic nolapic nosmp nohyperv lpj=1000000 nokaslr"; + "console=ttyS0 quiet loglevel=1 i8042.nokbd i8042.noaux nosmp nohyperv lpj=1000000 nokaslr"; /// Serial console parameters appended in verbose mode. /// @@ -68,8 +70,9 @@ pub fn build_kernel_cmdline( ) -> String { let mut cmdline = if verbose { // Verbose mode: serial console + full i8042 probe for debugging. + // No noapic/nolapic — APIC mode is enabled via MADT (same as quiet mode). format!( - "{} noapic nolapic nosmp nohyperv lpj=1000000 nokaslr", + "{} nosmp nohyperv lpj=1000000 nokaslr", VERBOSE_CONSOLE ) } else { @@ -240,23 +243,36 @@ mod tests { } #[test] - fn test_cmdline_no_noacpi() { - // Verify neither quiet nor verbose mode includes noacpi. + fn test_cmdline_no_noacpi_no_noapic() { + // Verify neither quiet nor verbose mode includes noacpi or noapic. + // APIC mode is enabled via MADT; noapic/nolapic would disable it. let quiet = build_simple(None, false, &[]); assert!( !quiet.contains("noacpi"), "quiet cmdline must not contain noacpi" ); + assert!( + !quiet.contains("noapic"), + "quiet cmdline must not contain noapic (APIC enabled via MADT)" + ); + assert!( + !quiet.contains("nolapic"), + "quiet cmdline must not contain nolapic" + ); let verbose = build_kernel_cmdline(None, false, &[], None, None, None, &[], true); assert!( !verbose.contains("noacpi"), "verbose cmdline must not contain noacpi" ); - - // Ensure noapic (APIC disable) is still present — it's different from noacpi. - assert!(quiet.contains("noapic")); - assert!(verbose.contains("noapic")); + assert!( + !verbose.contains("noapic"), + "verbose cmdline must not contain noapic (APIC enabled via MADT)" + ); + assert!( + !verbose.contains("nolapic"), + "verbose cmdline must not contain nolapic" + ); } #[test] diff --git a/src/vmm/src/windows/devices/ioapic.rs b/src/vmm/src/windows/devices/ioapic.rs new file mode 100644 index 000000000..31c6910f5 --- /dev/null +++ b/src/vmm/src/windows/devices/ioapic.rs @@ -0,0 +1,494 @@ +//! I/O APIC (IOAPIC) emulation. +//! +//! Emulates a 24-pin IOAPIC with redirection table entries for routing +//! interrupts from devices to the Local APIC. +//! +//! MMIO interface at 0xFEC0_0000 (4KB region): +//! - Offset 0x00: IOREGSEL (write register index) +//! - Offset 0x10: IOWIN (read/write selected register) +//! +//! Registers: +//! - 0x00: IOAPIC ID +//! - 0x01: IOAPIC Version (24 entries, version 0x11) +//! - 0x10-0x3F: Redirection table entries (low/high 32 bits) + +/// Number of redirection table entries (pins). +const NUM_PINS: usize = 24; + +/// IOAPIC version register value. +/// Bits [7:0] = version (0x11 = 82093AA), bits [23:16] = max redirection entry (23). +const IOAPIC_VERSION: u32 = 0x0017_0011; + +/// A single redirection table entry. +/// +/// Each entry controls how an interrupt on the corresponding pin is delivered. +#[derive(Debug, Clone, Copy)] +struct RedirectionEntry { + /// IDT vector (0-255). + vector: u8, + /// Delivery mode: 0=Fixed, 2=SMI, 4=NMI, 5=INIT, 7=ExtINT. + delivery_mode: u8, + /// Destination mode: false=physical, true=logical. + dest_mode: bool, + /// Pin polarity: false=active-high, true=active-low. + polarity: bool, + /// Trigger mode: false=edge, true=level. + trigger_mode: bool, + /// true = masked (interrupt suppressed). + mask: bool, + /// Level-triggered: set on delivery, cleared on EOI. + remote_irr: bool, + /// LAPIC destination ID. + dest: u8, +} + +impl Default for RedirectionEntry { + fn default() -> Self { + Self { + vector: 0, + delivery_mode: 0, + dest_mode: false, + polarity: false, + trigger_mode: false, + mask: true, // Masked by default + remote_irr: false, + dest: 0, + } + } +} + +impl RedirectionEntry { + /// Read the low 32 bits of the redirection entry. + fn read_low(&self) -> u32 { + let mut val = self.vector as u32; + val |= (self.delivery_mode as u32 & 0x7) << 8; + if self.dest_mode { + val |= 1 << 11; + } + if self.polarity { + val |= 1 << 13; + } + if self.remote_irr { + val |= 1 << 14; + } + if self.trigger_mode { + val |= 1 << 15; + } + if self.mask { + val |= 1 << 16; + } + val + } + + /// Read the high 32 bits (destination field in bits [31:24]). + fn read_high(&self) -> u32 { + (self.dest as u32) << 24 + } + + /// Write the low 32 bits. + fn write_low(&mut self, val: u32) { + self.vector = (val & 0xFF) as u8; + self.delivery_mode = ((val >> 8) & 0x7) as u8; + self.dest_mode = val & (1 << 11) != 0; + self.polarity = val & (1 << 13) != 0; + // remote_irr is read-only (bit 14). + self.trigger_mode = val & (1 << 15) != 0; + self.mask = val & (1 << 16) != 0; + } + + /// Write the high 32 bits. + fn write_high(&mut self, val: u32) { + self.dest = ((val >> 24) & 0xFF) as u8; + } +} + +/// 24-pin I/O APIC. +pub struct IoApic { + /// IOAPIC ID (bits [27:24] of register 0x00). + id: u8, + /// IOREGSEL: indirect register select. + reg_sel: u8, + /// 24 redirection table entries. + entries: [RedirectionEntry; NUM_PINS], + /// Pin assertion state (for level-triggered re-injection). + pin_state: u32, +} + +impl Default for IoApic { + fn default() -> Self { + Self::new() + } +} + +impl IoApic { + /// Create a new IOAPIC with default state (all pins masked). + pub fn new() -> Self { + Self { + id: 0, + reg_sel: 0, + entries: [RedirectionEntry::default(); NUM_PINS], + pin_state: 0, + } + } + + /// Process an IRQ signal. Returns the vector to deliver if the interrupt + /// is deliverable, or None if masked/blocked. + /// + /// - Edge-triggered: deliver if not masked, set pin state. + /// - Level-triggered: deliver if not masked AND remote_irr not set. + pub fn service_irq(&mut self, irq: u8, level: bool) -> Option { + if irq as usize >= NUM_PINS { + return None; + } + + if level { + self.pin_state |= 1 << irq; + } else { + self.pin_state &= !(1 << irq); + return None; // Deassertion doesn't deliver. + } + + let entry = &mut self.entries[irq as usize]; + + if entry.mask { + return None; + } + + if entry.trigger_mode { + // Level-triggered: only deliver if remote_irr is not set. + if entry.remote_irr { + return None; + } + entry.remote_irr = true; + } + // Edge-triggered: always deliver (if not masked). + + Some(entry.vector) + } + + /// Handle End-of-Interrupt for a given vector. + /// + /// Clears remote_irr for matching level-triggered entries. + /// Returns the pin number if still asserted (needs re-injection), or None. + pub fn end_of_interrupt(&mut self, vector: u8) -> Option { + for (i, entry) in self.entries.iter_mut().enumerate() { + if entry.vector == vector && entry.trigger_mode && entry.remote_irr { + entry.remote_irr = false; + // Check if pin is still asserted. + if self.pin_state & (1 << i) != 0 { + return Some(i as u8); + } + } + } + None + } + + /// Read from the IOAPIC MMIO region. + /// + /// Only offsets 0x00 (IOREGSEL) and 0x10 (IOWIN) are valid. + pub fn read_mmio(&self, offset: u64) -> u32 { + match offset { + 0x00 => self.reg_sel as u32, + 0x10 => self.read_register(self.reg_sel), + _ => 0, + } + } + + /// Write to the IOAPIC MMIO region. + pub fn write_mmio(&mut self, offset: u64, value: u32) { + match offset { + 0x00 => self.reg_sel = value as u8, + 0x10 => self.write_register(self.reg_sel, value), + _ => {} + } + } + + /// Read an indirect register by index. + fn read_register(&self, reg: u8) -> u32 { + match reg { + 0x00 => (self.id as u32) << 24, // IOAPIC ID + 0x01 => IOAPIC_VERSION, // Version + 0x02 => 0, // Arbitration ID (not used) + 0x10..=0x3F => { + let pin = ((reg - 0x10) / 2) as usize; + if pin < NUM_PINS { + if reg & 1 == 0 { + self.entries[pin].read_low() + } else { + self.entries[pin].read_high() + } + } else { + 0 + } + } + _ => 0, + } + } + + /// Check if any redirection table entry is unmasked (active). + pub fn has_unmasked_entries(&self) -> bool { + self.entries.iter().any(|e| !e.mask) + } + + /// Write an indirect register by index. + fn write_register(&mut self, reg: u8, value: u32) { + match reg { + 0x00 => self.id = ((value >> 24) & 0x0F) as u8, + 0x10..=0x3F => { + let pin = ((reg - 0x10) / 2) as usize; + if pin < NUM_PINS { + if reg & 1 == 0 { + self.entries[pin].write_low(value); + } else { + self.entries[pin].write_high(value); + } + } + } + _ => {} // Read-only or reserved registers. + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_ioapic_initial_state() { + let ioapic = IoApic::new(); + assert_eq!(ioapic.id, 0); + assert_eq!(ioapic.reg_sel, 0); + // All entries should be masked. + for entry in &ioapic.entries { + assert!(entry.mask); + assert_eq!(entry.vector, 0); + } + assert!(!ioapic.has_unmasked_entries()); + } + + #[test] + fn test_ioapic_has_unmasked_entries() { + let mut ioapic = IoApic::new(); + assert!(!ioapic.has_unmasked_entries()); + + // Unmask pin 2 with vector 0x22. + ioapic.write_mmio(0x00, 0x14); // Select register 0x14 (pin 2 low) + ioapic.write_mmio(0x10, 0x22); // vector=0x22, mask bit 16 = 0 (unmasked) + assert!(ioapic.has_unmasked_entries()); + } + + #[test] + fn test_ioapic_version_register() { + let ioapic = IoApic::new(); + // Select version register. + let version = ioapic.read_register(0x01); + assert_eq!(version & 0xFF, 0x11, "version should be 0x11"); + assert_eq!((version >> 16) & 0xFF, 23, "max redir entry should be 23"); + } + + #[test] + fn test_ioapic_id_read_write() { + let mut ioapic = IoApic::new(); + ioapic.write_register(0x00, 0x0A00_0000); // Set ID = 0x0A + assert_eq!(ioapic.read_register(0x00), 0x0A00_0000); + assert_eq!(ioapic.id, 0x0A); + } + + #[test] + fn test_ioapic_redir_entry_read_write() { + let mut ioapic = IoApic::new(); + + // Write low 32 bits of entry 0 (register 0x10): + // vector=0x30, delivery_mode=0 (Fixed), level-triggered, unmasked + let low: u32 = 0x30 | (1 << 15); // vector=0x30, trigger=level, mask=0 + ioapic.write_register(0x10, low); + + // Write high 32 bits of entry 0 (register 0x11): + // destination = LAPIC 0 + ioapic.write_register(0x11, 0x00 << 24); + + let read_low = ioapic.read_register(0x10); + assert_eq!(read_low & 0xFF, 0x30, "vector"); + assert!(read_low & (1 << 15) != 0, "trigger mode should be level"); + assert!(read_low & (1 << 16) == 0, "should be unmasked"); + + let read_high = ioapic.read_register(0x11); + assert_eq!((read_high >> 24) & 0xFF, 0, "dest should be 0"); + } + + #[test] + fn test_ioapic_masked_irq_not_delivered() { + let mut ioapic = IoApic::new(); + // Entry 0 is masked by default. + assert_eq!(ioapic.service_irq(0, true), None); + } + + #[test] + fn test_ioapic_edge_triggered_delivery() { + let mut ioapic = IoApic::new(); + + // Configure pin 5: edge-triggered, vector 0x25, unmasked. + ioapic.entries[5].vector = 0x25; + ioapic.entries[5].mask = false; + ioapic.entries[5].trigger_mode = false; // Edge + + let vector = ioapic.service_irq(5, true); + assert_eq!(vector, Some(0x25)); + } + + #[test] + fn test_ioapic_level_triggered_delivery() { + let mut ioapic = IoApic::new(); + + // Configure pin 3: level-triggered, vector 0x33, unmasked. + ioapic.entries[3].vector = 0x33; + ioapic.entries[3].mask = false; + ioapic.entries[3].trigger_mode = true; // Level + + let vector = ioapic.service_irq(3, true); + assert_eq!(vector, Some(0x33)); + assert!(ioapic.entries[3].remote_irr, "remote_irr should be set"); + } + + #[test] + fn test_ioapic_level_triggered_blocked_by_remote_irr() { + let mut ioapic = IoApic::new(); + + // Configure pin 3: level-triggered, vector 0x33, unmasked. + ioapic.entries[3].vector = 0x33; + ioapic.entries[3].mask = false; + ioapic.entries[3].trigger_mode = true; + + // First delivery sets remote_irr. + assert_eq!(ioapic.service_irq(3, true), Some(0x33)); + + // Second delivery blocked by remote_irr. + assert_eq!(ioapic.service_irq(3, true), None); + } + + #[test] + fn test_ioapic_eoi_clears_remote_irr() { + let mut ioapic = IoApic::new(); + + ioapic.entries[3].vector = 0x33; + ioapic.entries[3].mask = false; + ioapic.entries[3].trigger_mode = true; + + ioapic.service_irq(3, true); + assert!(ioapic.entries[3].remote_irr); + + // EOI should clear remote_irr and return the pin for re-injection. + let reinject_pin = ioapic.end_of_interrupt(0x33); + assert!(!ioapic.entries[3].remote_irr); + // Pin is still asserted, so re-injection needed on pin 3. + assert_eq!(reinject_pin, Some(3)); + } + + #[test] + fn test_ioapic_eoi_no_reinjection_when_deasserted() { + let mut ioapic = IoApic::new(); + + ioapic.entries[3].vector = 0x33; + ioapic.entries[3].mask = false; + ioapic.entries[3].trigger_mode = true; + + ioapic.service_irq(3, true); + // Deassert the pin. + ioapic.service_irq(3, false); + + let reinject_pin = ioapic.end_of_interrupt(0x33); + assert_eq!(reinject_pin, None, "no reinjection when pin is deasserted"); + } + + #[test] + fn test_ioapic_deassertion_does_not_deliver() { + let mut ioapic = IoApic::new(); + + ioapic.entries[5].vector = 0x25; + ioapic.entries[5].mask = false; + + // Deassertion (level=false) should not deliver. + assert_eq!(ioapic.service_irq(5, false), None); + } + + #[test] + fn test_ioapic_out_of_range_irq() { + let mut ioapic = IoApic::new(); + assert_eq!(ioapic.service_irq(24, true), None); + assert_eq!(ioapic.service_irq(255, true), None); + } + + #[test] + fn test_ioapic_mmio_regsel() { + let mut ioapic = IoApic::new(); + + // Write IOREGSEL. + ioapic.write_mmio(0x00, 0x01); + assert_eq!(ioapic.reg_sel, 0x01); + + // Read IOREGSEL. + assert_eq!(ioapic.read_mmio(0x00), 0x01); + } + + #[test] + fn test_ioapic_mmio_iowin_version() { + let mut ioapic = IoApic::new(); + + // Select version register. + ioapic.write_mmio(0x00, 0x01); + let version = ioapic.read_mmio(0x10); + assert_eq!(version & 0xFF, 0x11); + } + + #[test] + fn test_ioapic_mmio_invalid_offset() { + let mut ioapic = IoApic::new(); + // Invalid offsets should return 0 / be no-ops. + assert_eq!(ioapic.read_mmio(0x04), 0); + ioapic.write_mmio(0x04, 0xDEAD); + } + + #[test] + fn test_ioapic_redir_entry_remote_irr_readonly() { + let mut ioapic = IoApic::new(); + + // Set remote_irr manually. + ioapic.entries[0].remote_irr = true; + + // Write low word without remote_irr bit — it should NOT clear remote_irr. + let low = 0x30u32; // vector=0x30, no remote_irr bit set + ioapic.write_register(0x10, low); + + // remote_irr is read-only in the write path. + assert!(ioapic.entries[0].remote_irr); + } + + #[test] + fn test_ioapic_multiple_pins_independent() { + let mut ioapic = IoApic::new(); + + // Configure two different pins. + ioapic.entries[1].vector = 0x21; + ioapic.entries[1].mask = false; + ioapic.entries[2].vector = 0x22; + ioapic.entries[2].mask = false; + + assert_eq!(ioapic.service_irq(1, true), Some(0x21)); + assert_eq!(ioapic.service_irq(2, true), Some(0x22)); + } + + #[test] + fn test_ioapic_out_of_range_register() { + let ioapic = IoApic::new(); + // Registers beyond 0x3F should return 0. + assert_eq!(ioapic.read_register(0x40), 0); + assert_eq!(ioapic.read_register(0xFF), 0); + } + + #[test] + fn test_ioapic_pin_beyond_24_in_redir() { + let ioapic = IoApic::new(); + // Register 0x10 + 24*2 = 0x40, which is pin 24 (out of range). + assert_eq!(ioapic.read_register(0x40), 0); + } +} diff --git a/src/vmm/src/windows/devices/irq_chip.rs b/src/vmm/src/windows/devices/irq_chip.rs new file mode 100644 index 000000000..79c8fff5e --- /dev/null +++ b/src/vmm/src/windows/devices/irq_chip.rs @@ -0,0 +1,396 @@ +//! IrqChip — coordinator wiring PIC + IOAPIC + LAPIC together. +//! +//! Manages the interrupt routing between legacy PIC (for early boot before +//! APIC is enabled) and the IOAPIC + LAPIC path (after guest enables APIC). +//! +//! The APIC mode is auto-detected: when the guest writes to the LAPIC SVR +//! register with the enable bit set, the IrqChip switches to APIC mode. + +use std::time::Instant; + +use super::ioapic::IoApic; +use super::lapic::LocalApic; +use super::pic::Pic; +use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; + +/// Coordinated interrupt controller combining PIC, IOAPIC, and LAPIC. +pub struct IrqChip { + /// Legacy PIC (for early boot before APIC is enabled). + pub pic: Pic, + /// I/O APIC for routing device interrupts to the LAPIC. + ioapic: IoApic, + /// Local APIC for priority management and timer. + lapic: LocalApic, + /// false = PIC mode (early boot), true = APIC mode. + apic_mode: bool, +} + +impl Default for IrqChip { + fn default() -> Self { + Self::new() + } +} + +impl IrqChip { + /// Create a new IrqChip in PIC mode (legacy boot). + pub fn new() -> Self { + Self { + pic: Pic::new(), + ioapic: IoApic::new(), + lapic: LocalApic::new(), + apic_mode: false, + } + } + + /// Whether the chip is in APIC mode (vs legacy PIC mode). + pub fn apic_mode(&self) -> bool { + self.apic_mode + } + + /// Raise an interrupt on the given ISA IRQ line. + /// + /// Routes to IOAPIC (if APIC mode) or PIC (legacy mode). + /// In APIC mode, applies the standard x86 IRQ-to-GSI remapping: + /// ISA IRQ 0 (PIT timer) → IOAPIC pin 2 (GSI 2), matching the + /// Interrupt Source Override entry in the MADT. + pub fn raise_irq(&mut self, irq: u8) { + if self.apic_mode { + // Remap ISA IRQ to IOAPIC pin (GSI). + // Standard x86: PIT timer (IRQ 0) routes to IOAPIC pin 2. + let gsi = if irq == 0 { 2 } else { irq }; + if let Some(vector) = self.ioapic.service_irq(gsi, true) { + self.lapic.accept_interrupt(vector); + } + } else { + self.pic.raise_irq(irq); + } + } + + /// Get the highest-priority injectable vector, if any. + /// + /// Checks LAPIC (APIC mode) or PIC (legacy mode). + pub fn get_injectable_vector(&self) -> Option { + if self.apic_mode { + self.lapic.get_highest_injectable() + } else { + if self.pic.has_pending() { + // PIC has pending, but we need to peek — can't acknowledge yet. + // Return a sentinel to indicate "has pending". + Some(0) // Caller should use acknowledge_interrupt() to get actual vector. + } else { + None + } + } + } + + /// Check if there are any pending interrupts (without acknowledging). + pub fn has_pending(&self) -> bool { + if self.apic_mode { + self.lapic.get_highest_injectable().is_some() + } else { + self.pic.has_pending() + } + } + + /// Acknowledge the highest-priority interrupt. + /// + /// In PIC mode: acknowledges from PIC and returns the vector. + /// In APIC mode: returns the highest injectable from LAPIC. + pub fn acknowledge(&mut self) -> Option { + if self.apic_mode { + self.lapic.get_highest_injectable() + } else { + self.pic.acknowledge() + } + } + + /// Called after the vector has been injected into the vCPU. + /// + /// In APIC mode: moves the vector from IRR to ISR in the LAPIC. + /// In PIC mode: no-op (PIC acknowledge already moved to ISR). + pub fn notify_injected(&mut self, vector: u8) { + if self.apic_mode { + self.lapic.start_of_interrupt(vector); + } + } + + /// Handle an EOI from the guest. + /// + /// In PIC mode: handled via I/O port writes (OCW2 commands). + /// In APIC mode: propagates EOI from LAPIC to IOAPIC for level-triggered + /// interrupt completion. + fn handle_lapic_eoi(&mut self, vector: u8) { + if let Some(pin) = self.ioapic.end_of_interrupt(vector) { + // Pin still asserted — re-deliver using the correct IOAPIC pin. + if let Some(new_vector) = self.ioapic.service_irq(pin, true) { + self.lapic.accept_interrupt(new_vector); + } + } + } + + /// Tick the LAPIC timer. Returns the timer vector if it fired. + pub fn tick_timer(&mut self, now: Instant) -> Option { + if !self.apic_mode { + return None; + } + if let Some(vector) = self.lapic.tick_timer(now) { + self.lapic.accept_interrupt(vector); + Some(vector) + } else { + None + } + } + + /// Handle an MMIO read to an IOAPIC or LAPIC address. + /// + /// Returns Some(value) if the address was handled, None otherwise. + pub fn handle_mmio_read(&self, addr: u64, _size: u8) -> Option { + if addr >= IOAPIC_MMIO_BASE && addr < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { + let offset = addr - IOAPIC_MMIO_BASE; + Some(self.ioapic.read_mmio(offset)) + } else if addr >= LAPIC_MMIO_BASE && addr < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { + let offset = addr - LAPIC_MMIO_BASE; + Some(self.lapic.read_mmio(offset)) + } else { + None + } + } + + /// Handle an MMIO write to an IOAPIC or LAPIC address. + /// + /// Returns true if the address was handled. + pub fn handle_mmio_write(&mut self, addr: u64, _size: u8, data: u32) -> bool { + if addr >= IOAPIC_MMIO_BASE && addr < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { + let offset = addr - IOAPIC_MMIO_BASE; + self.ioapic.write_mmio(offset, data); + // An IOAPIC entry may have been unmasked — check transition. + self.check_apic_transition(); + true + } else if addr >= LAPIC_MMIO_BASE && addr < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { + let offset = addr - LAPIC_MMIO_BASE; + let eoi_vector = self.lapic.write_mmio(offset, data); + + // LAPIC SVR may have been enabled — check transition. + self.check_apic_transition(); + + // Handle EOI propagation to IOAPIC. + if let Some(vector) = eoi_vector { + self.handle_lapic_eoi(vector); + } + + true + } else { + false + } + } + + /// Check if conditions are met to switch from PIC to APIC mode. + /// + /// The transition requires BOTH: + /// 1. LAPIC is software-enabled (SVR bit 8 set by guest) + /// 2. IOAPIC has at least one unmasked redirection entry + /// + /// This prevents a gap where the kernel has enabled the LAPIC but hasn't + /// yet programmed the IOAPIC entries, which would silently drop interrupts + /// (all IOAPIC entries start masked). + fn check_apic_transition(&mut self) { + if self.apic_mode { + return; + } + if self.lapic.is_enabled() && self.ioapic.has_unmasked_entries() { + log::info!( + "APIC mode enabled — LAPIC active + IOAPIC has unmasked entries" + ); + self.apic_mode = true; + } + } + + /// Get PIC master state for diagnostics. + pub fn pic_master_state(&self) -> (u8, u8, u8, u8) { + self.pic.master_state() + } + + /// Get PIC slave state for diagnostics. + pub fn pic_slave_state(&self) -> (u8, u8, u8, u8) { + self.pic.slave_state() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_irq_chip_starts_in_pic_mode() { + let chip = IrqChip::new(); + assert!(!chip.apic_mode()); + } + + #[test] + fn test_irq_chip_pic_mode_raise_irq() { + let mut chip = IrqChip::new(); + + // Program PIC for testing. + chip.pic.write_port(0x20, 0x11); // ICW1 + chip.pic.write_port(0x21, 0x20); // ICW2: vector base 0x20 + chip.pic.write_port(0x21, 0x04); // ICW3 + chip.pic.write_port(0x21, 0x01); // ICW4 + chip.pic.write_port(0x21, 0x00); // IMR: unmask all + + chip.raise_irq(0); + assert!(chip.has_pending()); + + let vector = chip.acknowledge(); + assert_eq!(vector, Some(0x20)); + } + + #[test] + fn test_irq_chip_apic_mode_switch_requires_ioapic_entries() { + let mut chip = IrqChip::new(); + assert!(!chip.apic_mode()); + + // Write to LAPIC SVR with enable bit — NOT enough alone. + let svr_addr = LAPIC_MMIO_BASE + 0x0F0; + chip.handle_mmio_write(svr_addr, 4, 0x1FF); + assert!( + !chip.apic_mode(), + "APIC mode must NOT activate on SVR alone" + ); + + // Unmask an IOAPIC entry (pin 2, vector 0x22) — NOW transition triggers. + chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x14); // Select reg 0x14 (pin 2 low) + chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x22); // vector=0x22, unmasked + + assert!( + chip.apic_mode(), + "APIC mode should activate when LAPIC enabled + IOAPIC unmasked" + ); + } + + #[test] + fn test_irq_chip_apic_mode_raise_irq() { + let mut chip = IrqChip::new(); + + // Enable LAPIC SVR. + chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); + + // Configure IOAPIC pin 5: vector 0x25, unmasked, edge-triggered. + // This triggers the APIC mode transition (LAPIC enabled + unmasked entry). + chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x1A); // Select register 0x1A (pin 5 low) + chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x25); // vector=0x25, unmasked + assert!(chip.apic_mode()); + + chip.raise_irq(5); + assert!(chip.has_pending()); + + let vector = chip.acknowledge(); + assert_eq!(vector, Some(0x25)); + } + + #[test] + fn test_irq_chip_apic_mode_irq0_remaps_to_gsi2() { + let mut chip = IrqChip::new(); + + // Enable LAPIC SVR. + chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); + + // Configure IOAPIC pin 2: vector 0x22, unmasked, edge-triggered. + // This is the standard x86 PIT timer mapping (IRQ 0 → GSI 2 per MADT ISO). + chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x14); // Select register 0x14 (pin 2 low) + chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x22); // vector=0x22, unmasked + assert!(chip.apic_mode()); + + // raise_irq(0) should remap to IOAPIC pin 2 and deliver vector 0x22. + chip.raise_irq(0); + assert!(chip.has_pending()); + + let vector = chip.acknowledge(); + assert_eq!(vector, Some(0x22)); + } + + #[test] + fn test_irq_chip_mmio_read_ioapic() { + let mut chip = IrqChip::new(); + // Read IOAPIC version register. + // First set IOREGSEL via write, then read IOWIN. + // Write to IOREGSEL at offset 0x00 sets the register index (0x01 = version). + // This doesn't unmask any entries, so APIC mode stays off. + chip.ioapic.write_mmio(0x00, 0x01); // Direct access to avoid transition check + let version = chip.handle_mmio_read(IOAPIC_MMIO_BASE + 0x10, 4); + assert_eq!(version, Some(0x0017_0011)); + } + + #[test] + fn test_irq_chip_mmio_read_lapic() { + let chip = IrqChip::new(); + let version = chip.handle_mmio_read(LAPIC_MMIO_BASE + 0x030, 4); + assert!(version.is_some()); + assert_eq!(version.unwrap() & 0xFF, 0x14); + } + + #[test] + fn test_irq_chip_mmio_read_unhandled() { + let chip = IrqChip::new(); + // Address outside IOAPIC/LAPIC range. + assert_eq!(chip.handle_mmio_read(0xDEAD_0000, 4), None); + } + + #[test] + fn test_irq_chip_mmio_write_unhandled() { + let mut chip = IrqChip::new(); + assert!(!chip.handle_mmio_write(0xDEAD_0000, 4, 0)); + } + + #[test] + fn test_irq_chip_eoi_propagation() { + let mut chip = IrqChip::new(); + + // Enable LAPIC SVR. + chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); + + // Configure IOAPIC pin 3: vector 0x33, level-triggered, unmasked. + // This also triggers APIC mode transition. + chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x16); // register 0x16 = pin 3 low + chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x33 | (1 << 15)); // vector=0x33, level-triggered + assert!(chip.apic_mode()); + + // Raise IRQ 3. + chip.raise_irq(3); + let vector = chip.acknowledge(); + assert_eq!(vector, Some(0x33)); + + // Inject and acknowledge in LAPIC. + chip.notify_injected(0x33); + + // Write EOI to LAPIC (offset 0x0B0). + chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0B0, 4, 0); + + // After EOI, the pin is still asserted → re-injection. + assert!(chip.has_pending()); + } + + #[test] + fn test_irq_chip_timer_only_in_apic_mode() { + let mut chip = IrqChip::new(); + let now = Instant::now(); + // In PIC mode, timer should not fire. + assert_eq!(chip.tick_timer(now), None); + } + + #[test] + fn test_irq_chip_notify_injected_pic_mode() { + let mut chip = IrqChip::new(); + // In PIC mode, notify_injected is a no-op. + chip.notify_injected(0x20); + } + + #[test] + fn test_irq_chip_diagnostics() { + let chip = IrqChip::new(); + let (irr, isr, imr, vbase) = chip.pic_master_state(); + assert_eq!(irr, 0); + assert_eq!(isr, 0); + assert_eq!(imr, 0xFF); + assert_eq!(vbase, 0); + } +} diff --git a/src/vmm/src/windows/devices/lapic.rs b/src/vmm/src/windows/devices/lapic.rs new file mode 100644 index 000000000..b7948337c --- /dev/null +++ b/src/vmm/src/windows/devices/lapic.rs @@ -0,0 +1,624 @@ +//! Local APIC (LAPIC) emulation. +//! +//! Minimal single-vCPU LAPIC for interrupt priority management. +//! Tracks IRR (Interrupt Request Register) and ISR (In-Service Register) +//! as 256-bit vectors, and implements priority-based interrupt delivery. +//! +//! MMIO interface at 0xFEE0_0000 (4KB region): +//! - 0x020: LAPIC ID +//! - 0x030: LAPIC Version +//! - 0x080: TPR (Task Priority Register) +//! - 0x0B0: EOI (write-only) +//! - 0x0F0: SVR (Spurious Vector Register) +//! - 0x100-0x170: ISR (read-only, 256 bits) +//! - 0x200-0x270: IRR (read-only, 256 bits) +//! - 0x320: LVT Timer +//! - 0x380: Timer Initial Count +//! - 0x390: Timer Current Count +//! - 0x3E0: Timer Divide Configuration +//! +//! Simplifications for single-vCPU: +//! - No IPI support (no ICR register) +//! - No arbitration +//! - Destination always matches APIC ID 0 + +use std::time::Instant; + +/// LAPIC version: integrated APIC with 6 LVT entries. +const LAPIC_VERSION: u32 = 0x0005_0014; // version 0x14, max LVT=5 + +/// SVR bit 8: APIC software enable. +const SVR_APIC_ENABLE: u32 = 1 << 8; + +/// LVT mask bit (bit 16). +const LVT_MASKED: u32 = 1 << 16; + +/// Timer modes. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum TimerMode { + OneShot, + Periodic, +} + +/// Minimal single-vCPU Local APIC. +pub struct LocalApic { + /// APIC ID (always 0 for single vCPU). + id: u8, + /// 256-bit Interrupt Request Register (8 x 32-bit words). + irr: [u32; 8], + /// 256-bit In-Service Register. + isr: [u32; 8], + /// Task Priority Register (only low 8 bits used). + tpr: u8, + /// Spurious Vector Register (bit 8 = APIC enabled). + svr: u32, + + // Timer state. + /// Timer mode. + timer_mode: TimerMode, + /// LVT Timer vector. + timer_vector: u8, + /// LVT Timer mask. + timer_masked: bool, + /// Divide configuration register value. + timer_divide_reg: u32, + /// Computed divisor (1, 2, 4, 8, 16, 32, 64, 128). + timer_divisor: u32, + /// Initial count register. + timer_initial: u32, + /// When the timer fires next (host time). + timer_deadline: Option, + /// Timer period for periodic mode. + timer_period_ns: u64, +} + +impl Default for LocalApic { + fn default() -> Self { + Self::new() + } +} + +impl LocalApic { + /// Create a new LAPIC with default state (disabled). + pub fn new() -> Self { + Self { + id: 0, + irr: [0; 8], + isr: [0; 8], + tpr: 0, + svr: 0, // APIC disabled by default + + timer_mode: TimerMode::OneShot, + timer_vector: 0, + timer_masked: true, + timer_divide_reg: 0, + timer_divisor: 2, // Default divisor + timer_initial: 0, + timer_deadline: None, + timer_period_ns: 0, + } + } + + /// Whether the LAPIC is software-enabled (SVR bit 8). + pub fn is_enabled(&self) -> bool { + self.svr & SVR_APIC_ENABLE != 0 + } + + /// Accept an interrupt vector into the IRR. + pub fn accept_interrupt(&mut self, vector: u8) { + let word = (vector / 32) as usize; + let bit = vector % 32; + self.irr[word] |= 1 << bit; + } + + /// Get the highest-priority vector in IRR that beats the current + /// Processor Priority (PPR = max(TPR, highest ISR vector class)). + /// + /// Returns None if no injectable vector exists. + pub fn get_highest_injectable(&self) -> Option { + let highest_irr = Self::highest_bit(&self.irr)?; + let ppr = self.processor_priority(); + + // Vector must have higher priority (higher number = higher priority + // in x86, but within the same class, lower bit wins; the priority + // class is vector >> 4). + if (highest_irr >> 4) > (ppr >> 4) { + Some(highest_irr) + } else { + None + } + } + + /// Called when the vector is actually injected into the vCPU. + /// Moves the vector from IRR to ISR. + pub fn start_of_interrupt(&mut self, vector: u8) { + let word = (vector / 32) as usize; + let bit = vector % 32; + self.irr[word] &= !(1 << bit); + self.isr[word] |= 1 << bit; + } + + /// Handle End-of-Interrupt. + /// + /// Clears the highest-priority ISR bit. + /// Returns the vector that was cleared (for IOAPIC EOI broadcast). + pub fn end_of_interrupt(&mut self) -> Option { + let highest = Self::highest_bit(&self.isr)?; + let word = (highest / 32) as usize; + let bit = highest % 32; + self.isr[word] &= !(1 << bit); + Some(highest) + } + + /// Tick the LAPIC timer. Returns the timer vector if it fired. + pub fn tick_timer(&mut self, now: Instant) -> Option { + if self.timer_masked || self.timer_initial == 0 { + return None; + } + + let deadline = self.timer_deadline?; + + if now >= deadline { + let vector = self.timer_vector; + + match self.timer_mode { + TimerMode::OneShot => { + self.timer_deadline = None; + } + TimerMode::Periodic => { + // Rearm timer for next period. + let period = std::time::Duration::from_nanos(self.timer_period_ns); + self.timer_deadline = Some(deadline + period); + } + } + + Some(vector) + } else { + None + } + } + + /// Read from the LAPIC MMIO region. + pub fn read_mmio(&self, offset: u64) -> u32 { + match offset { + 0x020 => (self.id as u32) << 24, // LAPIC ID + 0x030 => LAPIC_VERSION, // Version + 0x080 => self.tpr as u32, // TPR + 0x0B0 => 0, // EOI (write-only) + 0x0F0 => self.svr, // SVR + // ISR: 0x100, 0x110, 0x120, ..., 0x170 + 0x100..=0x170 if offset & 0x0F == 0 => { + let idx = ((offset - 0x100) / 0x10) as usize; + if idx < 8 { + self.isr[idx] + } else { + 0 + } + } + // IRR: 0x200, 0x210, 0x220, ..., 0x270 + 0x200..=0x270 if offset & 0x0F == 0 => { + let idx = ((offset - 0x200) / 0x10) as usize; + if idx < 8 { + self.irr[idx] + } else { + 0 + } + } + 0x320 => self.read_lvt_timer(), // LVT Timer + 0x380 => self.timer_initial, // Timer Initial Count + 0x390 => 0, // Timer Current Count (approximation) + 0x3E0 => self.timer_divide_reg, // Timer Divide Configuration + _ => 0, + } + } + + /// Write to the LAPIC MMIO region. + /// + /// Returns `Some(vector)` if an EOI was written (caller should broadcast + /// to IOAPIC). + pub fn write_mmio(&mut self, offset: u64, value: u32) -> Option { + match offset { + 0x080 => self.tpr = (value & 0xFF) as u8, + 0x0B0 => { + // EOI: clear highest ISR, return vector for IOAPIC. + return self.end_of_interrupt(); + } + 0x0F0 => { + self.svr = value; + log::debug!( + "LAPIC SVR write: {:#X} (enabled={})", + value, + value & SVR_APIC_ENABLE != 0 + ); + } + 0x320 => self.write_lvt_timer(value), + 0x380 => self.write_initial_count(value), + 0x3E0 => self.write_divide_config(value), + _ => {} + } + None + } + + /// Compute Processor Priority Register (PPR). + /// + /// PPR = max(TPR, highest ISR priority class) — determines the minimum + /// priority class that can be delivered. + fn processor_priority(&self) -> u8 { + let isr_class = Self::highest_bit(&self.isr) + .map(|v| v & 0xF0) + .unwrap_or(0); + let tpr_class = self.tpr & 0xF0; + std::cmp::max(isr_class, tpr_class) + } + + /// Find the highest set bit across an 8-word (256-bit) register. + /// Returns the bit index (0-255) or None if all zero. + fn highest_bit(reg: &[u32; 8]) -> Option { + for word_idx in (0..8).rev() { + let word = reg[word_idx]; + if word != 0 { + let bit = 31 - word.leading_zeros(); + return Some((word_idx as u8) * 32 + bit as u8); + } + } + None + } + + /// Read the LVT Timer register. + fn read_lvt_timer(&self) -> u32 { + let mut val = self.timer_vector as u32; + if self.timer_masked { + val |= LVT_MASKED; + } + if self.timer_mode == TimerMode::Periodic { + val |= 1 << 17; + } + val + } + + /// Write the LVT Timer register. + fn write_lvt_timer(&mut self, value: u32) { + self.timer_vector = (value & 0xFF) as u8; + self.timer_masked = value & LVT_MASKED != 0; + self.timer_mode = if value & (1 << 17) != 0 { + TimerMode::Periodic + } else { + TimerMode::OneShot + }; + } + + /// Write the Timer Initial Count register. + fn write_initial_count(&mut self, value: u32) { + self.timer_initial = value; + if value == 0 { + self.timer_deadline = None; + return; + } + + // Compute timer period: initial_count * divisor * base_period. + // Base period is ~100ns (approximation of bus clock period). + // This gives reasonable timer behavior for Linux's LAPIC timer driver. + let ticks = value as u64 * self.timer_divisor as u64; + self.timer_period_ns = ticks * 100; // ~100ns per bus clock tick + let period = std::time::Duration::from_nanos(self.timer_period_ns); + self.timer_deadline = Some(Instant::now() + period); + } + + /// Write the Timer Divide Configuration register. + fn write_divide_config(&mut self, value: u32) { + self.timer_divide_reg = value & 0x0B; // Only bits 0,1,3 are used. + // Decode divisor: bits [3,1,0] encode the divisor. + // 0b000=2, 0b001=4, 0b010=8, 0b011=16, + // 0b100=32, 0b101=64, 0b110=128, 0b111=1 + let div_bits = ((value & 0x08) >> 1) | (value & 0x03); + self.timer_divisor = match div_bits { + 0b000 => 2, + 0b001 => 4, + 0b010 => 8, + 0b011 => 16, + 0b100 => 32, + 0b101 => 64, + 0b110 => 128, + 0b111 => 1, + _ => 2, + }; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_lapic_initial_state() { + let lapic = LocalApic::new(); + assert_eq!(lapic.id, 0); + assert!(!lapic.is_enabled()); + assert_eq!(lapic.tpr, 0); + assert!(lapic.timer_masked); + } + + #[test] + fn test_lapic_enable_via_svr() { + let mut lapic = LocalApic::new(); + lapic.write_mmio(0x0F0, SVR_APIC_ENABLE | 0xFF); + assert!(lapic.is_enabled()); + } + + #[test] + fn test_lapic_accept_and_get_injectable() { + let mut lapic = LocalApic::new(); + + // Accept vector 0x30. + lapic.accept_interrupt(0x30); + assert_eq!(lapic.get_highest_injectable(), Some(0x30)); + } + + #[test] + fn test_lapic_priority_ordering() { + let mut lapic = LocalApic::new(); + + // Accept vectors 0x30 and 0x50 — 0x50 has higher priority. + lapic.accept_interrupt(0x30); + lapic.accept_interrupt(0x50); + assert_eq!(lapic.get_highest_injectable(), Some(0x50)); + } + + #[test] + fn test_lapic_isr_blocks_lower_priority() { + let mut lapic = LocalApic::new(); + + // Put vector 0x50 in service. + lapic.accept_interrupt(0x50); + lapic.start_of_interrupt(0x50); + + // Accept vector 0x30 — lower priority class, should be blocked. + lapic.accept_interrupt(0x30); + assert_eq!(lapic.get_highest_injectable(), None); + + // Accept vector 0x60 — higher priority, should be injectable. + lapic.accept_interrupt(0x60); + assert_eq!(lapic.get_highest_injectable(), Some(0x60)); + } + + #[test] + fn test_lapic_tpr_blocks_low_priority() { + let mut lapic = LocalApic::new(); + + // Set TPR to class 5 (0x50) — blocks vectors 0x00-0x5F. + lapic.write_mmio(0x080, 0x50); + + lapic.accept_interrupt(0x30); + assert_eq!(lapic.get_highest_injectable(), None); + + lapic.accept_interrupt(0x60); + assert_eq!(lapic.get_highest_injectable(), Some(0x60)); + } + + #[test] + fn test_lapic_start_of_interrupt() { + let mut lapic = LocalApic::new(); + + lapic.accept_interrupt(0x30); + assert!(lapic.irr[1] & (1 << 16) != 0); // 0x30 = word 1, bit 16 + + lapic.start_of_interrupt(0x30); + assert_eq!(lapic.irr[1] & (1 << 16), 0, "IRR should be cleared"); + assert!(lapic.isr[1] & (1 << 16) != 0, "ISR should be set"); + } + + #[test] + fn test_lapic_eoi_clears_isr() { + let mut lapic = LocalApic::new(); + + lapic.accept_interrupt(0x30); + lapic.start_of_interrupt(0x30); + + let vector = lapic.end_of_interrupt(); + assert_eq!(vector, Some(0x30)); + assert_eq!(lapic.isr[1] & (1 << 16), 0, "ISR should be cleared"); + } + + #[test] + fn test_lapic_eoi_clears_highest_isr() { + let mut lapic = LocalApic::new(); + + // Put two vectors in service. + lapic.accept_interrupt(0x30); + lapic.start_of_interrupt(0x30); + lapic.accept_interrupt(0x50); + lapic.start_of_interrupt(0x50); + + // EOI clears highest (0x50). + let vector = lapic.end_of_interrupt(); + assert_eq!(vector, Some(0x50)); + + // Next EOI clears 0x30. + let vector = lapic.end_of_interrupt(); + assert_eq!(vector, Some(0x30)); + } + + #[test] + fn test_lapic_eoi_empty_isr() { + let mut lapic = LocalApic::new(); + assert_eq!(lapic.end_of_interrupt(), None); + } + + #[test] + fn test_lapic_mmio_read_id() { + let lapic = LocalApic::new(); + assert_eq!(lapic.read_mmio(0x020), 0); // ID = 0, shifted left 24 + } + + #[test] + fn test_lapic_mmio_read_version() { + let lapic = LocalApic::new(); + let version = lapic.read_mmio(0x030); + assert_eq!(version & 0xFF, 0x14); + } + + #[test] + fn test_lapic_mmio_svr_roundtrip() { + let mut lapic = LocalApic::new(); + lapic.write_mmio(0x0F0, 0x1FF); + assert_eq!(lapic.read_mmio(0x0F0), 0x1FF); + } + + #[test] + fn test_lapic_mmio_tpr_roundtrip() { + let mut lapic = LocalApic::new(); + lapic.write_mmio(0x080, 0x40); + assert_eq!(lapic.read_mmio(0x080), 0x40); + } + + #[test] + fn test_lapic_mmio_eoi_returns_vector() { + let mut lapic = LocalApic::new(); + lapic.accept_interrupt(0x30); + lapic.start_of_interrupt(0x30); + + let vector = lapic.write_mmio(0x0B0, 0); + assert_eq!(vector, Some(0x30)); + } + + #[test] + fn test_lapic_mmio_isr_read() { + let mut lapic = LocalApic::new(); + lapic.accept_interrupt(0x30); + lapic.start_of_interrupt(0x30); + + // 0x30 = word 1 (offset 0x110) + assert_ne!(lapic.read_mmio(0x110), 0); + assert_eq!(lapic.read_mmio(0x100), 0); // Word 0 should be empty. + } + + #[test] + fn test_lapic_mmio_irr_read() { + let mut lapic = LocalApic::new(); + lapic.accept_interrupt(0x30); + + // 0x30 = word 1 (offset 0x210) + assert_ne!(lapic.read_mmio(0x210), 0); + assert_eq!(lapic.read_mmio(0x200), 0); + } + + #[test] + fn test_lapic_lvt_timer_write_read() { + let mut lapic = LocalApic::new(); + + // Set timer: vector=0x20, periodic, unmasked. + let lvt = 0x20 | (1 << 17); // vector=0x20, periodic + lapic.write_mmio(0x320, lvt); + + let read = lapic.read_mmio(0x320); + assert_eq!(read & 0xFF, 0x20); + assert!(read & (1 << 17) != 0, "periodic bit"); + assert!(read & LVT_MASKED == 0, "should be unmasked"); + } + + #[test] + fn test_lapic_timer_divide_config() { + let mut lapic = LocalApic::new(); + + // Divisor = 1 (bits [3,1,0] = 0b111 → register value = 0b1011 = 0x0B) + lapic.write_mmio(0x3E0, 0x0B); + assert_eq!(lapic.timer_divisor, 1); + + // Divisor = 16 (bits [3,1,0] = 0b011 → register value = 0b0011 = 0x03) + lapic.write_mmio(0x3E0, 0x03); + assert_eq!(lapic.timer_divisor, 16); + } + + #[test] + fn test_lapic_timer_fires_oneshot() { + let mut lapic = LocalApic::new(); + + // Configure: vector=0x20, oneshot, unmasked, divisor=1 + lapic.write_mmio(0x320, 0x20); // vector=0x20, oneshot, unmasked + lapic.write_mmio(0x3E0, 0x0B); // divisor=1 + + // Set initial count → arms the timer. + lapic.write_mmio(0x380, 1); // count=1 + + // Timer should fire after some time. + let future = Instant::now() + std::time::Duration::from_millis(100); + let vector = lapic.tick_timer(future); + assert_eq!(vector, Some(0x20)); + + // Second tick should not fire (oneshot). + let vector = lapic.tick_timer(future + std::time::Duration::from_millis(100)); + assert_eq!(vector, None); + } + + #[test] + fn test_lapic_timer_masked_no_fire() { + let mut lapic = LocalApic::new(); + + // Configure: masked + lapic.write_mmio(0x320, 0x20 | LVT_MASKED); + lapic.write_mmio(0x3E0, 0x0B); + lapic.write_mmio(0x380, 1); + + let future = Instant::now() + std::time::Duration::from_millis(100); + assert_eq!(lapic.tick_timer(future), None); + } + + #[test] + fn test_lapic_timer_zero_count_disarms() { + let mut lapic = LocalApic::new(); + + lapic.write_mmio(0x320, 0x20); + lapic.write_mmio(0x3E0, 0x0B); + lapic.write_mmio(0x380, 0); // count=0 disarms + + let future = Instant::now() + std::time::Duration::from_millis(100); + assert_eq!(lapic.tick_timer(future), None); + } + + #[test] + fn test_lapic_highest_bit() { + let mut reg = [0u32; 8]; + assert_eq!(LocalApic::highest_bit(®), None); + + reg[0] = 1; // bit 0 + assert_eq!(LocalApic::highest_bit(®), Some(0)); + + reg[7] = 1 << 31; // bit 255 + assert_eq!(LocalApic::highest_bit(®), Some(255)); + + reg[3] = 1 << 16; // bit 112 + // Highest should still be 255. + assert_eq!(LocalApic::highest_bit(®), Some(255)); + } + + #[test] + fn test_lapic_processor_priority() { + let mut lapic = LocalApic::new(); + + // No ISR, TPR=0 → PPR=0. + assert_eq!(lapic.processor_priority(), 0); + + // TPR=0x40 → PPR=0x40. + lapic.tpr = 0x40; + assert_eq!(lapic.processor_priority(), 0x40); + + // ISR has 0x50 → PPR=max(0x40, 0x50)=0x50. + lapic.accept_interrupt(0x50); + lapic.start_of_interrupt(0x50); + assert_eq!(lapic.processor_priority(), 0x50); + } + + #[test] + fn test_lapic_mmio_invalid_offset() { + let mut lapic = LocalApic::new(); + assert_eq!(lapic.read_mmio(0x400), 0); + assert_eq!(lapic.write_mmio(0x400, 0xDEAD), None); + } + + #[test] + fn test_lapic_mmio_isr_non_aligned() { + let lapic = LocalApic::new(); + // Non-16-byte-aligned ISR offset should return 0. + assert_eq!(lapic.read_mmio(0x104), 0); + } +} diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 09436e7a2..5ffab4845 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -14,7 +14,7 @@ use super::super::cmdline::{irq_for_slot, mmio_base_for_slot, MmioSlot, MMIO_SLO use super::super::context::VmContext; use super::super::error::{Result, WkrunError}; use super::super::vcpu::IoHandler; -use super::pic::Pic; +use super::irq_chip::IrqChip; use super::pit::Pit; use super::serial::{Serial, COM1_BASE}; use super::virtio::block::VirtioBlock; @@ -200,7 +200,7 @@ pub struct DeviceSetup { /// Centralized device manager for all emulated devices. pub struct DeviceManager { serial: Serial, - pub pic: Pic, + pub irq_chip: IrqChip, pit: Pit, cmos_addr: u8, @@ -219,6 +219,10 @@ pub struct DeviceManager { blk_queue_notify_count: u64, /// Diagnostic: count block I/O completions drained. blk_completion_count: u64, + /// Diagnostic: count MMIO accesses to IOAPIC range. + ioapic_mmio_count: u64, + /// Diagnostic: count MMIO accesses to LAPIC range. + lapic_mmio_count: u64, /// Track whether we've requested an interrupt window. window_requested: bool, @@ -361,7 +365,7 @@ impl DeviceManager { let devices = DeviceManager { serial, - pic: Pic::new(), + irq_chip: IrqChip::new(), pit: Pit::new(), cmos_addr: 0, virtio_blk, @@ -371,6 +375,8 @@ impl DeviceManager { virtio_blk2, blk_queue_notify_count: 0, blk_completion_count: 0, + ioapic_mmio_count: 0, + lapic_mmio_count: 0, window_requested: false, last_tick: Instant::now(), port61_toggle: false, @@ -392,11 +398,11 @@ impl DeviceManager { if self.serial.handles_port(port) { self.serial.io_write(port, size, data); if self.serial.has_interrupt() { - self.pic.raise_irq(4); + self.irq_chip.raise_irq(4); } - } else if self.pic.handles_port(port) { + } else if self.irq_chip.pic.handles_port(port) { log::trace!("PIC write: port={:#X} data={:#X}", port, data as u8); - self.pic.write_port(port, data as u8); + self.irq_chip.pic.write_port(port, data as u8); } else if self.pit.handles_port(port) { log::trace!("PIT write: port={:#X} data={:#X}", port, data as u8); self.pit.write_port(port, data as u8); @@ -422,11 +428,11 @@ impl DeviceManager { if self.serial.handles_port(port) { let val = self.serial.io_read(port, size); if self.serial.has_interrupt() { - self.pic.raise_irq(4); + self.irq_chip.raise_irq(4); } val - } else if self.pic.handles_port(port) { - self.pic.read_port(port) as u32 + } else if self.irq_chip.pic.handles_port(port) { + self.irq_chip.pic.read_port(port) as u32 } else if self.pit.handles_port(port) { self.pit.read_port(port) as u32 } else if port == 0x71 { @@ -467,7 +473,18 @@ impl DeviceManager { /// Handle an MMIO read from the guest. /// /// Returns the data to inject into the destination register. - pub fn handle_mmio_read(&self, address: u64, size: u8) -> u64 { + pub fn handle_mmio_read(&mut self, address: u64, size: u8) -> u64 { + // Check IOAPIC/LAPIC ranges first. + if let Some(val) = self.irq_chip.handle_mmio_read(address, size) { + use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; + if address >= IOAPIC_MMIO_BASE && address < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { + self.ioapic_mmio_count += 1; + } else if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { + self.lapic_mmio_count += 1; + } + return val as u64; + } + let blk_offset = address.wrapping_sub(mmio_base_for_slot(0)); let vsock_offset = address.wrapping_sub(mmio_base_for_slot(1)); let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); @@ -515,6 +532,17 @@ impl DeviceManager { data: u64, mem: &dyn GuestMemoryAccessor, ) { + // Check IOAPIC/LAPIC ranges first. + if self.irq_chip.handle_mmio_write(address, size as u8, data as u32) { + use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; + if address >= IOAPIC_MMIO_BASE && address < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { + self.ioapic_mmio_count += 1; + } else if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { + self.lapic_mmio_count += 1; + } + return; + } + let blk_offset = address.wrapping_sub(mmio_base_for_slot(0)); let vsock_offset = address.wrapping_sub(mmio_base_for_slot(1)); let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); @@ -527,7 +555,7 @@ impl DeviceManager { } if let Some(ref mut dev) = self.virtio_blk { if dev.write(blk_offset, data as u32, size, mem) { - self.pic.raise_irq(irq_for_slot(0)); + self.irq_chip.raise_irq(irq_for_slot(0)); } } } else if vsock_offset < MMIO_SLOT_SIZE { @@ -535,18 +563,18 @@ impl DeviceManager { .virtio_vsock .write(vsock_offset, data as u32, size, mem) { - self.pic.raise_irq(irq_for_slot(1)); + self.irq_chip.raise_irq(irq_for_slot(1)); } } else if p9_offset < MMIO_SLOT_SIZE { if let Some(ref mut dev) = self.virtio_9p { if dev.write(p9_offset, data as u32, size, mem) { - self.pic.raise_irq(irq_for_slot(2)); + self.irq_chip.raise_irq(irq_for_slot(2)); } } } else if net_offset < MMIO_SLOT_SIZE { if let Some(ref mut dev) = self.virtio_net { if dev.write(net_offset, data as u32, size, mem) { - self.pic.raise_irq(irq_for_slot(3)); + self.irq_chip.raise_irq(irq_for_slot(3)); } } } else if blk2_offset < MMIO_SLOT_SIZE { @@ -555,7 +583,7 @@ impl DeviceManager { } if let Some(ref mut dev) = self.virtio_blk2 { if dev.write(blk2_offset, data as u32, size, mem) { - self.pic.raise_irq(irq_for_slot(4)); + self.irq_chip.raise_irq(irq_for_slot(4)); } } } @@ -598,34 +626,37 @@ impl DeviceManager { if elapsed_ns > 0 { let fires = self.pit.tick(elapsed_ns); for _ in 0..fires { - self.pic.raise_irq(0); + self.irq_chip.raise_irq(0); } } + // Tick LAPIC timer (only fires in APIC mode). + self.irq_chip.tick_timer(now); + // Drain async block I/O completions. if let Some(ref mut dev) = self.virtio_blk { if dev.poll_backend(mem) { self.blk_completion_count += 1; - self.pic.raise_irq(irq_for_slot(0)); + self.irq_chip.raise_irq(irq_for_slot(0)); } } if let Some(ref mut dev) = self.virtio_blk2 { if dev.poll_backend(mem) { self.blk_completion_count += 1; - self.pic.raise_irq(irq_for_slot(4)); + self.irq_chip.raise_irq(irq_for_slot(4)); } } // Poll vsock for host-initiated data. if self.virtio_vsock.poll(mem) { log::debug!("vsock poll raised IRQ {}", irq_for_slot(1)); - self.pic.raise_irq(irq_for_slot(1)); + self.irq_chip.raise_irq(irq_for_slot(1)); } // Poll net for incoming frames. if let Some(ref mut dev) = self.virtio_net { if dev.poll(mem) { - self.pic.raise_irq(irq_for_slot(3)); + self.irq_chip.raise_irq(irq_for_slot(3)); } } } @@ -669,6 +700,11 @@ impl DeviceManager { (self.blk_queue_notify_count, self.blk_completion_count) } + /// Get IOAPIC/LAPIC MMIO access counts for diagnostics. + pub fn apic_mmio_stats(&self) -> (u64, u64) { + (self.ioapic_mmio_count, self.lapic_mmio_count) + } + /// Whether an ACPI S5 shutdown was detected. pub fn shutdown_requested(&self) -> bool { self.shutdown_requested @@ -690,7 +726,7 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { let vsock_backend = VirtioVsock::new(GUEST_CID); DeviceManager { serial, - pic: Pic::new(), + irq_chip: IrqChip::new(), pit: Pit::new(), cmos_addr: 0, virtio_blk: None, @@ -700,6 +736,8 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { virtio_blk2: None, blk_queue_notify_count: 0, blk_completion_count: 0, + ioapic_mmio_count: 0, + lapic_mmio_count: 0, window_requested: false, last_tick: Instant::now(), port61_toggle: false, diff --git a/src/vmm/src/windows/devices/mod.rs b/src/vmm/src/windows/devices/mod.rs index d5add51ba..f54460b49 100644 --- a/src/vmm/src/windows/devices/mod.rs +++ b/src/vmm/src/windows/devices/mod.rs @@ -1,5 +1,8 @@ //! Device emulation for the guest VM. +pub mod ioapic; +pub mod irq_chip; +pub mod lapic; pub mod manager; pub mod pic; pub mod pit; diff --git a/src/vmm/src/windows/memory.rs b/src/vmm/src/windows/memory.rs index 7a24c6b54..973e25f5b 100644 --- a/src/vmm/src/windows/memory.rs +++ b/src/vmm/src/windows/memory.rs @@ -47,6 +47,18 @@ pub const VIRTIO_MMIO_BASE: u64 = 0xD000_0000; /// 2MB provides room for many devices and aligns with 2MB page table granularity. pub const MMIO_REGION_SIZE: u64 = 0x20_0000; +/// IOAPIC MMIO base address. +pub const IOAPIC_MMIO_BASE: u64 = 0xFEC0_0000; + +/// IOAPIC MMIO region size (4 KB). +pub const IOAPIC_MMIO_SIZE: u64 = 0x1000; + +/// LAPIC MMIO base address. +pub const LAPIC_MMIO_BASE: u64 = 0xFEE0_0000; + +/// LAPIC MMIO region size (4 KB). +pub const LAPIC_MMIO_SIZE: u64 = 0x1000; + // Windows-specific guest memory allocation and mapping. #[cfg(target_os = "windows")] mod imp { diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index c00d6ca96..d449b96e1 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -139,12 +139,16 @@ mod imp { canceller_slot: Arc>>, ) -> Result { // Open a diagnostic log file for debugging boot failures. - // Written to a fixed path that persists across box lifecycle. + // Uses TEMP directory so it works on any Windows machine. let mut diag_log: Option = None; + let diag_path = format!( + "{}\\whpx-diag.log", + std::env::var("TEMP").unwrap_or_else(|_| r"C:\Temp".to_string()) + ); if let Ok(f) = std::fs::OpenOptions::new() .create(true) .append(true) - .open(r"C:\ws-boxlite\whpx-diag.log") + .open(&diag_path) { diag_log = Some(f); } @@ -297,7 +301,7 @@ mod imp { // injection is still pending in WHPX. Overwriting the pending // interruption register would lose the old interrupt and leave // its PIC ISR bit permanently stuck (guest never sends EOI). - if devices.pic.has_pending() { + if devices.irq_chip.has_pending() { let already_pending = vcpu .has_pending_interruption() .unwrap_or(false); @@ -306,9 +310,10 @@ mod imp { } else { match vcpu.interrupts_enabled() { Ok(true) => { - if let Some(vector) = devices.pic.acknowledge() { + if let Some(vector) = devices.irq_chip.acknowledge() { log::debug!("Injecting interrupt vector {:#X}", vector); vcpu.inject_interrupt(vector)?; + devices.irq_chip.notify_injected(vector); devices.set_window_requested(false); inject_count += 1; } @@ -414,13 +419,14 @@ mod imp { // was halted. devices.tick_and_poll(mem_ref); - if devices.pic.has_pending() { + if devices.irq_chip.has_pending() { let already_pending = vcpu .has_pending_interruption() .unwrap_or(false); if !already_pending { - if let Some(vector) = devices.pic.acknowledge() { + if let Some(vector) = devices.irq_chip.acknowledge() { vcpu.inject_interrupt(vector)?; + devices.irq_chip.notify_injected(vector); devices.set_window_requested(false); inject_count += 1; } @@ -496,18 +502,22 @@ mod imp { .map(|b| b.len()) .unwrap_or(0); let (qn, bc) = devices.blk_stats(); - let (irr, isr, imr, vbase) = devices.pic.master_state(); - let (s_irr, s_isr, s_imr, s_vbase) = devices.pic.slave_state(); + let (ioapic_mmio, lapic_mmio) = devices.apic_mmio_stats(); + let (irr, isr, imr, vbase) = devices.irq_chip.pic_master_state(); + let (s_irr, s_isr, s_imr, s_vbase) = devices.irq_chip.pic_slave_state(); + let apic_mode = devices.irq_chip.apic_mode(); let msg = format!( - "Progress @ {:.1}s: exits={} RIP={:#X} console={}B io_out={} serial={} mmio={} blk_qn={} blk_comp={} halt={}/{} halt_w_irq={} inj={} pic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} spic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} mode={}", + "Progress @ {:.1}s: exits={} RIP={:#X} console={}B io_out={} serial={} mmio={} blk_qn={} blk_comp={} halt={}/{} halt_w_irq={} inj={} ioapic_mmio={} lapic_mmio={} pic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} spic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} irq={} mode={}", start_time.elapsed().as_secs_f64(), exit_count, regs.rip, console_len, io_out_count, serial_out_count, mmio_count, qn, bc, halt_count, total_halt_exits, halt_with_irq, inject_count, + ioapic_mmio, lapic_mmio, irr, isr, imr, vbase, s_irr, s_isr, s_imr, s_vbase, + if apic_mode { "apic" } else { "pic" }, if sync_block { "sync" } else if blk_workers_started { "async" } else { "pending" }, ); log::info!("{}", msg); From 92205ddc0c33cb8dadc1cf7e59d8068324d2f46d Mon Sep 17 00:00:00 2001 From: lile Date: Wed, 29 Apr 2026 21:22:48 +0800 Subject: [PATCH 21/33] feat(windows): multi-vCPU support (LAPIC array, ICR/IPI, multi-threaded runner) - LAPIC: per-vCPU instances with ICR register for IPI dispatch (INIT, SIPI, Fixed interrupt delivery modes) - IrqChip: multi-LAPIC array with per-vcpu_id routing - IOAPIC: returns (vector, dest_apic_id) from service_irq - DeviceManager: vcpu_id parameter threading, IpiAction returns - Runner: BSP + AP threads via std::thread::scope, Arc>, INIT-SIPI-SIPI protocol with Condvar wake, per-vCPU stats - MADT: dynamic N-LAPIC entry generation for num_vcpus - Cmdline: removed nosmp for SMP kernel boot - Loader: num_vcpus parameter threading to ACPI tables - WHPX: VcpuCanceller Clone, set_ap_initial_regs for real-mode AP startup - CPUID leaf 1: topology injection (max APIC IDs, initial APIC ID) - MSR 0x1B: IA32_APIC_BASE with enable + BSP flag - Hyper-V CPUID leaves masked to zeros Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/boot/acpi.rs | 150 ++- src/vmm/src/windows/boot/loader.rs | 6 +- src/vmm/src/windows/cmdline.rs | 27 +- src/vmm/src/windows/devices/ioapic.rs | 37 +- src/vmm/src/windows/devices/irq_chip.rs | 328 +++++-- src/vmm/src/windows/devices/lapic.rs | 279 +++++- src/vmm/src/windows/devices/manager.rs | 37 +- src/vmm/src/windows/runner.rs | 1136 ++++++++++++++++++----- src/vmm/src/windows/whpx.rs | 69 ++ 9 files changed, 1617 insertions(+), 452 deletions(-) diff --git a/src/vmm/src/windows/boot/acpi.rs b/src/vmm/src/windows/boot/acpi.rs index 265de49d8..59e9dbaa9 100644 --- a/src/vmm/src/windows/boot/acpi.rs +++ b/src/vmm/src/windows/boot/acpi.rs @@ -27,8 +27,17 @@ const MADT_HEADER_SIZE: usize = 44; // 36-byte ACPI header + 4-byte Local APIC A const MADT_LAPIC_ENTRY_SIZE: usize = 8; // Type 0: Processor Local APIC const MADT_IOAPIC_ENTRY_SIZE: usize = 12; // Type 1: I/O APIC const MADT_ISO_ENTRY_SIZE: usize = 10; // Type 2: Interrupt Source Override -const MADT_SIZE: usize = - MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE + MADT_IOAPIC_ENTRY_SIZE + MADT_ISO_ENTRY_SIZE; + +/// Compute the MADT size for a given number of vCPUs. +const fn madt_size(num_vcpus: u8) -> usize { + MADT_HEADER_SIZE + + MADT_LAPIC_ENTRY_SIZE * (num_vcpus as usize) + + MADT_IOAPIC_ENTRY_SIZE + + MADT_ISO_ENTRY_SIZE +} + +/// MADT size for the default single-vCPU case (used for static offset validation). +const MADT_SIZE_1: usize = madt_size(1); // ACPI PM1a I/O port addresses (must match manager.rs constants). const PM1A_EVT_BLK: u32 = 0x600; @@ -64,7 +73,9 @@ const S5_AML: &[u8] = &[ /// /// Returns a `Vec` of exactly `ACPI_REGION_SIZE` bytes. The caller /// writes this to guest memory at `acpi_base`. -pub fn build_acpi_tables(acpi_base: u64) -> Vec { +/// +/// `num_vcpus` determines how many LAPIC entries are generated in the MADT. +pub fn build_acpi_tables(acpi_base: u64, num_vcpus: u8) -> Vec { let mut region = vec![0u8; ACPI_REGION_SIZE as usize]; let rsdt_addr = acpi_base + RSDT_OFFSET as u64; @@ -141,24 +152,32 @@ pub fn build_acpi_tables(acpi_base: u64) -> Vec { acpi_checksum(&mut region[DSDT_OFFSET..DSDT_OFFSET + dsdt_size], 9); // ---- MADT (Multiple APIC Description Table) at offset 0x140 ---- - build_madt(&mut region[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]); + let madt_sz = madt_size(num_vcpus); + assert!( + MADT_OFFSET + madt_sz <= ACPI_REGION_SIZE as usize, + "MADT ({} bytes for {} vCPUs) exceeds ACPI region", + madt_sz, num_vcpus, + ); + build_madt(&mut region[MADT_OFFSET..MADT_OFFSET + madt_sz], num_vcpus); region } /// Build the MADT (Multiple APIC Description Table). /// -/// Tells the Linux kernel about the Local APIC and I/O APIC. +/// Tells the Linux kernel about the Local APIC(s) and I/O APIC. /// /// Structure: /// - Header (44 bytes): standard ACPI header + LAPIC address + flags -/// - Entry 0 - Local APIC (type 0, 8 bytes): Processor 0, APIC ID 0 -/// - Entry 1 - I/O APIC (type 1, 12 bytes): IOAPIC ID 0, base 0xFEC00000 -/// - Entry 2 - Interrupt Source Override (type 2, 10 bytes): IRQ 0 → GSI 2 -fn build_madt(madt: &mut [u8]) { +/// - N x Local APIC entries (type 0, 8 bytes each): one per vCPU +/// - I/O APIC entry (type 1, 12 bytes): IOAPIC ID 0, base 0xFEC00000 +/// - Interrupt Source Override (type 2, 10 bytes): IRQ 0 → GSI 2 +fn build_madt(madt: &mut [u8], num_vcpus: u8) { + let total_size = madt.len(); + // ACPI header. madt[0..4].copy_from_slice(b"APIC"); // Signature - madt[4..8].copy_from_slice(&(MADT_SIZE as u32).to_le_bytes()); // Length + madt[4..8].copy_from_slice(&(total_size as u32).to_le_bytes()); // Length madt[8] = 1; // Revision // madt[9] = checksum (computed below) madt[10..16].copy_from_slice(b"BOXLTE"); // OEMID @@ -173,16 +192,18 @@ fn build_madt(madt: &mut [u8]) { // Flags (offset 40, 4 bytes): PCAT_COMPAT = 1 (dual 8259 PICs present). madt[40..44].copy_from_slice(&1u32.to_le_bytes()); - // --- Entry 0: Processor Local APIC (type 0, 8 bytes) --- + // --- N x Processor Local APIC entries (type 0, 8 bytes each) --- let mut off = MADT_HEADER_SIZE; - madt[off] = 0; // Entry type: Processor Local APIC - madt[off + 1] = MADT_LAPIC_ENTRY_SIZE as u8; // Length - madt[off + 2] = 0; // ACPI Processor ID - madt[off + 3] = 0; // APIC ID - madt[off + 4..off + 8].copy_from_slice(&1u32.to_le_bytes()); // Flags: enabled - off += MADT_LAPIC_ENTRY_SIZE; - - // --- Entry 1: I/O APIC (type 1, 12 bytes) --- + for i in 0..num_vcpus { + madt[off] = 0; // Entry type: Processor Local APIC + madt[off + 1] = MADT_LAPIC_ENTRY_SIZE as u8; // Length + madt[off + 2] = i; // ACPI Processor ID + madt[off + 3] = i; // APIC ID + madt[off + 4..off + 8].copy_from_slice(&1u32.to_le_bytes()); // Flags: enabled + off += MADT_LAPIC_ENTRY_SIZE; + } + + // --- I/O APIC entry (type 1, 12 bytes) --- madt[off] = 1; // Entry type: I/O APIC madt[off + 1] = MADT_IOAPIC_ENTRY_SIZE as u8; // Length madt[off + 2] = 0; // I/O APIC ID @@ -191,7 +212,7 @@ fn build_madt(madt: &mut [u8]) { madt[off + 8..off + 12].copy_from_slice(&0u32.to_le_bytes()); // Global System Interrupt Base off += MADT_IOAPIC_ENTRY_SIZE; - // --- Entry 2: Interrupt Source Override (type 2, 10 bytes) --- + // --- Interrupt Source Override (type 2, 10 bytes) --- // Standard x86 convention: PIT timer (IRQ 0) routes to IOAPIC pin 2. madt[off] = 2; // Entry type: Interrupt Source Override madt[off + 1] = MADT_ISO_ENTRY_SIZE as u8; // Length @@ -221,7 +242,7 @@ mod tests { #[test] fn test_rsdp_signature_and_checksum() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let rsdp = ®ion[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE]; assert_eq!(&rsdp[0..8], b"RSD PTR "); @@ -232,7 +253,7 @@ mod tests { #[test] fn test_rsdt_signature_and_length() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let rsdt = ®ion[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE]; assert_eq!(&rsdt[0..4], b"RSDT"); @@ -245,7 +266,7 @@ mod tests { #[test] fn test_rsdt_has_two_entries() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let rsdt = ®ion[RSDT_OFFSET..RSDT_OFFSET + RSDT_SIZE]; // Entry[0]: FADT pointer. @@ -259,7 +280,7 @@ mod tests { #[test] fn test_fadt_signature_and_pm1a_cnt() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let fadt = ®ion[FADT_OFFSET..FADT_OFFSET + FADT_SIZE]; assert_eq!(&fadt[0..4], b"FACP"); @@ -279,7 +300,7 @@ mod tests { #[test] fn test_dsdt_contains_s5_package() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let dsdt_size = DSDT_HEADER_SIZE + S5_AML.len(); let dsdt = ®ion[DSDT_OFFSET..DSDT_OFFSET + dsdt_size]; @@ -295,13 +316,13 @@ mod tests { #[test] fn test_total_region_size() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); assert_eq!(region.len(), ACPI_REGION_SIZE as usize); } #[test] fn test_rsdp_points_to_rsdt() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let rsdp = ®ion[RSDP_OFFSET..RSDP_OFFSET + RSDP_SIZE]; let rsdt_addr = u32::from_le_bytes(rsdp[16..20].try_into().unwrap()); assert_eq!(rsdt_addr, (TEST_BASE + RSDT_OFFSET as u64) as u32); @@ -309,7 +330,7 @@ mod tests { #[test] fn test_fadt_sci_int() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let fadt = ®ion[FADT_OFFSET..FADT_OFFSET + FADT_SIZE]; let sci_int = u16::from_le_bytes(fadt[46..48].try_into().unwrap()); assert_eq!(sci_int, 11, "SCI_INT must be on an unused IRQ"); @@ -317,7 +338,7 @@ mod tests { #[test] fn test_fadt_points_to_dsdt() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let fadt = ®ion[FADT_OFFSET..FADT_OFFSET + FADT_SIZE]; let dsdt_addr = u32::from_le_bytes(fadt[40..44].try_into().unwrap()); assert_eq!(dsdt_addr, (TEST_BASE + DSDT_OFFSET as u64) as u32); @@ -327,13 +348,13 @@ mod tests { #[test] fn test_madt_signature_and_checksum() { - let region = build_acpi_tables(TEST_BASE); - let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]; + let region = build_acpi_tables(TEST_BASE, 1); + let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE_1]; assert_eq!(&madt[0..4], b"APIC"); let length = u32::from_le_bytes(madt[4..8].try_into().unwrap()); - assert_eq!(length, MADT_SIZE as u32); + assert_eq!(length, MADT_SIZE_1 as u32); let sum: u8 = madt.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); assert_eq!(sum, 0, "MADT checksum must be zero"); @@ -341,8 +362,8 @@ mod tests { #[test] fn test_madt_lapic_address() { - let region = build_acpi_tables(TEST_BASE); - let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]; + let region = build_acpi_tables(TEST_BASE, 1); + let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE_1]; let lapic_addr = u32::from_le_bytes(madt[36..40].try_into().unwrap()); assert_eq!(lapic_addr, LAPIC_BASE); @@ -350,8 +371,8 @@ mod tests { #[test] fn test_madt_pcat_compat_flag() { - let region = build_acpi_tables(TEST_BASE); - let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE]; + let region = build_acpi_tables(TEST_BASE, 1); + let madt = ®ion[MADT_OFFSET..MADT_OFFSET + MADT_SIZE_1]; let flags = u32::from_le_bytes(madt[40..44].try_into().unwrap()); assert_eq!(flags, 1, "PCAT_COMPAT flag must be set"); @@ -359,7 +380,7 @@ mod tests { #[test] fn test_madt_lapic_entry() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let off = MADT_OFFSET + MADT_HEADER_SIZE; assert_eq!(region[off], 0, "entry type: Local APIC"); @@ -372,7 +393,7 @@ mod tests { #[test] fn test_madt_ioapic_entry() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let off = MADT_OFFSET + MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE; assert_eq!(region[off], 1, "entry type: I/O APIC"); @@ -386,7 +407,7 @@ mod tests { #[test] fn test_madt_interrupt_source_override() { - let region = build_acpi_tables(TEST_BASE); + let region = build_acpi_tables(TEST_BASE, 1); let off = MADT_OFFSET + MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE + MADT_IOAPIC_ENTRY_SIZE; @@ -409,7 +430,7 @@ mod tests { ("RSDT", RSDT_OFFSET, RSDT_OFFSET + RSDT_SIZE), ("FADT", FADT_OFFSET, FADT_OFFSET + FADT_SIZE), ("DSDT", DSDT_OFFSET, DSDT_OFFSET + dsdt_size), - ("MADT", MADT_OFFSET, MADT_OFFSET + MADT_SIZE), + ("MADT", MADT_OFFSET, MADT_OFFSET + MADT_SIZE_1), ]; for i in 0..tables.len() { @@ -433,7 +454,7 @@ mod tests { #[test] fn test_all_tables_fit_in_region() { let dsdt_size = DSDT_HEADER_SIZE + S5_AML.len(); - let last_table_end = MADT_OFFSET + MADT_SIZE; + let last_table_end = MADT_OFFSET + MADT_SIZE_1; assert!( last_table_end <= ACPI_REGION_SIZE as usize, "tables extend beyond region: {} > {}", @@ -443,4 +464,51 @@ mod tests { // Also verify DSDT doesn't extend into MADT. assert!(DSDT_OFFSET + dsdt_size <= MADT_OFFSET); } + + // ---- Multi-vCPU MADT tests ---- + + #[test] + fn test_madt_multi_vcpu_lapic_entries() { + let region = build_acpi_tables(TEST_BASE, 4); + let madt_sz = madt_size(4); + let madt = ®ion[MADT_OFFSET..MADT_OFFSET + madt_sz]; + + // Verify MADT length field matches. + let length = u32::from_le_bytes(madt[4..8].try_into().unwrap()); + assert_eq!(length, madt_sz as u32); + + // Verify checksum. + let sum: u8 = madt.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "MADT checksum must be zero for 4 vCPUs"); + + // Verify 4 LAPIC entries with correct IDs. + for i in 0..4u8 { + let off = MADT_HEADER_SIZE + (i as usize) * MADT_LAPIC_ENTRY_SIZE; + assert_eq!(madt[off], 0, "entry type: Local APIC for vCPU {}", i); + assert_eq!(madt[off + 1], 8, "entry length for vCPU {}", i); + assert_eq!(madt[off + 2], i, "ACPI Processor ID for vCPU {}", i); + assert_eq!(madt[off + 3], i, "APIC ID for vCPU {}", i); + let flags = u32::from_le_bytes(madt[off + 4..off + 8].try_into().unwrap()); + assert_eq!(flags, 1, "enabled flag for vCPU {}", i); + } + + // Verify IOAPIC entry follows the 4 LAPIC entries. + let ioapic_off = MADT_HEADER_SIZE + 4 * MADT_LAPIC_ENTRY_SIZE; + assert_eq!(madt[ioapic_off], 1, "entry type: I/O APIC"); + } + + #[test] + fn test_madt_size_scales_with_vcpus() { + assert_eq!(madt_size(1), MADT_SIZE_1); + assert_eq!( + madt_size(2), + MADT_SIZE_1 + MADT_LAPIC_ENTRY_SIZE, + "2 vCPUs adds one more LAPIC entry" + ); + assert_eq!( + madt_size(4), + MADT_SIZE_1 + 3 * MADT_LAPIC_ENTRY_SIZE, + "4 vCPUs adds three more LAPIC entries" + ); + } } diff --git a/src/vmm/src/windows/boot/loader.rs b/src/vmm/src/windows/boot/loader.rs index b0da1dc1b..c43a9afe8 100644 --- a/src/vmm/src/windows/boot/loader.rs +++ b/src/vmm/src/windows/boot/loader.rs @@ -224,8 +224,9 @@ pub fn load_kernel( kernel_image: &[u8], cmdline: &str, ram_mib: u32, + num_vcpus: u8, ) -> Result<(StandardRegisters, SpecialRegisters)> { - load_kernel_with_initrd(guest_mem, kernel_image, cmdline, ram_mib, None) + load_kernel_with_initrd(guest_mem, kernel_image, cmdline, ram_mib, None, num_vcpus) } /// Load a Linux bzImage kernel with an optional initrd. @@ -236,6 +237,7 @@ pub fn load_kernel_with_initrd( cmdline: &str, ram_mib: u32, initrd: Option<&[u8]>, + num_vcpus: u8, ) -> Result<(StandardRegisters, SpecialRegisters)> { let header = parse_bzimage(kernel_image)?; @@ -294,7 +296,7 @@ pub fn load_kernel_with_initrd( boot_params.set_cmdline_size(cmdline_bytes.len() as u32); // Write ACPI tables to guest memory. - let acpi_data = acpi::build_acpi_tables(ACPI_START); + let acpi_data = acpi::build_acpi_tables(ACPI_START, num_vcpus); guest_mem.write_at_addr(ACPI_START, &acpi_data)?; // Set E820 memory map. diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index 7cfd26b29..bac75f617 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -17,19 +17,19 @@ pub const FIRST_MMIO_IRQ: u8 = 5; /// - `console=ttyS0`: Route kernel console to serial port (required — no VGA). /// - `quiet loglevel=1`: Suppress kernel printk to console. /// - `i8042.nokbd i8042.noaux`: Skip PS/2 keyboard/mouse probe (10K+ exits). -/// - `nosmp`: Single vCPU mode (multi-vCPU deferred to Iter 3). /// - `nohyperv`: Disable Hyper-V guest enlightenments. WHPX exposes Hyper-V /// CPUID leaves but doesn't fully support synthetic timers/SynIC, causing /// clock stalls if the kernel tries to use them. /// - `lpj=1000000`: Preset loops_per_jiffy to skip delay calibration, which /// depends on a reliable timer source. /// - `nokaslr`: Disable kernel address space randomization for deterministic -/// boot in our controlled single-vCPU environment. +/// boot in our controlled environment. /// /// Note: `noapic` and `nolapic` are NOT present — the MADT table in ACPI /// tells the kernel about the IOAPIC and LAPIC for APIC-mode interrupt routing. +/// Note: `nosmp` is NOT present — multi-vCPU is supported via MADT LAPIC entries. const BASE_CMDLINE: &str = - "console=ttyS0 quiet loglevel=1 i8042.nokbd i8042.noaux nosmp nohyperv lpj=1000000 nokaslr"; + "console=ttyS0 quiet loglevel=1 i8042.nokbd i8042.noaux nohyperv lpj=1000000 nokaslr"; /// Serial console parameters appended in verbose mode. /// @@ -71,10 +71,7 @@ pub fn build_kernel_cmdline( let mut cmdline = if verbose { // Verbose mode: serial console + full i8042 probe for debugging. // No noapic/nolapic — APIC mode is enabled via MADT (same as quiet mode). - format!( - "{} nosmp nohyperv lpj=1000000 nokaslr", - VERBOSE_CONSOLE - ) + format!("{} nohyperv lpj=1000000 nokaslr", VERBOSE_CONSOLE) } else { BASE_CMDLINE.to_string() }; @@ -242,6 +239,22 @@ mod tests { assert!(!cmdline.contains("noacpi")); } + #[test] + fn test_cmdline_no_nosmp() { + // nosmp must NOT be present — multi-vCPU is supported via MADT LAPIC entries. + let quiet = build_simple(None, false, &[]); + assert!( + !quiet.contains("nosmp"), + "quiet cmdline must not contain nosmp (multi-vCPU enabled)" + ); + + let verbose = build_kernel_cmdline(None, false, &[], None, None, None, &[], true); + assert!( + !verbose.contains("nosmp"), + "verbose cmdline must not contain nosmp (multi-vCPU enabled)" + ); + } + #[test] fn test_cmdline_no_noacpi_no_noapic() { // Verify neither quiet nor verbose mode includes noacpi or noapic. diff --git a/src/vmm/src/windows/devices/ioapic.rs b/src/vmm/src/windows/devices/ioapic.rs index 31c6910f5..dc6779bcc 100644 --- a/src/vmm/src/windows/devices/ioapic.rs +++ b/src/vmm/src/windows/devices/ioapic.rs @@ -131,12 +131,12 @@ impl IoApic { } } - /// Process an IRQ signal. Returns the vector to deliver if the interrupt + /// Process an IRQ signal. Returns `(vector, dest_apic_id)` if the interrupt /// is deliverable, or None if masked/blocked. /// /// - Edge-triggered: deliver if not masked, set pin state. /// - Level-triggered: deliver if not masked AND remote_irr not set. - pub fn service_irq(&mut self, irq: u8, level: bool) -> Option { + pub fn service_irq(&mut self, irq: u8, level: bool) -> Option<(u8, u8)> { if irq as usize >= NUM_PINS { return None; } @@ -163,7 +163,7 @@ impl IoApic { } // Edge-triggered: always deliver (if not masked). - Some(entry.vector) + Some((entry.vector, entry.dest)) } /// Handle End-of-Interrupt for a given vector. @@ -327,26 +327,26 @@ mod tests { fn test_ioapic_edge_triggered_delivery() { let mut ioapic = IoApic::new(); - // Configure pin 5: edge-triggered, vector 0x25, unmasked. + // Configure pin 5: edge-triggered, vector 0x25, dest 0, unmasked. ioapic.entries[5].vector = 0x25; ioapic.entries[5].mask = false; ioapic.entries[5].trigger_mode = false; // Edge - let vector = ioapic.service_irq(5, true); - assert_eq!(vector, Some(0x25)); + let result = ioapic.service_irq(5, true); + assert_eq!(result, Some((0x25, 0))); } #[test] fn test_ioapic_level_triggered_delivery() { let mut ioapic = IoApic::new(); - // Configure pin 3: level-triggered, vector 0x33, unmasked. + // Configure pin 3: level-triggered, vector 0x33, dest 0, unmasked. ioapic.entries[3].vector = 0x33; ioapic.entries[3].mask = false; ioapic.entries[3].trigger_mode = true; // Level - let vector = ioapic.service_irq(3, true); - assert_eq!(vector, Some(0x33)); + let result = ioapic.service_irq(3, true); + assert_eq!(result, Some((0x33, 0))); assert!(ioapic.entries[3].remote_irr, "remote_irr should be set"); } @@ -360,7 +360,7 @@ mod tests { ioapic.entries[3].trigger_mode = true; // First delivery sets remote_irr. - assert_eq!(ioapic.service_irq(3, true), Some(0x33)); + assert_eq!(ioapic.service_irq(3, true), Some((0x33, 0))); // Second delivery blocked by remote_irr. assert_eq!(ioapic.service_irq(3, true), None); @@ -473,8 +473,8 @@ mod tests { ioapic.entries[2].vector = 0x22; ioapic.entries[2].mask = false; - assert_eq!(ioapic.service_irq(1, true), Some(0x21)); - assert_eq!(ioapic.service_irq(2, true), Some(0x22)); + assert_eq!(ioapic.service_irq(1, true), Some((0x21, 0))); + assert_eq!(ioapic.service_irq(2, true), Some((0x22, 0))); } #[test] @@ -485,6 +485,19 @@ mod tests { assert_eq!(ioapic.read_register(0xFF), 0); } + #[test] + fn test_ioapic_service_irq_returns_destination() { + let mut ioapic = IoApic::new(); + + // Configure pin 4: vector 0x24, dest APIC ID = 1, unmasked. + ioapic.entries[4].vector = 0x24; + ioapic.entries[4].dest = 1; + ioapic.entries[4].mask = false; + + let result = ioapic.service_irq(4, true); + assert_eq!(result, Some((0x24, 1))); + } + #[test] fn test_ioapic_pin_beyond_24_in_redir() { let ioapic = IoApic::new(); diff --git a/src/vmm/src/windows/devices/irq_chip.rs b/src/vmm/src/windows/devices/irq_chip.rs index 79c8fff5e..a1d7726e9 100644 --- a/src/vmm/src/windows/devices/irq_chip.rs +++ b/src/vmm/src/windows/devices/irq_chip.rs @@ -1,47 +1,75 @@ -//! IrqChip — coordinator wiring PIC + IOAPIC + LAPIC together. +//! IrqChip — coordinator wiring PIC + IOAPIC + LAPIC(s) together. //! //! Manages the interrupt routing between legacy PIC (for early boot before //! APIC is enabled) and the IOAPIC + LAPIC path (after guest enables APIC). //! +//! Supports multiple LAPICs for multi-vCPU configurations. Each vCPU has its +//! own LAPIC, indexed by vCPU ID. Device interrupts from the IOAPIC are routed +//! to the target LAPIC based on the redirection entry destination field. +//! //! The APIC mode is auto-detected: when the guest writes to the LAPIC SVR //! register with the enable bit set, the IrqChip switches to APIC mode. use std::time::Instant; use super::ioapic::IoApic; -use super::lapic::LocalApic; +use super::lapic::{IpiAction, LocalApic}; use super::pic::Pic; use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; -/// Coordinated interrupt controller combining PIC, IOAPIC, and LAPIC. +/// Result of an IrqChip MMIO write operation. +#[derive(Debug)] +pub struct IrqChipWriteResult { + /// Whether the address was handled by the IrqChip. + pub handled: bool, + /// IPI action to dispatch (from LAPIC ICR write). + pub ipi_action: IpiAction, +} + +impl Default for IrqChipWriteResult { + fn default() -> Self { + Self { + handled: false, + ipi_action: IpiAction::None, + } + } +} + +/// Coordinated interrupt controller combining PIC, IOAPIC, and per-vCPU LAPICs. pub struct IrqChip { /// Legacy PIC (for early boot before APIC is enabled). pub pic: Pic, - /// I/O APIC for routing device interrupts to the LAPIC. + /// I/O APIC for routing device interrupts to the LAPICs. ioapic: IoApic, - /// Local APIC for priority management and timer. - lapic: LocalApic, + /// Per-vCPU Local APICs (indexed by vCPU ID). + lapics: Vec, /// false = PIC mode (early boot), true = APIC mode. apic_mode: bool, } impl Default for IrqChip { fn default() -> Self { - Self::new() + Self::new(1) } } impl IrqChip { - /// Create a new IrqChip in PIC mode (legacy boot). - pub fn new() -> Self { + /// Create a new IrqChip in PIC mode (legacy boot) with N LAPICs. + pub fn new(num_vcpus: u8) -> Self { + let lapics = (0..num_vcpus).map(|id| LocalApic::new_with_id(id)).collect(); Self { pic: Pic::new(), ioapic: IoApic::new(), - lapic: LocalApic::new(), + lapics, apic_mode: false, } } + /// Number of vCPUs (LAPICs). + pub fn num_vcpus(&self) -> u8 { + self.lapics.len() as u8 + } + /// Whether the chip is in APIC mode (vs legacy PIC mode). pub fn apic_mode(&self) -> bool { self.apic_mode @@ -58,83 +86,93 @@ impl IrqChip { // Remap ISA IRQ to IOAPIC pin (GSI). // Standard x86: PIT timer (IRQ 0) routes to IOAPIC pin 2. let gsi = if irq == 0 { 2 } else { irq }; - if let Some(vector) = self.ioapic.service_irq(gsi, true) { - self.lapic.accept_interrupt(vector); + if let Some((vector, dest)) = self.ioapic.service_irq(gsi, true) { + let target = (dest as usize).min(self.lapics.len() - 1); + self.lapics[target].accept_interrupt(vector); } } else { self.pic.raise_irq(irq); } } - /// Get the highest-priority injectable vector, if any. + /// Get the highest-priority injectable vector for a specific vCPU. /// - /// Checks LAPIC (APIC mode) or PIC (legacy mode). - pub fn get_injectable_vector(&self) -> Option { + /// Checks LAPIC (APIC mode) or PIC (legacy mode, only for BSP / vCPU 0). + pub fn get_injectable_vector(&self, vcpu_id: u8) -> Option { if self.apic_mode { - self.lapic.get_highest_injectable() - } else { + self.lapics[vcpu_id as usize].get_highest_injectable() + } else if vcpu_id == 0 { if self.pic.has_pending() { // PIC has pending, but we need to peek — can't acknowledge yet. // Return a sentinel to indicate "has pending". - Some(0) // Caller should use acknowledge_interrupt() to get actual vector. + Some(0) // Caller should use acknowledge() to get actual vector. } else { None } + } else { + None // APs don't get PIC interrupts. } } - /// Check if there are any pending interrupts (without acknowledging). - pub fn has_pending(&self) -> bool { + /// Check if there are any pending interrupts for a specific vCPU. + pub fn has_pending(&self, vcpu_id: u8) -> bool { if self.apic_mode { - self.lapic.get_highest_injectable().is_some() - } else { + self.lapics[vcpu_id as usize].get_highest_injectable().is_some() + } else if vcpu_id == 0 { self.pic.has_pending() + } else { + false } } - /// Acknowledge the highest-priority interrupt. + /// Acknowledge the highest-priority interrupt for a specific vCPU. /// - /// In PIC mode: acknowledges from PIC and returns the vector. - /// In APIC mode: returns the highest injectable from LAPIC. - pub fn acknowledge(&mut self) -> Option { + /// In PIC mode (vCPU 0 only): acknowledges from PIC and returns the vector. + /// In APIC mode: returns the highest injectable from the vCPU's LAPIC. + pub fn acknowledge(&mut self, vcpu_id: u8) -> Option { if self.apic_mode { - self.lapic.get_highest_injectable() - } else { + self.lapics[vcpu_id as usize].get_highest_injectable() + } else if vcpu_id == 0 { self.pic.acknowledge() + } else { + None } } /// Called after the vector has been injected into the vCPU. /// - /// In APIC mode: moves the vector from IRR to ISR in the LAPIC. + /// In APIC mode: moves the vector from IRR to ISR in the vCPU's LAPIC. /// In PIC mode: no-op (PIC acknowledge already moved to ISR). - pub fn notify_injected(&mut self, vector: u8) { + pub fn notify_injected(&mut self, vcpu_id: u8, vector: u8) { if self.apic_mode { - self.lapic.start_of_interrupt(vector); + self.lapics[vcpu_id as usize].start_of_interrupt(vector); } } - /// Handle an EOI from the guest. + /// Handle an EOI from a specific vCPU's LAPIC. /// - /// In PIC mode: handled via I/O port writes (OCW2 commands). - /// In APIC mode: propagates EOI from LAPIC to IOAPIC for level-triggered - /// interrupt completion. - fn handle_lapic_eoi(&mut self, vector: u8) { + /// Propagates EOI from LAPIC to IOAPIC for level-triggered interrupt + /// completion. May trigger re-injection if the pin is still asserted. + fn handle_lapic_eoi(&mut self, vcpu_id: u8, vector: u8) { if let Some(pin) = self.ioapic.end_of_interrupt(vector) { // Pin still asserted — re-deliver using the correct IOAPIC pin. - if let Some(new_vector) = self.ioapic.service_irq(pin, true) { - self.lapic.accept_interrupt(new_vector); + if let Some((new_vector, dest)) = self.ioapic.service_irq(pin, true) { + let target = (dest as usize).min(self.lapics.len() - 1); + self.lapics[target].accept_interrupt(new_vector); } } + // Suppress unused variable warning — vcpu_id is used for routing context. + let _ = vcpu_id; } - /// Tick the LAPIC timer. Returns the timer vector if it fired. - pub fn tick_timer(&mut self, now: Instant) -> Option { + /// Tick the LAPIC timer for a specific vCPU. Returns the timer vector if it fired. + pub fn tick_timer(&mut self, vcpu_id: u8, now: Instant) -> Option { if !self.apic_mode { return None; } - if let Some(vector) = self.lapic.tick_timer(now) { - self.lapic.accept_interrupt(vector); + let lapic = &mut self.lapics[vcpu_id as usize]; + if let Some(vector) = lapic.tick_timer(now) { + lapic.accept_interrupt(vector); Some(vector) } else { None @@ -144,13 +182,14 @@ impl IrqChip { /// Handle an MMIO read to an IOAPIC or LAPIC address. /// /// Returns Some(value) if the address was handled, None otherwise. - pub fn handle_mmio_read(&self, addr: u64, _size: u8) -> Option { + /// LAPIC reads are dispatched to the requesting vCPU's LAPIC. + pub fn handle_mmio_read(&self, vcpu_id: u8, addr: u64, _size: u8) -> Option { if addr >= IOAPIC_MMIO_BASE && addr < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { let offset = addr - IOAPIC_MMIO_BASE; Some(self.ioapic.read_mmio(offset)) } else if addr >= LAPIC_MMIO_BASE && addr < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { let offset = addr - LAPIC_MMIO_BASE; - Some(self.lapic.read_mmio(offset)) + Some(self.lapics[vcpu_id as usize].read_mmio(offset)) } else { None } @@ -158,36 +197,54 @@ impl IrqChip { /// Handle an MMIO write to an IOAPIC or LAPIC address. /// - /// Returns true if the address was handled. - pub fn handle_mmio_write(&mut self, addr: u64, _size: u8, data: u32) -> bool { + /// Returns an `IrqChipWriteResult` indicating whether the address was handled + /// and any IPI action from an ICR write. + pub fn handle_mmio_write(&mut self, vcpu_id: u8, addr: u64, _size: u8, data: u32) -> IrqChipWriteResult { if addr >= IOAPIC_MMIO_BASE && addr < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { let offset = addr - IOAPIC_MMIO_BASE; self.ioapic.write_mmio(offset, data); // An IOAPIC entry may have been unmasked — check transition. self.check_apic_transition(); - true + IrqChipWriteResult { + handled: true, + ipi_action: IpiAction::None, + } } else if addr >= LAPIC_MMIO_BASE && addr < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { let offset = addr - LAPIC_MMIO_BASE; - let eoi_vector = self.lapic.write_mmio(offset, data); + let result = self.lapics[vcpu_id as usize].write_mmio(offset, data); // LAPIC SVR may have been enabled — check transition. self.check_apic_transition(); // Handle EOI propagation to IOAPIC. - if let Some(vector) = eoi_vector { - self.handle_lapic_eoi(vector); + if let Some(vector) = result.eoi_vector { + self.handle_lapic_eoi(vcpu_id, vector); } - true + IrqChipWriteResult { + handled: true, + ipi_action: result.ipi_action, + } } else { - false + IrqChipWriteResult::default() + } + } + + /// Deliver an IPI to the target LAPIC. + /// + /// Called by the runner when a vCPU's ICR write produces an IPI action + /// that targets another LAPIC (SendInterrupt variant only — INIT and SIPI + /// are handled by the runner's AP startup logic). + pub fn deliver_ipi_interrupt(&mut self, target_apic_id: u8, vector: u8) { + if (target_apic_id as usize) < self.lapics.len() { + self.lapics[target_apic_id as usize].accept_interrupt(vector); } } /// Check if conditions are met to switch from PIC to APIC mode. /// /// The transition requires BOTH: - /// 1. LAPIC is software-enabled (SVR bit 8 set by guest) + /// 1. Any LAPIC is software-enabled (SVR bit 8 set by guest) /// 2. IOAPIC has at least one unmasked redirection entry /// /// This prevents a gap where the kernel has enabled the LAPIC but hasn't @@ -197,7 +254,8 @@ impl IrqChip { if self.apic_mode { return; } - if self.lapic.is_enabled() && self.ioapic.has_unmasked_entries() { + let any_lapic_enabled = self.lapics.iter().any(|l| l.is_enabled()); + if any_lapic_enabled && self.ioapic.has_unmasked_entries() { log::info!( "APIC mode enabled — LAPIC active + IOAPIC has unmasked entries" ); @@ -222,13 +280,23 @@ mod tests { #[test] fn test_irq_chip_starts_in_pic_mode() { - let chip = IrqChip::new(); + let chip = IrqChip::new(1); assert!(!chip.apic_mode()); } + #[test] + fn test_irq_chip_multi_vcpu_creates_lapics() { + let chip = IrqChip::new(4); + assert_eq!(chip.num_vcpus(), 4); + assert_eq!(chip.lapics[0].id(), 0); + assert_eq!(chip.lapics[1].id(), 1); + assert_eq!(chip.lapics[2].id(), 2); + assert_eq!(chip.lapics[3].id(), 3); + } + #[test] fn test_irq_chip_pic_mode_raise_irq() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); // Program PIC for testing. chip.pic.write_port(0x20, 0x11); // ICW1 @@ -238,28 +306,47 @@ mod tests { chip.pic.write_port(0x21, 0x00); // IMR: unmask all chip.raise_irq(0); - assert!(chip.has_pending()); + assert!(chip.has_pending(0)); - let vector = chip.acknowledge(); + let vector = chip.acknowledge(0); assert_eq!(vector, Some(0x20)); } + #[test] + fn test_irq_chip_pic_mode_only_bsp() { + let mut chip = IrqChip::new(2); + + // Program PIC. + chip.pic.write_port(0x20, 0x11); + chip.pic.write_port(0x21, 0x20); + chip.pic.write_port(0x21, 0x04); + chip.pic.write_port(0x21, 0x01); + chip.pic.write_port(0x21, 0x00); + + chip.raise_irq(0); + + // BSP (vCPU 0) sees the interrupt. + assert!(chip.has_pending(0)); + // AP (vCPU 1) does NOT see PIC interrupts. + assert!(!chip.has_pending(1)); + } + #[test] fn test_irq_chip_apic_mode_switch_requires_ioapic_entries() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); assert!(!chip.apic_mode()); // Write to LAPIC SVR with enable bit — NOT enough alone. let svr_addr = LAPIC_MMIO_BASE + 0x0F0; - chip.handle_mmio_write(svr_addr, 4, 0x1FF); + chip.handle_mmio_write(0, svr_addr, 4, 0x1FF); assert!( !chip.apic_mode(), "APIC mode must NOT activate on SVR alone" ); // Unmask an IOAPIC entry (pin 2, vector 0x22) — NOW transition triggers. - chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x14); // Select reg 0x14 (pin 2 low) - chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x22); // vector=0x22, unmasked + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE, 4, 0x14); // Select reg 0x14 (pin 2 low) + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE + 0x10, 4, 0x22); // vector=0x22, unmasked assert!( chip.apic_mode(), @@ -269,128 +356,165 @@ mod tests { #[test] fn test_irq_chip_apic_mode_raise_irq() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); // Enable LAPIC SVR. - chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); + chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); // Configure IOAPIC pin 5: vector 0x25, unmasked, edge-triggered. - // This triggers the APIC mode transition (LAPIC enabled + unmasked entry). - chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x1A); // Select register 0x1A (pin 5 low) - chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x25); // vector=0x25, unmasked + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE, 4, 0x1A); // Select register 0x1A (pin 5 low) + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE + 0x10, 4, 0x25); // vector=0x25, unmasked assert!(chip.apic_mode()); chip.raise_irq(5); - assert!(chip.has_pending()); + assert!(chip.has_pending(0)); - let vector = chip.acknowledge(); + let vector = chip.acknowledge(0); assert_eq!(vector, Some(0x25)); } #[test] fn test_irq_chip_apic_mode_irq0_remaps_to_gsi2() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); // Enable LAPIC SVR. - chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); + chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); // Configure IOAPIC pin 2: vector 0x22, unmasked, edge-triggered. - // This is the standard x86 PIT timer mapping (IRQ 0 → GSI 2 per MADT ISO). - chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x14); // Select register 0x14 (pin 2 low) - chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x22); // vector=0x22, unmasked + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE, 4, 0x14); // Select register 0x14 (pin 2 low) + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE + 0x10, 4, 0x22); // vector=0x22, unmasked assert!(chip.apic_mode()); // raise_irq(0) should remap to IOAPIC pin 2 and deliver vector 0x22. chip.raise_irq(0); - assert!(chip.has_pending()); + assert!(chip.has_pending(0)); - let vector = chip.acknowledge(); + let vector = chip.acknowledge(0); assert_eq!(vector, Some(0x22)); } #[test] fn test_irq_chip_mmio_read_ioapic() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); // Read IOAPIC version register. - // First set IOREGSEL via write, then read IOWIN. - // Write to IOREGSEL at offset 0x00 sets the register index (0x01 = version). - // This doesn't unmask any entries, so APIC mode stays off. chip.ioapic.write_mmio(0x00, 0x01); // Direct access to avoid transition check - let version = chip.handle_mmio_read(IOAPIC_MMIO_BASE + 0x10, 4); + let version = chip.handle_mmio_read(0, IOAPIC_MMIO_BASE + 0x10, 4); assert_eq!(version, Some(0x0017_0011)); } #[test] fn test_irq_chip_mmio_read_lapic() { - let chip = IrqChip::new(); - let version = chip.handle_mmio_read(LAPIC_MMIO_BASE + 0x030, 4); + let chip = IrqChip::new(1); + let version = chip.handle_mmio_read(0, LAPIC_MMIO_BASE + 0x030, 4); assert!(version.is_some()); assert_eq!(version.unwrap() & 0xFF, 0x14); } + #[test] + fn test_irq_chip_mmio_read_lapic_id_per_vcpu() { + let chip = IrqChip::new(2); + // vCPU 0 reads its own LAPIC ID. + assert_eq!(chip.handle_mmio_read(0, LAPIC_MMIO_BASE + 0x020, 4), Some(0 << 24)); + // vCPU 1 reads its own LAPIC ID. + assert_eq!(chip.handle_mmio_read(1, LAPIC_MMIO_BASE + 0x020, 4), Some(1 << 24)); + } + #[test] fn test_irq_chip_mmio_read_unhandled() { - let chip = IrqChip::new(); - // Address outside IOAPIC/LAPIC range. - assert_eq!(chip.handle_mmio_read(0xDEAD_0000, 4), None); + let chip = IrqChip::new(1); + assert_eq!(chip.handle_mmio_read(0, 0xDEAD_0000, 4), None); } #[test] fn test_irq_chip_mmio_write_unhandled() { - let mut chip = IrqChip::new(); - assert!(!chip.handle_mmio_write(0xDEAD_0000, 4, 0)); + let mut chip = IrqChip::new(1); + let result = chip.handle_mmio_write(0, 0xDEAD_0000, 4, 0); + assert!(!result.handled); } #[test] fn test_irq_chip_eoi_propagation() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); // Enable LAPIC SVR. - chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); + chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); // Configure IOAPIC pin 3: vector 0x33, level-triggered, unmasked. - // This also triggers APIC mode transition. - chip.handle_mmio_write(IOAPIC_MMIO_BASE, 4, 0x16); // register 0x16 = pin 3 low - chip.handle_mmio_write(IOAPIC_MMIO_BASE + 0x10, 4, 0x33 | (1 << 15)); // vector=0x33, level-triggered + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE, 4, 0x16); // register 0x16 = pin 3 low + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE + 0x10, 4, 0x33 | (1 << 15)); // vector=0x33, level-triggered assert!(chip.apic_mode()); // Raise IRQ 3. chip.raise_irq(3); - let vector = chip.acknowledge(); + let vector = chip.acknowledge(0); assert_eq!(vector, Some(0x33)); // Inject and acknowledge in LAPIC. - chip.notify_injected(0x33); + chip.notify_injected(0, 0x33); // Write EOI to LAPIC (offset 0x0B0). - chip.handle_mmio_write(LAPIC_MMIO_BASE + 0x0B0, 4, 0); + chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x0B0, 4, 0); // After EOI, the pin is still asserted → re-injection. - assert!(chip.has_pending()); + assert!(chip.has_pending(0)); } #[test] fn test_irq_chip_timer_only_in_apic_mode() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); let now = Instant::now(); // In PIC mode, timer should not fire. - assert_eq!(chip.tick_timer(now), None); + assert_eq!(chip.tick_timer(0, now), None); } #[test] fn test_irq_chip_notify_injected_pic_mode() { - let mut chip = IrqChip::new(); + let mut chip = IrqChip::new(1); // In PIC mode, notify_injected is a no-op. - chip.notify_injected(0x20); + chip.notify_injected(0, 0x20); } #[test] fn test_irq_chip_diagnostics() { - let chip = IrqChip::new(); + let chip = IrqChip::new(1); let (irr, isr, imr, vbase) = chip.pic_master_state(); assert_eq!(irr, 0); assert_eq!(isr, 0); assert_eq!(imr, 0xFF); assert_eq!(vbase, 0); } + + #[test] + fn test_irq_chip_deliver_ipi_interrupt() { + let mut chip = IrqChip::new(2); + + // Enable APIC mode: enable BSP's LAPIC SVR + unmask IOAPIC entry. + chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x0F0, 4, 0x1FF); + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE, 4, 0x14); + chip.handle_mmio_write(0, IOAPIC_MMIO_BASE + 0x10, 4, 0x22); + assert!(chip.apic_mode()); + + // Deliver IPI to vCPU 1. + chip.deliver_ipi_interrupt(1, 0x40); + assert!(chip.has_pending(1)); + assert_eq!(chip.acknowledge(1), Some(0x40)); + } + + #[test] + fn test_irq_chip_icr_write_returns_ipi_action() { + let mut chip = IrqChip::new(2); + + // Write ICR high on vCPU 0: destination = APIC 1. + chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x310, 4, 1 << 24); + // Write ICR low on vCPU 0: INIT delivery mode. + let result = chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x300, 4, 0x0500); + assert!(result.handled); + assert_eq!(result.ipi_action, IpiAction::SendInit { target_apic_id: 1 }); + } + + #[test] + fn test_irq_chip_default_is_single_vcpu() { + let chip = IrqChip::default(); + assert_eq!(chip.num_vcpus(), 1); + } } diff --git a/src/vmm/src/windows/devices/lapic.rs b/src/vmm/src/windows/devices/lapic.rs index b7948337c..8d09a0c76 100644 --- a/src/vmm/src/windows/devices/lapic.rs +++ b/src/vmm/src/windows/devices/lapic.rs @@ -1,6 +1,6 @@ //! Local APIC (LAPIC) emulation. //! -//! Minimal single-vCPU LAPIC for interrupt priority management. +//! Per-vCPU LAPIC for interrupt priority management and IPI delivery. //! Tracks IRR (Interrupt Request Register) and ISR (In-Service Register) //! as 256-bit vectors, and implements priority-based interrupt delivery. //! @@ -12,18 +12,46 @@ //! - 0x0F0: SVR (Spurious Vector Register) //! - 0x100-0x170: ISR (read-only, 256 bits) //! - 0x200-0x270: IRR (read-only, 256 bits) +//! - 0x300: ICR Low (Interrupt Command Register) +//! - 0x310: ICR High (destination APIC ID) //! - 0x320: LVT Timer //! - 0x380: Timer Initial Count //! - 0x390: Timer Current Count //! - 0x3E0: Timer Divide Configuration -//! -//! Simplifications for single-vCPU: -//! - No IPI support (no ICR register) -//! - No arbitration -//! - Destination always matches APIC ID 0 use std::time::Instant; +/// Action resulting from an ICR write (Inter-Processor Interrupt). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IpiAction { + /// No IPI action (non-ICR write or unrecognized delivery mode). + None, + /// Fixed delivery: send interrupt vector to target LAPIC. + SendInterrupt { target_apic_id: u8, vector: u8 }, + /// INIT delivery: reset target processor. + SendInit { target_apic_id: u8 }, + /// Startup IPI (SIPI): start target processor at vector * 0x1000. + SendSipi { target_apic_id: u8, vector: u8 }, +} + +/// Result of a LAPIC MMIO write operation. +#[derive(Debug, Clone, Copy)] +pub struct LapicWriteResult { + /// If an EOI was written, the vector that was cleared from ISR. + pub eoi_vector: Option, + /// If an ICR was written, the resulting IPI action. + pub ipi_action: IpiAction, +} + +impl Default for LapicWriteResult { + fn default() -> Self { + Self { + eoi_vector: None, + ipi_action: IpiAction::None, + } + } +} + /// LAPIC version: integrated APIC with 6 LVT entries. const LAPIC_VERSION: u32 = 0x0005_0014; // version 0x14, max LVT=5 @@ -40,9 +68,9 @@ enum TimerMode { Periodic, } -/// Minimal single-vCPU Local APIC. +/// Per-vCPU Local APIC. pub struct LocalApic { - /// APIC ID (always 0 for single vCPU). + /// APIC ID (matches vCPU index). id: u8, /// 256-bit Interrupt Request Register (8 x 32-bit words). irr: [u32; 8], @@ -53,6 +81,12 @@ pub struct LocalApic { /// Spurious Vector Register (bit 8 = APIC enabled). svr: u32, + // ICR (Interrupt Command Register) for IPI support. + /// ICR low 32 bits (vector, delivery mode, destination shorthand). + icr_low: u32, + /// ICR high 32 bits (destination APIC ID in bits 31:24). + icr_high: u32, + // Timer state. /// Timer mode. timer_mode: TimerMode, @@ -79,15 +113,23 @@ impl Default for LocalApic { } impl LocalApic { - /// Create a new LAPIC with default state (disabled). + /// Create a new LAPIC with default state (disabled), APIC ID = 0. pub fn new() -> Self { + Self::new_with_id(0) + } + + /// Create a new LAPIC with a specific APIC ID (disabled by default). + pub fn new_with_id(id: u8) -> Self { Self { - id: 0, + id, irr: [0; 8], isr: [0; 8], tpr: 0, svr: 0, // APIC disabled by default + icr_low: 0, + icr_high: 0, + timer_mode: TimerMode::OneShot, timer_vector: 0, timer_masked: true, @@ -99,6 +141,11 @@ impl LocalApic { } } + /// Get the APIC ID. + pub fn id(&self) -> u8 { + self.id + } + /// Whether the LAPIC is software-enabled (SVR bit 8). pub fn is_enabled(&self) -> bool { self.svr & SVR_APIC_ENABLE != 0 @@ -204,6 +251,8 @@ impl LocalApic { 0 } } + 0x300 => self.icr_low, // ICR Low + 0x310 => self.icr_high, // ICR High 0x320 => self.read_lvt_timer(), // LVT Timer 0x380 => self.timer_initial, // Timer Initial Count 0x390 => 0, // Timer Current Count (approximation) @@ -212,31 +261,118 @@ impl LocalApic { } } - /// Write to the LAPIC MMIO region. + /// Result of a LAPIC MMIO write. /// - /// Returns `Some(vector)` if an EOI was written (caller should broadcast - /// to IOAPIC). - pub fn write_mmio(&mut self, offset: u64, value: u32) -> Option { + /// Contains an optional EOI vector and an IPI action from ICR writes. + pub fn write_mmio(&mut self, offset: u64, value: u32) -> LapicWriteResult { match offset { - 0x080 => self.tpr = (value & 0xFF) as u8, + 0x080 => { + self.tpr = (value & 0xFF) as u8; + LapicWriteResult::default() + } 0x0B0 => { // EOI: clear highest ISR, return vector for IOAPIC. - return self.end_of_interrupt(); + LapicWriteResult { + eoi_vector: self.end_of_interrupt(), + ipi_action: IpiAction::None, + } } 0x0F0 => { self.svr = value; log::debug!( - "LAPIC SVR write: {:#X} (enabled={})", + "LAPIC {} SVR write: {:#X} (enabled={})", + self.id, value, value & SVR_APIC_ENABLE != 0 ); + LapicWriteResult::default() + } + 0x300 => { + // ICR Low write triggers IPI delivery. + self.icr_low = value; + let action = self.parse_icr(); + LapicWriteResult { + eoi_vector: None, + ipi_action: action, + } + } + 0x310 => { + // ICR High: destination APIC ID (bits 31:24). + self.icr_high = value; + LapicWriteResult::default() + } + 0x320 => { + self.write_lvt_timer(value); + LapicWriteResult::default() + } + 0x380 => { + self.write_initial_count(value); + LapicWriteResult::default() + } + 0x3E0 => { + self.write_divide_config(value); + LapicWriteResult::default() + } + _ => LapicWriteResult::default(), + } + } + + /// Parse the ICR low/high registers to produce an IPI action. + /// + /// ICR Low bits: + /// - [7:0] Vector + /// - [10:8] Delivery mode: 000=Fixed, 101=INIT, 110=Startup (SIPI) + /// - [11] Destination mode (ignored, always physical) + /// - [17:16] Destination shorthand (00=field, others unsupported) + /// + /// ICR High bits: + /// - [31:24] Destination APIC ID + fn parse_icr(&self) -> IpiAction { + let vector = (self.icr_low & 0xFF) as u8; + let delivery_mode = (self.icr_low >> 8) & 0x7; + let dest_apic_id = ((self.icr_high >> 24) & 0xFF) as u8; + + match delivery_mode { + 0b000 => { + // Fixed delivery. + log::debug!( + "LAPIC {} ICR: Fixed interrupt vector={:#X} → APIC {}", + self.id, vector, dest_apic_id + ); + IpiAction::SendInterrupt { + target_apic_id: dest_apic_id, + vector, + } + } + 0b101 => { + // INIT delivery. + log::debug!( + "LAPIC {} ICR: INIT → APIC {}", + self.id, dest_apic_id + ); + IpiAction::SendInit { + target_apic_id: dest_apic_id, + } + } + 0b110 => { + // Startup IPI (SIPI). + log::debug!( + "LAPIC {} ICR: SIPI vector={:#X} → APIC {} (start at {:#X})", + self.id, vector, dest_apic_id, (vector as u32) * 0x1000 + ); + IpiAction::SendSipi { + target_apic_id: dest_apic_id, + vector, + } + } + _ => { + log::debug!( + "LAPIC {} ICR: unsupported delivery mode {} → APIC {}", + self.id, delivery_mode, dest_apic_id + ); + IpiAction::None } - 0x320 => self.write_lvt_timer(value), - 0x380 => self.write_initial_count(value), - 0x3E0 => self.write_divide_config(value), - _ => {} } - None } /// Compute Processor Priority Register (PPR). @@ -477,8 +613,8 @@ mod tests { lapic.accept_interrupt(0x30); lapic.start_of_interrupt(0x30); - let vector = lapic.write_mmio(0x0B0, 0); - assert_eq!(vector, Some(0x30)); + let result = lapic.write_mmio(0x0B0, 0); + assert_eq!(result.eoi_vector, Some(0x30)); } #[test] @@ -612,7 +748,9 @@ mod tests { fn test_lapic_mmio_invalid_offset() { let mut lapic = LocalApic::new(); assert_eq!(lapic.read_mmio(0x400), 0); - assert_eq!(lapic.write_mmio(0x400, 0xDEAD), None); + let result = lapic.write_mmio(0x400, 0xDEAD); + assert_eq!(result.eoi_vector, None); + assert_eq!(result.ipi_action, IpiAction::None); } #[test] @@ -621,4 +759,95 @@ mod tests { // Non-16-byte-aligned ISR offset should return 0. assert_eq!(lapic.read_mmio(0x104), 0); } + + // ---- ICR / IPI tests ---- + + #[test] + fn test_lapic_new_with_id() { + let lapic = LocalApic::new_with_id(3); + assert_eq!(lapic.id(), 3); + assert_eq!(lapic.read_mmio(0x020), 3 << 24); + assert!(!lapic.is_enabled()); + } + + #[test] + fn test_lapic_icr_read_write_roundtrip() { + let mut lapic = LocalApic::new(); + + // Write ICR high (destination APIC ID = 1). + lapic.write_mmio(0x310, 1 << 24); + assert_eq!(lapic.read_mmio(0x310), 1 << 24); + + // Write ICR low (vector=0x40, Fixed delivery). + let result = lapic.write_mmio(0x300, 0x40); + assert_eq!(lapic.read_mmio(0x300), 0x40); + + match result.ipi_action { + IpiAction::SendInterrupt { + target_apic_id, + vector, + } => { + assert_eq!(target_apic_id, 1); + assert_eq!(vector, 0x40); + } + other => panic!("expected SendInterrupt, got {:?}", other), + } + } + + #[test] + fn test_lapic_icr_init_delivery() { + let mut lapic = LocalApic::new(); + + // Set destination = APIC 2. + lapic.write_mmio(0x310, 2 << 24); + // ICR low: delivery mode = 0b101 (INIT), vector ignored. + let result = lapic.write_mmio(0x300, 0x0500); + + match result.ipi_action { + IpiAction::SendInit { target_apic_id } => { + assert_eq!(target_apic_id, 2); + } + other => panic!("expected SendInit, got {:?}", other), + } + } + + #[test] + fn test_lapic_icr_sipi_delivery() { + let mut lapic = LocalApic::new(); + + // Set destination = APIC 1. + lapic.write_mmio(0x310, 1 << 24); + // ICR low: delivery mode = 0b110 (SIPI), vector = 0x10. + // Start address = 0x10 * 0x1000 = 0x10000. + let result = lapic.write_mmio(0x300, 0x0600 | 0x10); + + match result.ipi_action { + IpiAction::SendSipi { + target_apic_id, + vector, + } => { + assert_eq!(target_apic_id, 1); + assert_eq!(vector, 0x10); + } + other => panic!("expected SendSipi, got {:?}", other), + } + } + + #[test] + fn test_lapic_icr_unsupported_delivery_mode() { + let mut lapic = LocalApic::new(); + lapic.write_mmio(0x310, 1 << 24); + // Delivery mode = 0b010 (SMI) — not supported. + let result = lapic.write_mmio(0x300, 0x0200); + assert_eq!(result.ipi_action, IpiAction::None); + } + + #[test] + fn test_lapic_non_icr_write_returns_no_ipi() { + let mut lapic = LocalApic::new(); + // SVR write should produce no IPI. + let result = lapic.write_mmio(0x0F0, 0x1FF); + assert_eq!(result.ipi_action, IpiAction::None); + assert_eq!(result.eoi_vector, None); + } } diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 5ffab4845..1936f5c39 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -15,6 +15,7 @@ use super::super::context::VmContext; use super::super::error::{Result, WkrunError}; use super::super::vcpu::IoHandler; use super::irq_chip::IrqChip; +use super::lapic::IpiAction; use super::pit::Pit; use super::serial::{Serial, COM1_BASE}; use super::virtio::block::VirtioBlock; @@ -365,7 +366,7 @@ impl DeviceManager { let devices = DeviceManager { serial, - irq_chip: IrqChip::new(), + irq_chip: IrqChip::new(ctx.num_vcpus), pit: Pit::new(), cmos_addr: 0, virtio_blk, @@ -473,9 +474,10 @@ impl DeviceManager { /// Handle an MMIO read from the guest. /// /// Returns the data to inject into the destination register. - pub fn handle_mmio_read(&mut self, address: u64, size: u8) -> u64 { + /// `vcpu_id` selects which LAPIC to read from (each vCPU has its own). + pub fn handle_mmio_read(&mut self, vcpu_id: u8, address: u64, size: u8) -> u64 { // Check IOAPIC/LAPIC ranges first. - if let Some(val) = self.irq_chip.handle_mmio_read(address, size) { + if let Some(val) = self.irq_chip.handle_mmio_read(vcpu_id, address, size) { use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; if address >= IOAPIC_MMIO_BASE && address < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { self.ioapic_mmio_count += 1; @@ -524,23 +526,26 @@ impl DeviceManager { /// Handle an MMIO write from the guest. /// - /// Returns `true` if an interrupt should be raised. + /// `vcpu_id` selects which LAPIC to write to (each vCPU has its own). + /// Returns the IPI action if the write was to the LAPIC ICR register. pub fn handle_mmio_write( &mut self, + vcpu_id: u8, address: u64, size: u8, data: u64, mem: &dyn GuestMemoryAccessor, - ) { + ) -> IpiAction { // Check IOAPIC/LAPIC ranges first. - if self.irq_chip.handle_mmio_write(address, size as u8, data as u32) { + let result = self.irq_chip.handle_mmio_write(vcpu_id, address, size as u8, data as u32); + if result.handled { use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; if address >= IOAPIC_MMIO_BASE && address < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { self.ioapic_mmio_count += 1; } else if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { self.lapic_mmio_count += 1; } - return; + return result.ipi_action; } let blk_offset = address.wrapping_sub(mmio_base_for_slot(0)); @@ -587,6 +592,7 @@ impl DeviceManager { } } } + IpiAction::None } /// Start async block I/O workers for virtio-blk devices (Plan B: WHPX-safe). @@ -617,7 +623,8 @@ impl DeviceManager { /// Tick the PIT timer based on wall clock time and poll devices. /// /// Call this at the top of each vCPU run loop iteration. - pub fn tick_and_poll(&mut self, mem: &dyn GuestMemoryAccessor) { + /// `vcpu_id` selects which LAPIC timer to tick (BSP should be 0). + pub fn tick_and_poll(&mut self, vcpu_id: u8, mem: &dyn GuestMemoryAccessor) { // Tick PIT. let now = Instant::now(); let elapsed_ns = now.duration_since(self.last_tick).as_nanos() as u64; @@ -631,7 +638,7 @@ impl DeviceManager { } // Tick LAPIC timer (only fires in APIC mode). - self.irq_chip.tick_timer(now); + self.irq_chip.tick_timer(vcpu_id, now); // Drain async block I/O completions. if let Some(ref mut dev) = self.virtio_blk { @@ -726,7 +733,7 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { let vsock_backend = VirtioVsock::new(GUEST_CID); DeviceManager { serial, - irq_chip: IrqChip::new(), + irq_chip: IrqChip::new(1), pit: Pit::new(), cmos_addr: 0, virtio_blk: None, @@ -891,7 +898,7 @@ mod tests { fn test_mmio_read_no_blk_device() { let dm = make_test_devices(); // Read from virtio-blk slot when no device present. - let data = dm.handle_mmio_read(mmio_base_for_slot(0), 4); + let data = dm.handle_mmio_read(0, mmio_base_for_slot(0), 4); assert_eq!(data, 0); } @@ -899,7 +906,7 @@ mod tests { fn test_mmio_read_vsock_magic() { let dm = make_test_devices(); // Read virtio magic from vsock MMIO slot. - let magic = dm.handle_mmio_read(mmio_base_for_slot(1), 4); + let magic = dm.handle_mmio_read(0, mmio_base_for_slot(1), 4); assert_eq!(magic, 0x7472_6976); // "virt" in LE. } @@ -907,7 +914,7 @@ mod tests { fn test_mmio_read_vsock_device_id() { let dm = make_test_devices(); // Device ID is at offset 0x008. - let device_id = dm.handle_mmio_read(mmio_base_for_slot(1) + 0x008, 4); + let device_id = dm.handle_mmio_read(0, mmio_base_for_slot(1) + 0x008, 4); assert_eq!(device_id, 19); // vsock device ID. } @@ -915,7 +922,7 @@ mod tests { fn test_mmio_read_out_of_range() { let dm = make_test_devices(); // Read from an address that doesn't belong to any device. - let data = dm.handle_mmio_read(0xE000_0000, 4); + let data = dm.handle_mmio_read(0, 0xE000_0000, 4); assert_eq!(data, 0); } @@ -1020,7 +1027,7 @@ mod tests { fn test_mmio_read_no_net_device() { let dm = make_test_devices(); // Read from virtio-net slot when no device present. - let data = dm.handle_mmio_read(mmio_base_for_slot(3), 4); + let data = dm.handle_mmio_read(0, mmio_base_for_slot(3), 4); assert_eq!(data, 0); } diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index d449b96e1..723b18e5d 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -11,13 +11,14 @@ mod imp { use std::collections::HashMap; use std::io::Write as IoWrite; - use std::sync::atomic::Ordering; - use std::sync::{Arc, Mutex}; + use std::sync::atomic::{AtomicBool, Ordering}; + use std::sync::{Arc, Condvar, Mutex}; use std::time::{Duration, Instant}; use super::super::boot::loader::load_kernel_with_initrd; use super::super::cmdline::build_kernel_cmdline; use super::super::context::VmContext; + use super::super::devices::lapic::IpiAction; use super::super::devices::manager::{self as devices, DeviceManager}; use super::super::devices::virtio::queue::GuestMemoryAccessor; use super::super::error::{Result, WkrunError}; @@ -54,6 +55,32 @@ mod imp { /// NOT indicate the VM is stuck. const MAX_HALTS: u64 = 50_000; + /// Per-AP (Application Processor) startup state. + /// + /// Each AP thread waits on its condvar until the BSP delivers an + /// INIT-SIPI-SIPI sequence via the LAPIC ICR register. + struct ApStartupState { + /// Whether this AP has received SIPI and should start executing. + started: Mutex, + /// Condvar to wake the AP thread when SIPI arrives. + condvar: Condvar, + /// SIPI vector — the AP starts executing at `vector * 0x1000`. + sipi_vector: Mutex>, + /// Whether INIT has been received (prerequisite for SIPI). + init_received: AtomicBool, + } + + impl ApStartupState { + fn new() -> Self { + Self { + started: Mutex::new(false), + condvar: Condvar::new(), + sipi_vector: Mutex::new(None), + init_received: AtomicBool::new(false), + } + } + } + /// Handle for a running VM, stored in `RUNNING_VMS`. struct VmHandle { thread: Option>>, @@ -213,7 +240,7 @@ mod imp { let ctx_id = ctx.id; let setup = DeviceManager::from_context(&ctx)?; devices::store_console_buffer(ctx_id, setup.console_buffer); - let mut devices = setup.devices; + let devices = setup.devices; // NOTE: Block I/O workers are started lazily (deferred start) inside // the vCPU loop, on the first MMIO write. Starting them here (before @@ -235,8 +262,14 @@ mod imp { // Load kernel. let initrd_ref = initrd_data.as_deref(); - let (regs, sregs) = - load_kernel_with_initrd(&guest_mem, &kernel_image, &cmdline, ctx.ram_mib, initrd_ref)?; + let (regs, sregs) = load_kernel_with_initrd( + &guest_mem, + &kernel_image, + &cmdline, + ctx.ram_mib, + initrd_ref, + ctx.num_vcpus, + )?; log::info!( "Kernel loaded at 0x100000, RIP=0x{:X}, cmdline: {}", @@ -254,272 +287,507 @@ mod imp { ); diag!("Kernel loaded, RIP={:#X}, ram={}MB", regs.rip, ctx.ram_mib); - // Create vCPU and set registers. - let vcpu = WhpxVcpu::new(&partition, 0)?; - vcpu.set_registers(®s)?; - vcpu.set_special_registers(&sregs)?; - - // Store canceller so stop() can wake the vCPU. - *canceller_slot.lock().unwrap() = Some(vcpu.canceller()); - - // Spawn timer thread for PIT interrupt delivery. - // Uses run_config.running so that request_stop() stops both the timer - // and the vCPU loop. - let timer_flag = run_config.running.clone(); - let canceller = vcpu.canceller(); - let timer_thread = std::thread::spawn(move || { - while timer_flag.load(Ordering::Relaxed) { - std::thread::sleep(Duration::from_millis(1)); - let _ = canceller.cancel(); + // Create all vCPUs. BSP (index 0) gets the boot registers. + // APs (index 1..N-1) are created but start in "wait for SIPI" state. + let num_vcpus = ctx.num_vcpus; + let mut vcpus = Vec::with_capacity(num_vcpus as usize); + for i in 0..num_vcpus as u32 { + let vcpu = WhpxVcpu::new(&partition, i)?; + if i == 0 { + vcpu.set_registers(®s)?; + vcpu.set_special_registers(&sregs)?; + } + vcpus.push(vcpu); + } + + // Collect cancellers for all vCPUs. The timer thread and stop() + // need to be able to wake any vCPU. + let cancellers: Vec = vcpus.iter().map(|v| v.canceller()).collect(); + + // Store BSP canceller so stop() can wake the VM. + *canceller_slot.lock().unwrap() = Some(cancellers[0].clone()); + + // Create per-AP startup state (one per AP, indexed by ap_id - 1). + let ap_states: Vec = (1..num_vcpus).map(|_| ApStartupState::new()).collect(); + + // Shared VM shutdown flag — set by any vCPU to signal all others to exit. + let shutdown = Arc::new(AtomicBool::new(false)); + let devices = Arc::new(Mutex::new(devices)); + + // Move diag_log into shared state for BSP diagnostics. + let diag_log = Arc::new(Mutex::new(diag_log)); + + log::info!( + "Starting VM with {} vCPU(s), ctx_id={}", + num_vcpus, + ctx_id, + ); + eprintln!("[WHPX] Starting {} vCPU(s)", num_vcpus); + + let mut exit_code = 1i32; + + // Use thread::scope so all vCPU threads are guaranteed to terminate + // before we clean up resources. The BSP runs in the scoped block; + // APs are spawned as scoped threads. + { + let shutdown_ref = &shutdown; + let devices_ref = &devices; + let ap_states_ref = &ap_states; + let cancellers_ref = &cancellers; + let run_config_ref = &run_config; + let guest_mem_ref: &GuestMemory = &guest_mem; + let diag_ref = &diag_log; + + // Spawn timer thread — cancels ALL vCPUs every 1ms. + let timer_flag = run_config.running.clone(); + let timer_cancellers: Vec = cancellers.clone(); + let timer_shutdown = shutdown.clone(); + let timer_thread = std::thread::spawn(move || { + while timer_flag.load(Ordering::Relaxed) && !timer_shutdown.load(Ordering::Relaxed) + { + std::thread::sleep(Duration::from_millis(1)); + for c in &timer_cancellers { + let _ = c.cancel(); + } + } + }); + + std::thread::scope(|s| { + // Spawn AP threads (vCPU 1..N-1). + for ap_idx in 1..num_vcpus as usize { + let vcpu = &vcpus[ap_idx]; + s.spawn(move || { + run_ap_loop( + ap_idx as u8, + num_vcpus, + vcpu, + devices_ref, + guest_mem_ref, + shutdown_ref, + run_config_ref, + cancellers_ref, + &ap_states_ref[ap_idx - 1], + ctx_id, + ); + }); + } + + // BSP runs on the current thread. + let bsp_vcpu = &vcpus[0]; + let bsp_code = run_bsp_loop( + bsp_vcpu, + devices_ref, + guest_mem_ref, + shutdown_ref, + run_config_ref, + cancellers_ref, + ap_states_ref, + ctx_id, + diag_ref, + num_vcpus, + ); + // BSP exited — signal all APs to exit. + shutdown_ref.store(true, Ordering::Release); + for c in cancellers_ref { + let _ = c.cancel(); + } + // Wake any APs still waiting for SIPI. + for ap in ap_states_ref { + *ap.started.lock().unwrap() = true; + ap.condvar.notify_one(); + } + exit_code = bsp_code; + }); + + // Stop the timer thread and block I/O workers. + run_config.request_stop(); + shutdown.store(true, Ordering::Release); + devices.lock().unwrap().stop_blk_workers(); + let _ = timer_thread.join(); + } + + log::info!("VM exited with code {}", exit_code); + eprintln!("[WHPX] VM exited, code={}", exit_code); + + Ok(exit_code) + } + + /// Per-vCPU statistics counters. + struct VcpuStats { + exit_count: u64, + halt_count: u64, + total_halt_exits: u64, + halt_with_irq: u64, + mmio_count: u64, + serial_out_count: u64, + io_out_count: u64, + io_in_count: u64, + inject_count: u64, + last_progress: Instant, + start_time: Instant, + window_requested: bool, + } + + impl VcpuStats { + fn new() -> Self { + let now = Instant::now(); + Self { + exit_count: 0, + halt_count: 0, + total_halt_exits: 0, + halt_with_irq: 0, + mmio_count: 0, + serial_out_count: 0, + io_out_count: 0, + io_in_count: 0, + inject_count: 0, + last_progress: now, + start_time: now, + window_requested: false, } - }); - - // vCPU run loop — GuestMemory implements GuestMemoryAccessor directly. - let mem_ref: &GuestMemory = &guest_mem; - let mut exit_count: u64 = 0; - let mut halt_count: u64 = 0; - let mut total_halt_exits: u64 = 0; - let mut halt_with_irq: u64 = 0; - let start_time = Instant::now(); - let mut last_progress = Instant::now(); - let mut mmio_count: u64 = 0; + } + } + + /// Try to inject a pending interrupt into a vCPU. + /// + /// Returns the number of interrupts injected (0 or 1). + fn try_inject_interrupt( + vcpu: &WhpxVcpu, + vcpu_id: u8, + devices: &mut DeviceManager, + stats: &mut VcpuStats, + ) -> Result<()> { + if !devices.irq_chip.has_pending(vcpu_id) { + return Ok(()); + } + + let already_pending = vcpu.has_pending_interruption().unwrap_or(false); + if already_pending { + return Ok(()); + } + + match vcpu.interrupts_enabled() { + Ok(true) => { + if let Some(vector) = devices.irq_chip.acknowledge(vcpu_id) { + log::debug!("vCPU{}: injecting interrupt vector {:#X}", vcpu_id, vector); + vcpu.inject_interrupt(vector)?; + devices.irq_chip.notify_injected(vcpu_id, vector); + stats.window_requested = false; + stats.inject_count += 1; + } + } + Ok(false) => { + if !stats.window_requested { + vcpu.request_interrupt_window()?; + stats.window_requested = true; + } + } + Err(ref e) => { + log::warn!("vCPU{}: interrupts_enabled() error: {:?}", vcpu_id, e); + } + } + Ok(()) + } + + /// Dispatch an IPI action from a LAPIC ICR write. + fn dispatch_ipi( + action: IpiAction, + devices: &mut DeviceManager, + ap_states: &[ApStartupState], + cancellers: &[VcpuCanceller], + ) { + match action { + IpiAction::None => {} + IpiAction::SendInit { target_apic_id } => { + let ap_idx = target_apic_id as usize; + if ap_idx > 0 && ap_idx - 1 < ap_states.len() { + ap_states[ap_idx - 1] + .init_received + .store(true, Ordering::Release); + log::info!("INIT delivered to AP{}", target_apic_id); + } + } + IpiAction::SendSipi { + target_apic_id, + vector, + } => { + let ap_idx = target_apic_id as usize; + if ap_idx > 0 && ap_idx - 1 < ap_states.len() { + let state = &ap_states[ap_idx - 1]; + if state.init_received.load(Ordering::Acquire) { + *state.sipi_vector.lock().unwrap() = Some(vector); + *state.started.lock().unwrap() = true; + state.condvar.notify_one(); + log::info!( + "SIPI delivered to AP{}: start at {:#X}", + target_apic_id, + (vector as u64) * 0x1000 + ); + } else { + log::warn!( + "SIPI to AP{} ignored (no INIT received)", + target_apic_id, + ); + } + } + } + IpiAction::SendInterrupt { + target_apic_id, + vector, + } => { + devices + .irq_chip + .deliver_ipi_interrupt(target_apic_id, vector); + let idx = target_apic_id as usize; + if idx < cancellers.len() { + let _ = cancellers[idx].cancel(); + } + log::debug!( + "IPI interrupt vector {:#X} delivered to vCPU{}", + vector, + target_apic_id, + ); + } + } + } + + /// BSP (Bootstrap Processor, vCPU 0) main loop. + /// + /// Handles timer ticking, device polling, interrupt injection, block worker + /// start, IPI dispatch, and progress diagnostics. + #[allow(clippy::too_many_arguments)] + fn run_bsp_loop( + vcpu: &WhpxVcpu, + devices: &Arc>, + guest_mem: &GuestMemory, + shutdown: &AtomicBool, + run_config: &VcpuRunConfig, + cancellers: &[VcpuCanceller], + ap_states: &[ApStartupState], + ctx_id: u32, + diag_log: &Arc>>, + num_vcpus: u8, + ) -> i32 { + macro_rules! diag { + ($($arg:tt)*) => { + if let Ok(mut guard) = diag_log.lock() { + if let Some(ref mut f) = *guard { + let _ = writeln!(f, $($arg)*); + let _ = f.flush(); + } + } + }; + } + + let mut stats = VcpuStats::new(); let mut blk_workers_started = false; let sync_block = std::env::var("BOXLITE_SYNC_BLOCK").is_ok(); - let mut serial_out_count: u64 = 0; - let mut io_out_count: u64 = 0; - let mut io_in_count: u64 = 0; - let mut inject_count: u64 = 0; - let mut last_exit_reason = "none"; - let exit_code; + let mut _last_exit_reason = "none"; loop { - // Tick PIT and poll devices. - devices.tick_and_poll(mem_ref); - - // Try to inject pending interrupt. - // CRITICAL: Do NOT acknowledge a new PIC interrupt if a previous - // injection is still pending in WHPX. Overwriting the pending - // interruption register would lose the old interrupt and leave - // its PIC ISR bit permanently stuck (guest never sends EOI). - if devices.irq_chip.has_pending() { - let already_pending = vcpu - .has_pending_interruption() - .unwrap_or(false); - if already_pending { - // Previous interrupt not yet delivered — skip this cycle. - } else { - match vcpu.interrupts_enabled() { - Ok(true) => { - if let Some(vector) = devices.irq_chip.acknowledge() { - log::debug!("Injecting interrupt vector {:#X}", vector); - vcpu.inject_interrupt(vector)?; - devices.irq_chip.notify_injected(vector); - devices.set_window_requested(false); - inject_count += 1; - } - } - Ok(false) => { - if !devices.window_requested() { - vcpu.request_interrupt_window()?; - devices.set_window_requested(true); - } - } - Err(ref e) => { - log::warn!("interrupts_enabled() error: {:?}", e); - } - } + if shutdown.load(Ordering::Relaxed) || !run_config.should_run() { + _last_exit_reason = "SHUTDOWN_SIGNAL"; + return 0; + } + + // Tick PIT and poll devices (BSP only). + { + let mut dm = devices.lock().unwrap(); + dm.tick_and_poll(0, guest_mem); + // Try to inject pending interrupt. + if let Err(e) = try_inject_interrupt(vcpu, 0, &mut dm, &mut stats) { + log::error!("BSP interrupt injection error: {:?}", e); } } let exit = match vcpu.run() { Ok(exit) => exit, Err(e) => { - log::error!( - "vcpu.run() FAILED after {} exits: {:?}", - exit_count, - e - ); - eprintln!( - "[WHPX] vcpu.run() FAILED after {} exits: {:?}", - exit_count, e - ); - last_exit_reason = "VCPU_RUN_ERROR"; - exit_code = 1; - break; + log::error!("BSP vcpu.run() FAILED after {} exits: {:?}", stats.exit_count, e); + eprintln!("[WHPX] BSP vcpu.run() FAILED after {} exits: {:?}", stats.exit_count, e); + return 1; } }; - exit_count += 1; + stats.exit_count += 1; match exit { VcpuExit::IoOut { port, size, data } => { - halt_count = 0; - io_out_count += 1; + stats.halt_count = 0; + stats.io_out_count += 1; if port == 0x3F8 { - serial_out_count += 1; + stats.serial_out_count += 1; } - devices.handle_io_out(port, size, data); - if devices.shutdown_requested() { - log::info!("ACPI shutdown detected after {} exits", exit_count); - last_exit_reason = "ACPI_SHUTDOWN"; - exit_code = 0; - break; + let mut dm = devices.lock().unwrap(); + dm.handle_io_out(port, size, data); + if dm.shutdown_requested() { + log::info!("ACPI shutdown detected after {} exits", stats.exit_count); + _last_exit_reason = "ACPI_SHUTDOWN"; + return 0; + } + drop(dm); + if let Err(e) = vcpu.skip_instruction() { + log::error!("BSP skip_instruction error: {:?}", e); + return 1; } - vcpu.skip_instruction()?; } VcpuExit::IoIn { port, size } => { - halt_count = 0; - io_in_count += 1; - let data = devices.handle_io_in(port, size); - vcpu.complete_io_in(data, size)?; + stats.halt_count = 0; + stats.io_in_count += 1; + let data = devices.lock().unwrap().handle_io_in(port, size); + if let Err(e) = vcpu.complete_io_in(data, size) { + log::error!("BSP complete_io_in error: {:?}", e); + return 1; + } } VcpuExit::MmioRead { address, size } => { - halt_count = 0; - mmio_count += 1; - let data = devices.handle_mmio_read(address, size); - vcpu.complete_mmio_read(data)?; + stats.halt_count = 0; + stats.mmio_count += 1; + let data = devices.lock().unwrap().handle_mmio_read(0, address, size); + if let Err(e) = vcpu.complete_mmio_read(data) { + log::error!("BSP complete_mmio_read error: {:?}", e); + return 1; + } } VcpuExit::MmioWrite { address, size, data, } => { - halt_count = 0; - mmio_count += 1; - // Deferred start: spawn block I/O workers on first MMIO - // write (after vCPU is running). If BOXLITE_SYNC_BLOCK is - // set, skip workers entirely (sync disk I/O for A/B testing). + stats.halt_count = 0; + stats.mmio_count += 1; + let mut dm = devices.lock().unwrap(); if !blk_workers_started && !sync_block { - devices.start_blk_workers(); + dm.start_blk_workers(); blk_workers_started = true; let msg = format!( "Block workers started at exit={} mmio={} elapsed={:.1}ms", - exit_count, mmio_count, start_time.elapsed().as_secs_f64() * 1000.0 + stats.exit_count, + stats.mmio_count, + stats.start_time.elapsed().as_secs_f64() * 1000.0 ); eprintln!("[WHPX] {}", msg); diag!("{}", msg); } - devices.handle_mmio_write(address, size, data, mem_ref); - vcpu.skip_instruction()?; + let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); + // Dispatch IPI if this was an ICR write. + if !matches!(ipi_action, IpiAction::None) { + dispatch_ipi(ipi_action, &mut dm, ap_states, cancellers); + } + drop(dm); + if let Err(e) = vcpu.skip_instruction() { + log::error!("BSP skip_instruction error: {:?}", e); + return 1; + } } VcpuExit::InterruptWindow => { - halt_count = 0; - devices.set_window_requested(false); + stats.halt_count = 0; + stats.window_requested = false; } VcpuExit::Halt => { - total_halt_exits += 1; - if !run_config.should_run() { - log::info!("VM stop requested, exiting on Halt"); - last_exit_reason = "HALT_STOP_REQUESTED"; - exit_code = 0; - break; + stats.total_halt_exits += 1; + if !run_config.should_run() || shutdown.load(Ordering::Relaxed) { + log::info!("BSP: stop requested, exiting on Halt"); + return 0; } - // Poll devices before sleeping — a pending interrupt may - // have arrived (e.g. PIT tick, vsock data) while the guest - // was halted. - devices.tick_and_poll(mem_ref); - - if devices.irq_chip.has_pending() { - let already_pending = vcpu - .has_pending_interruption() - .unwrap_or(false); - if !already_pending { - if let Some(vector) = devices.irq_chip.acknowledge() { - vcpu.inject_interrupt(vector)?; - devices.irq_chip.notify_injected(vector); - devices.set_window_requested(false); - inject_count += 1; + // Poll devices before sleeping. + { + let mut dm = devices.lock().unwrap(); + dm.tick_and_poll(0, guest_mem); + if dm.irq_chip.has_pending(0) { + let already_pending = + vcpu.has_pending_interruption().unwrap_or(false); + if !already_pending { + if let Some(vector) = dm.irq_chip.acknowledge(0) { + let _ = vcpu.inject_interrupt(vector); + dm.irq_chip.notify_injected(0, vector); + stats.window_requested = false; + stats.inject_count += 1; + } } + stats.halt_with_irq += 1; + stats.halt_count = 0; + continue; } - halt_with_irq += 1; - halt_count = 0; - continue; } - // No pending interrupts — guest is genuinely idle. - halt_count += 1; + stats.halt_count += 1; - // Log diagnostic info every 1000 consecutive halts - if halt_count % 1000 == 0 { + if stats.halt_count % 1000 == 0 { if let Ok(regs) = vcpu.get_registers() { let console_len = devices::get_console_output(ctx_id) .map(|b| b.len()) .unwrap_or(0); let if_flag = vcpu.interrupts_enabled().unwrap_or(false); eprintln!( - "[WHPX] HLT stuck: consecutive={} total_halt={} halt_with_irq={} \ - exits={} RIP={:#X} RFLAGS={:#X} IF={} console={}B mmio={}", - halt_count, total_halt_exits, halt_with_irq, - exit_count, regs.rip, regs.rflags, - if_flag, console_len, mmio_count + "[WHPX] BSP HLT stuck: consecutive={} total_halt={} halt_with_irq={} \ + exits={} RIP={:#X} RFLAGS={:#X} IF={} console={}B mmio={} vcpus={}", + stats.halt_count, stats.total_halt_exits, stats.halt_with_irq, + stats.exit_count, regs.rip, regs.rflags, + if_flag, console_len, stats.mmio_count, num_vcpus ); } } - if halt_count > MAX_HALTS { + if stats.halt_count > MAX_HALTS { if let Ok(regs) = vcpu.get_registers() { let console_len = devices::get_console_output(ctx_id) .map(|b| b.len()) .unwrap_or(0); eprintln!( - "[WHPX] HALT_MAX: consecutive={} total_halt={} halt_with_irq={} \ + "[WHPX] BSP HALT_MAX: consecutive={} total_halt={} halt_with_irq={} \ exits={} RIP={:#X} console={}B mmio={}", - halt_count, total_halt_exits, halt_with_irq, - exit_count, regs.rip, console_len, mmio_count + stats.halt_count, stats.total_halt_exits, stats.halt_with_irq, + stats.exit_count, regs.rip, console_len, stats.mmio_count ); } log::warn!( - "vCPU halted {} times consecutively after {} exits \ - (total_halt={}, halt_with_irq={})", - halt_count, exit_count, - total_halt_exits, halt_with_irq + "BSP halted {} times consecutively after {} exits", + stats.halt_count, + stats.exit_count, ); - last_exit_reason = "HALT_MAX_REACHED"; - exit_code = 0; - break; + _last_exit_reason = "HALT_MAX_REACHED"; + return 0; } std::thread::sleep(Duration::from_millis(1)); } VcpuExit::Shutdown => { - log::info!("VM shutdown after {} exits", exit_count); - last_exit_reason = "VM_SHUTDOWN"; - exit_code = 0; - break; + log::info!("BSP: VM shutdown after {} exits", stats.exit_count); + return 0; } VcpuExit::Cancelled => { - // Timer thread or stop() cancelled vCPU. Check if we should exit. - if !run_config.should_run() { - log::info!("VM stop requested, exiting on Cancelled"); - last_exit_reason = "CANCELLED_STOP"; - exit_code = 0; - break; + if !run_config.should_run() || shutdown.load(Ordering::Relaxed) { + log::info!("BSP: stop requested on Cancelled"); + return 0; } - // Wall-clock progress report every 2 seconds. - if last_progress.elapsed() >= Duration::from_secs(2) { - last_progress = Instant::now(); + if stats.last_progress.elapsed() >= Duration::from_secs(2) { + stats.last_progress = Instant::now(); if let Ok(regs) = vcpu.get_registers() { + let dm = devices.lock().unwrap(); let console_len = devices::get_console_output(ctx_id) .map(|b| b.len()) .unwrap_or(0); - let (qn, bc) = devices.blk_stats(); - let (ioapic_mmio, lapic_mmio) = devices.apic_mmio_stats(); - let (irr, isr, imr, vbase) = devices.irq_chip.pic_master_state(); - let (s_irr, s_isr, s_imr, s_vbase) = devices.irq_chip.pic_slave_state(); - let apic_mode = devices.irq_chip.apic_mode(); + let (qn, bc) = dm.blk_stats(); + let (ioapic_mmio, lapic_mmio) = dm.apic_mmio_stats(); + let (irr, isr, imr, vbase) = dm.irq_chip.pic_master_state(); + let (s_irr, s_isr, s_imr, s_vbase) = dm.irq_chip.pic_slave_state(); + let apic_mode = dm.irq_chip.apic_mode(); let msg = format!( - "Progress @ {:.1}s: exits={} RIP={:#X} console={}B io_out={} serial={} mmio={} blk_qn={} blk_comp={} halt={}/{} halt_w_irq={} inj={} ioapic_mmio={} lapic_mmio={} pic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} spic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} irq={} mode={}", - start_time.elapsed().as_secs_f64(), - exit_count, regs.rip, console_len, - io_out_count, serial_out_count, - mmio_count, qn, bc, - halt_count, total_halt_exits, - halt_with_irq, inject_count, + "BSP @ {:.1}s: exits={} RIP={:#X} console={}B io_out={} serial={} mmio={} blk_qn={} blk_comp={} halt={}/{} halt_w_irq={} inj={} ioapic_mmio={} lapic_mmio={} pic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} spic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} irq={} mode={} vcpus={}", + stats.start_time.elapsed().as_secs_f64(), + stats.exit_count, regs.rip, console_len, + stats.io_out_count, stats.serial_out_count, + stats.mmio_count, qn, bc, + stats.halt_count, stats.total_halt_exits, + stats.halt_with_irq, stats.inject_count, ioapic_mmio, lapic_mmio, irr, isr, imr, vbase, s_irr, s_isr, s_imr, s_vbase, if apic_mode { "apic" } else { "pic" }, if sync_block { "sync" } else if blk_workers_started { "async" } else { "pending" }, + num_vcpus, ); + drop(dm); log::info!("{}", msg); eprintln!("[WHPX] {}", msg); diag!("{}", msg); @@ -532,17 +800,24 @@ mod imp { rax, rdx, } => { - halt_count = 0; + stats.halt_count = 0; if is_write { log::trace!( - "MSR write: 0x{:08X} <- 0x{:016X}", + "BSP: MSR write 0x{:08X} <- 0x{:016X}", msr_number, (rdx << 32) | (rax & 0xFFFF_FFFF) ); - vcpu.skip_instruction()?; + if let Err(e) = vcpu.skip_instruction() { + log::error!("BSP skip_instruction error: {:?}", e); + return 1; + } } else { - log::trace!("MSR read: 0x{:08X} -> 0", msr_number); - vcpu.complete_msr_read(0)?; + let value = super::handle_msr_read(0, msr_number); + log::trace!("BSP: MSR read 0x{:08X} -> 0x{:X}", msr_number, value); + if let Err(e) = vcpu.complete_msr_read(value) { + log::error!("BSP complete_msr_read error: {:?}", e); + return 1; + } } } VcpuExit::CpuidAccess { @@ -553,39 +828,34 @@ mod imp { default_rcx, default_rdx, } => { - halt_count = 0; - let leaf = rax as u32; - // Mask hypervisor-related CPUID leaves to prevent the Linux - // guest from detecting Hyper-V and trying to use enlightenments - // (synthetic timers, SynIC, TSC page) that our WHPX partition - // doesn't fully support. - let (out_rax, out_rbx, out_rcx, out_rdx) = match leaf { - // Leaf 1: clear "hypervisor present" bit (ECX bit 31). - 1 => ( - default_rax, - default_rbx, - default_rcx & !(1u64 << 31), - default_rdx, - ), - // Hyper-V CPUID range: return zeros (no hypervisor features). - 0x40000000..=0x400000FF => (0, 0, 0, 0), - _ => (default_rax, default_rbx, default_rcx, default_rdx), - }; + stats.halt_count = 0; + let (out_rax, out_rbx, out_rcx, out_rdx) = super::handle_cpuid( + 0, + num_vcpus, + rax as u32, + default_rax, + default_rbx, + default_rcx, + default_rdx, + ); log::trace!( - "CPUID leaf=0x{:X} sub=0x{:X} -> rax=0x{:X}", + "BSP CPUID leaf=0x{:X} sub=0x{:X} -> rax=0x{:X}", rax, rcx, out_rax ); - vcpu.complete_cpuid(out_rax, out_rbx, out_rcx, out_rdx)?; + if let Err(e) = vcpu.complete_cpuid(out_rax, out_rbx, out_rcx, out_rdx) { + log::error!("BSP complete_cpuid error: {:?}", e); + return 1; + } } VcpuExit::UnrecoverableException => { let regs = vcpu.get_registers().ok(); let sregs = vcpu.get_special_registers().ok(); log::error!( - "Unrecoverable exception (triple fault) after {} exits. \ + "BSP: Unrecoverable exception after {} exits. \ RIP={:#X}, CR0={:#X}, CR3={:#X}, CR4={:#X}, EFER={:#X}", - exit_count, + stats.exit_count, regs.as_ref().map_or(0, |r| r.rip), sregs.as_ref().map_or(0, |s| s.cr0), sregs.as_ref().map_or(0, |s| s.cr3), @@ -593,56 +863,288 @@ mod imp { sregs.as_ref().map_or(0, |s| s.efer), ); eprintln!( - "[WHPX] TRIPLE FAULT after {} exits, RIP={:#X}", - exit_count, + "[WHPX] BSP TRIPLE FAULT after {} exits, RIP={:#X}", + stats.exit_count, regs.as_ref().map_or(0, |r| r.rip), ); - last_exit_reason = "TRIPLE_FAULT"; - exit_code = -1; - break; + return -1; } VcpuExit::Unknown(reason) => { - log::error!( - "Unknown vCPU exit reason {} after {} exits", - reason, - exit_count - ); - last_exit_reason = "UNKNOWN_EXIT"; - exit_code = -1; - break; + log::error!("BSP: Unknown exit reason {} after {} exits", reason, stats.exit_count); + return -1; } } - if exit_count >= MAX_EXITS { - log::warn!("Reached {} exit limit", MAX_EXITS); - last_exit_reason = "MAX_EXITS"; - exit_code = -1; - break; + if stats.exit_count >= MAX_EXITS { + log::warn!("BSP reached {} exit limit", MAX_EXITS); + return -1; + } + } + } + + /// AP (Application Processor, vCPU 1..N-1) loop. + /// + /// Waits for SIPI, configures initial registers, then runs a vCPU loop + /// similar to BSP but without timer ticking or block worker management. + #[allow(clippy::too_many_arguments)] + fn run_ap_loop( + ap_id: u8, + num_vcpus: u8, + vcpu: &WhpxVcpu, + devices: &Arc>, + guest_mem: &GuestMemory, + shutdown: &AtomicBool, + run_config: &VcpuRunConfig, + cancellers: &[VcpuCanceller], + startup: &ApStartupState, + _ctx_id: u32, + ) { + // Wait for SIPI from BSP. + { + let mut started = startup.started.lock().unwrap(); + while !*started { + started = startup.condvar.wait(started).unwrap(); } } - // Stop the timer thread and block I/O workers. - run_config.request_stop(); - devices.stop_blk_workers(); - let _ = timer_thread.join(); + // Check if we were woken for shutdown rather than SIPI. + if shutdown.load(Ordering::Relaxed) { + log::info!("AP{}: woken for shutdown, not SIPI", ap_id); + return; + } + // Configure AP initial register state from SIPI vector. + let sipi_vector = startup.sipi_vector.lock().unwrap().unwrap_or(0); log::info!( - "VM exited with code {} ({} exits), reason={}", - exit_code, - exit_count, - last_exit_reason - ); - let exit_msg = format!( - "VM exited, code={} exits={} reason={} io_out={} serial={} io_in={} mmio={} halt={}/{} elapsed={:.1}s", - exit_code, exit_count, last_exit_reason, - io_out_count, serial_out_count, io_in_count, - mmio_count, total_halt_exits, halt_with_irq, - start_time.elapsed().as_secs_f64(), + "AP{}: SIPI received, starting at vector={:#X} ({:#X})", + ap_id, + sipi_vector, + (sipi_vector as u64) * 0x1000 ); - eprintln!("[WHPX] {}", exit_msg); - diag!("{}", exit_msg); - Ok(exit_code) + // AP starts in real mode: CS:IP = (sipi_vector * 0x100):0x0000 + // The Linux kernel SMP trampoline is placed at sipi_vector * 0x1000. + if let Err(e) = vcpu.set_ap_initial_regs(sipi_vector, ap_id) { + log::error!("AP{}: failed to set initial registers: {:?}", ap_id, e); + return; + } + + let mut stats = VcpuStats::new(); + + loop { + if shutdown.load(Ordering::Relaxed) || !run_config.should_run() { + log::info!("AP{}: shutdown signal received", ap_id); + return; + } + + // Try to inject pending interrupt (no timer ticking for APs). + { + let mut dm = devices.lock().unwrap(); + if let Err(e) = try_inject_interrupt(vcpu, ap_id, &mut dm, &mut stats) { + log::error!("AP{}: interrupt injection error: {:?}", ap_id, e); + } + } + + let exit = match vcpu.run() { + Ok(exit) => exit, + Err(e) => { + log::error!( + "AP{}: vcpu.run() FAILED after {} exits: {:?}", + ap_id, + stats.exit_count, + e + ); + return; + } + }; + stats.exit_count += 1; + + match exit { + VcpuExit::IoOut { port, size, data } => { + stats.halt_count = 0; + stats.io_out_count += 1; + let mut dm = devices.lock().unwrap(); + dm.handle_io_out(port, size, data); + if dm.shutdown_requested() { + log::info!("AP{}: ACPI shutdown detected", ap_id); + shutdown.store(true, Ordering::Release); + for c in cancellers { + let _ = c.cancel(); + } + return; + } + drop(dm); + let _ = vcpu.skip_instruction(); + } + VcpuExit::IoIn { port, size } => { + stats.halt_count = 0; + stats.io_in_count += 1; + let data = devices.lock().unwrap().handle_io_in(port, size); + let _ = vcpu.complete_io_in(data, size); + } + VcpuExit::MmioRead { address, size } => { + stats.halt_count = 0; + stats.mmio_count += 1; + let data = devices.lock().unwrap().handle_mmio_read(ap_id, address, size); + let _ = vcpu.complete_mmio_read(data); + } + VcpuExit::MmioWrite { + address, + size, + data, + } => { + stats.halt_count = 0; + stats.mmio_count += 1; + let mut dm = devices.lock().unwrap(); + let ipi_action = dm.handle_mmio_write(ap_id, address, size, data, guest_mem); + if !matches!(ipi_action, IpiAction::None) { + // APs can send IPIs too (e.g., IPI to BSP for TLB shootdown). + dispatch_ipi(ipi_action, &mut dm, &[], cancellers); + } + drop(dm); + let _ = vcpu.skip_instruction(); + } + VcpuExit::InterruptWindow => { + stats.halt_count = 0; + stats.window_requested = false; + } + VcpuExit::Halt => { + stats.total_halt_exits += 1; + if shutdown.load(Ordering::Relaxed) || !run_config.should_run() { + log::info!("AP{}: stop requested on Halt", ap_id); + return; + } + + // Check for pending interrupts. + { + let mut dm = devices.lock().unwrap(); + if dm.irq_chip.has_pending(ap_id) { + let already_pending = + vcpu.has_pending_interruption().unwrap_or(false); + if !already_pending { + if let Some(vector) = dm.irq_chip.acknowledge(ap_id) { + let _ = vcpu.inject_interrupt(vector); + dm.irq_chip.notify_injected(ap_id, vector); + stats.window_requested = false; + stats.inject_count += 1; + } + } + stats.halt_with_irq += 1; + stats.halt_count = 0; + continue; + } + } + + stats.halt_count += 1; + + if stats.halt_count > MAX_HALTS { + log::info!( + "AP{}: halted {} times, idling (not treated as fatal)", + ap_id, + stats.halt_count, + ); + // APs can idle indefinitely — the kernel may park them. + // Don't exit, just keep waiting for interrupts. + stats.halt_count = 0; + } + std::thread::sleep(Duration::from_millis(1)); + } + VcpuExit::Shutdown => { + log::info!("AP{}: shutdown after {} exits", ap_id, stats.exit_count); + return; + } + VcpuExit::Cancelled => { + if shutdown.load(Ordering::Relaxed) || !run_config.should_run() { + log::info!("AP{}: stop requested on Cancelled", ap_id); + return; + } + } + VcpuExit::MsrAccess { + msr_number, + is_write, + rax, + rdx, + } => { + stats.halt_count = 0; + if is_write { + log::trace!( + "AP{}: MSR write 0x{:08X} <- 0x{:016X}", + ap_id, + msr_number, + (rdx << 32) | (rax & 0xFFFF_FFFF) + ); + if let Err(e) = vcpu.skip_instruction() { + log::error!("AP{} skip_instruction error: {:?}", ap_id, e); + return; + } + } else { + let value = super::handle_msr_read(ap_id, msr_number); + log::trace!( + "AP{}: MSR read 0x{:08X} -> 0x{:X}", + ap_id, + msr_number, + value + ); + if let Err(e) = vcpu.complete_msr_read(value) { + log::error!("AP{} complete_msr_read error: {:?}", ap_id, e); + return; + } + } + } + VcpuExit::CpuidAccess { + rax, + rcx, + default_rax, + default_rbx, + default_rcx, + default_rdx, + } => { + stats.halt_count = 0; + let (out_rax, out_rbx, out_rcx, out_rdx) = super::handle_cpuid( + ap_id, + num_vcpus, + rax as u32, + default_rax, + default_rbx, + default_rcx, + default_rdx, + ); + log::trace!( + "AP{} CPUID leaf=0x{:X} sub=0x{:X} -> rax=0x{:X}", + ap_id, + rax, + rcx, + out_rax + ); + if let Err(e) = vcpu.complete_cpuid(out_rax, out_rbx, out_rcx, out_rdx) { + log::error!("AP{} complete_cpuid error: {:?}", ap_id, e); + return; + } + } + VcpuExit::UnrecoverableException => { + log::error!( + "AP{}: triple fault after {} exits", + ap_id, + stats.exit_count + ); + return; + } + VcpuExit::Unknown(reason) => { + log::error!( + "AP{}: unknown exit reason {} after {} exits", + ap_id, + reason, + stats.exit_count + ); + return; + } + } + + if stats.exit_count >= MAX_EXITS { + log::warn!("AP{}: reached {} exit limit", ap_id, MAX_EXITS); + return; + } + } } /// Run a VM synchronously on the calling thread (blocking). @@ -766,6 +1268,59 @@ pub fn stop(_ctx_id: u32) -> super::error::Result<()> { )) } +/// Handle CPUID exit for any vCPU. +/// +/// Injects CPU topology info into leaf 1 and masks Hyper-V leaves. +/// This is a pure function (no side effects) for testability. +fn handle_cpuid( + vcpu_id: u8, + num_vcpus: u8, + leaf: u32, + default_rax: u64, + default_rbx: u64, + default_rcx: u64, + default_rdx: u64, +) -> (u64, u64, u64, u64) { + match leaf { + // Leaf 1: feature info + topology. + // EBX[23:16] = max number of addressable APIC IDs (num_vcpus) + // EBX[31:24] = initial APIC ID (vcpu_id) + // ECX bit 31: clear "hypervisor present" + 1 => { + let mut ebx = default_rbx; + // Clear bits 31:16, then set topology fields. + ebx &= 0xFFFF_FFFF_0000_FFFF; + ebx |= (num_vcpus as u64) << 16; // max APIC IDs + ebx |= (vcpu_id as u64) << 24; // initial APIC ID + ( + default_rax, + ebx, + default_rcx & !(1u64 << 31), // clear hypervisor present + default_rdx, + ) + } + // Hyper-V CPUID range: return zeros. + 0x40000000..=0x400000FF => (0, 0, 0, 0), + _ => (default_rax, default_rbx, default_rcx, default_rdx), + } +} + +/// Handle MSR read for any vCPU. +/// +/// Returns the value to inject for the given MSR. +/// IA32_APIC_BASE (0x1B) returns the APIC base address with enable + BSP bits. +fn handle_msr_read(vcpu_id: u8, msr_number: u32) -> u64 { + if msr_number == 0x1B { + let mut val: u64 = 0xFEE0_0000 | (1 << 11); // APIC base + enable bit + if vcpu_id == 0 { + val |= 1 << 8; // BSP flag + } + val + } else { + 0 + } +} + #[cfg(test)] mod tests { use super::super::context::VmContext; @@ -904,4 +1459,89 @@ mod tests { assert!(result.is_err()); } } + + // --- handle_cpuid tests --- + + #[test] + fn test_cpuid_leaf1_topology_bsp() { + // BSP (vcpu 0) with 2 vCPUs. + let (rax, rbx, rcx, rdx) = + super::handle_cpuid(0, 2, 1, 0x1234, 0x0000_0000_0000_5678, 0x8000_0001, 0xABCD); + // EBX[23:16] = num_vcpus = 2, EBX[31:24] = vcpu_id = 0 + assert_eq!(rbx & 0x00FF_0000, 0x0002_0000, "EBX[23:16] should be 2"); + assert_eq!(rbx & 0xFF00_0000, 0x0000_0000, "EBX[31:24] should be 0 for BSP"); + // EBX[15:0] preserved from default + assert_eq!(rbx & 0xFFFF, 0x5678, "EBX[15:0] should be preserved"); + // ECX bit 31 (hypervisor present) must be cleared + assert_eq!(rcx & (1 << 31), 0, "hypervisor present bit must be cleared"); + // RAX and RDX pass through + assert_eq!(rax, 0x1234); + assert_eq!(rdx, 0xABCD); + } + + #[test] + fn test_cpuid_leaf1_topology_ap() { + // AP (vcpu 3) with 4 vCPUs. + let (_, rbx, _, _) = + super::handle_cpuid(3, 4, 1, 0, 0, 0, 0); + assert_eq!( + (rbx >> 16) & 0xFF, + 4, + "EBX[23:16] should be num_vcpus=4" + ); + assert_eq!( + (rbx >> 24) & 0xFF, + 3, + "EBX[31:24] should be vcpu_id=3" + ); + } + + #[test] + fn test_cpuid_hyperv_leaves_zeroed() { + // Hyper-V CPUID range should return all zeros. + for leaf in [0x40000000u32, 0x40000001, 0x400000FF] { + let (rax, rbx, rcx, rdx) = + super::handle_cpuid(0, 1, leaf, 0xDEAD, 0xBEEF, 0xCAFE, 0xF00D); + assert_eq!((rax, rbx, rcx, rdx), (0, 0, 0, 0), "Hyper-V leaf 0x{:X} must be zeroed", leaf); + } + } + + #[test] + fn test_cpuid_passthrough_other_leaves() { + // Non-special leaves should pass through defaults unchanged. + let (rax, rbx, rcx, rdx) = + super::handle_cpuid(0, 2, 0, 0x1111, 0x2222, 0x3333, 0x4444); + assert_eq!((rax, rbx, rcx, rdx), (0x1111, 0x2222, 0x3333, 0x4444)); + + let (rax, rbx, rcx, rdx) = + super::handle_cpuid(0, 2, 7, 0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD); + assert_eq!((rax, rbx, rcx, rdx), (0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD)); + } + + // --- handle_msr_read tests --- + + #[test] + fn test_msr_apic_base_bsp() { + // BSP should have enable + BSP flag. + let val = super::handle_msr_read(0, 0x1B); + assert_eq!(val & 0xFFFFF000, 0xFEE0_0000, "APIC base address"); + assert_ne!(val & (1 << 11), 0, "APIC enable bit must be set"); + assert_ne!(val & (1 << 8), 0, "BSP flag must be set for vcpu 0"); + } + + #[test] + fn test_msr_apic_base_ap() { + // AP should have enable but NOT BSP flag. + let val = super::handle_msr_read(1, 0x1B); + assert_eq!(val & 0xFFFFF000, 0xFEE0_0000, "APIC base address"); + assert_ne!(val & (1 << 11), 0, "APIC enable bit must be set"); + assert_eq!(val & (1 << 8), 0, "BSP flag must NOT be set for AP"); + } + + #[test] + fn test_msr_unknown_returns_zero() { + // Unknown MSR should return 0. + assert_eq!(super::handle_msr_read(0, 0x174), 0); + assert_eq!(super::handle_msr_read(1, 0xC000_0080), 0); + } } diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs index 1bafde4f5..7e9a7901f 100644 --- a/src/vmm/src/windows/whpx.rs +++ b/src/vmm/src/windows/whpx.rs @@ -254,7 +254,11 @@ mod imp { // SAFETY: Each vCPU is operated on by a single thread at a time. // The WHPX API permits calling WHvRunVirtualProcessor from a dedicated thread. + // Sync is needed because std::thread::scope borrows &WhpxVcpu across threads, + // but each &WhpxVcpu is only accessed by its dedicated vCPU thread — no + // concurrent access to the Cell fields occurs. unsafe impl Send for WhpxVcpu {} + unsafe impl Sync for WhpxVcpu {} impl WhpxVcpu { /// Create a new virtual processor in the given partition. @@ -1023,12 +1027,77 @@ mod imp { index: self.index, } } + + /// Configure AP initial register state after receiving SIPI. + /// + /// Sets the AP into real mode with CS:IP pointing to the SIPI trampoline: + /// - CS.base = sipi_vector * 0x1000, CS.selector = sipi_vector * 0x100 + /// - IP = 0 + /// - DL = APIC ID (Linux convention for AP identification) + /// - All other regs = 0 / default real mode values + pub fn set_ap_initial_regs(&self, sipi_vector: u8, apic_id: u8) -> Result<()> { + use super::super::types::{SegmentRegister, SpecialRegisters, StandardRegisters}; + + let cs_base = (sipi_vector as u64) * 0x1000; + let cs_selector = (sipi_vector as u16) * 0x100; + + let regs = StandardRegisters { + rdx: apic_id as u64, // Linux uses DL for APIC ID on AP startup + ..Default::default() + }; + + let sregs = SpecialRegisters { + cs: SegmentRegister { + base: cs_base, + limit: 0xFFFF, + selector: cs_selector, + access_rights: 0x9B, // present, code, readable, accessed + }, + ds: SegmentRegister { + base: 0, + limit: 0xFFFF, + selector: 0, + access_rights: 0x93, // present, data, writable, accessed + }, + es: SegmentRegister { + base: 0, + limit: 0xFFFF, + selector: 0, + access_rights: 0x93, + }, + fs: SegmentRegister { + base: 0, + limit: 0xFFFF, + selector: 0, + access_rights: 0x93, + }, + gs: SegmentRegister { + base: 0, + limit: 0xFFFF, + selector: 0, + access_rights: 0x93, + }, + ss: SegmentRegister { + base: 0, + limit: 0xFFFF, + selector: 0, + access_rights: 0x93, + }, + cr0: 0x10, // ET (Extension Type) — required for real mode on x86 + ..Default::default() + }; + + self.set_registers(®s)?; + self.set_special_registers(&sregs)?; + Ok(()) + } } /// Lightweight handle for cancelling a running vCPU from another thread. /// /// Only supports the cancel operation — safe to use from a timer thread /// to preempt the vCPU for interrupt delivery. + #[derive(Clone)] pub struct VcpuCanceller { partition_handle: WHV_PARTITION_HANDLE, index: u32, From 85b7f5e2ffde9c3fabc55e4b7608fff4cdfe1f6b Mon Sep 17 00:00:00 2001 From: lile Date: Thu, 30 Apr 2026 08:12:52 +0800 Subject: [PATCH 22/33] feat(windows): add virtio-rng and virtio-balloon devices Add two new virtio devices for improved guest quality: - virtio-rng (slot 5, IRQ 10): Provides host entropy to guest via /dev/hwrng using OsRng. Purely guest-initiated, no async worker. - virtio-balloon (slot 6, IRQ 11): Protocol-only dynamic memory management with inflate/deflate queues. Actual memory discard deferred to future iteration. Also adds write_config() to VirtioDeviceBackend trait (default no-op) for devices with writable config space (balloon needs it for 'actual'). Co-Authored-By: Claude Opus 4.6 --- src/devices/src/virtio/vsock/muxer.rs | 3 +- src/libkrun/src/lib.rs | 4 +- src/libkrun/src/windows_api.rs | 27 +- src/vmm/src/builder.rs | 4 +- src/vmm/src/windows/boot/acpi.rs | 6 +- src/vmm/src/windows/boot/loader.rs | 89 +++++- src/vmm/src/windows/boot/mod.rs | 1 + src/vmm/src/windows/boot/mp_table.rs | 272 ++++++++++++++++++ src/vmm/src/windows/boot/params.rs | 25 ++ src/vmm/src/windows/devices/ioapic.rs | 4 +- src/vmm/src/windows/devices/irq_chip.rs | 32 ++- src/vmm/src/windows/devices/lapic.rs | 60 ++-- src/vmm/src/windows/devices/manager.rs | 112 +++++++- src/vmm/src/windows/devices/pic.rs | 12 +- src/vmm/src/windows/devices/virtio/balloon.rs | 240 ++++++++++++++++ src/vmm/src/windows/devices/virtio/block.rs | 33 ++- .../windows/devices/virtio/block_worker.rs | 18 +- src/vmm/src/windows/devices/virtio/mmio.rs | 11 + src/vmm/src/windows/devices/virtio/mod.rs | 4 + src/vmm/src/windows/devices/virtio/rng.rs | 151 ++++++++++ .../devices/virtio/vsock/connection.rs | 16 +- .../src/windows/devices/virtio/vsock/mod.rs | 34 ++- src/vmm/src/windows/memory.rs | 55 +++- src/vmm/src/windows/runner.rs | 206 +++++++++---- src/vmm/src/windows/whpx.rs | 71 ++--- 25 files changed, 1266 insertions(+), 224 deletions(-) create mode 100644 src/vmm/src/windows/boot/mp_table.rs create mode 100644 src/vmm/src/windows/devices/virtio/balloon.rs create mode 100644 src/vmm/src/windows/devices/virtio/rng.rs diff --git a/src/devices/src/virtio/vsock/muxer.rs b/src/devices/src/virtio/vsock/muxer.rs index ea790b099..9421a9cc9 100644 --- a/src/devices/src/virtio/vsock/muxer.rs +++ b/src/devices/src/virtio/vsock/muxer.rs @@ -4,10 +4,9 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex, RwLock}; use super::super::Queue as VirtQueue; -use super::VsockError; use super::defs; use super::defs::uapi; -use super::muxer_rxq::{MuxerRxQ, rx_to_pkt}; +use super::muxer_rxq::{rx_to_pkt, MuxerRxQ}; use super::muxer_thread::MuxerThread; use super::packet::{TsiConnectReq, TsiGetnameRsp, VsockPacket}; use super::proxy::{Proxy, ProxyRemoval, ProxyUpdate}; diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 7f696e0c3..61ae7d383 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -91,7 +91,7 @@ const KRUNFW_NAME: &str = "libkrunfw-tdx.so.5"; #[cfg(target_os = "macos")] const KRUNFW_NAME: &str = "libkrunfw.5.dylib"; -#[cfg(feature = "nitro")] +#[cfg(feature = "aws-nitro")] static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); const INIT_PATH: &str = "/init.krun"; @@ -2792,7 +2792,7 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { } } -#[cfg(feature = "nitro")] +#[cfg(feature = "aws-nitro")] #[no_mangle] fn krun_start_enter_nitro(ctx_id: u32) -> i32 { let ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { diff --git a/src/libkrun/src/windows_api.rs b/src/libkrun/src/windows_api.rs index 12883d5f5..a484b1de1 100644 --- a/src/libkrun/src/windows_api.rs +++ b/src/libkrun/src/windows_api.rs @@ -507,10 +507,7 @@ pub unsafe extern "C" fn krun_set_mapped_volumes( } #[no_mangle] -pub unsafe extern "C" fn krun_set_port_map( - _ctx_id: u32, - _c_port_map: *const *const c_char, -) -> i32 { +pub unsafe extern "C" fn krun_set_port_map(_ctx_id: u32, _c_port_map: *const *const c_char) -> i32 { 0 // No-op } @@ -557,10 +554,7 @@ pub unsafe extern "C" fn krun_add_vsock_port( } #[no_mangle] -pub unsafe extern "C" fn krun_set_tee_config_file( - _ctx_id: u32, - _c_filepath: *const c_char, -) -> i32 { +pub unsafe extern "C" fn krun_set_tee_config_file(_ctx_id: u32, _c_filepath: *const c_char) -> i32 { -libc::ENOSYS } @@ -625,11 +619,7 @@ pub extern "C" fn krun_stop(ctx_id: u32) -> i32 { /// Otherwise, copies up to `buf_size` bytes into `buf` and returns the number copied. /// Returns -1 if the ctx_id has no console buffer. #[no_mangle] -pub unsafe extern "C" fn krun_get_console_output( - ctx_id: u32, - buf: *mut u8, - buf_size: u32, -) -> i32 { +pub unsafe extern "C" fn krun_get_console_output(ctx_id: u32, buf: *mut u8, buf_size: u32) -> i32 { let output = match devices::get_console_output(ctx_id) { Some(data) => data, None => return -1, @@ -661,11 +651,7 @@ pub unsafe extern "C" fn krun_add_display(_ctx_id: u32, _width: u32, _height: u3 } #[no_mangle] -pub extern "C" fn krun_display_set_refresh_rate( - _ctx_id: u32, - _display_id: u32, - _rate: u32, -) -> i32 { +pub extern "C" fn krun_display_set_refresh_rate(_ctx_id: u32, _display_id: u32, _rate: u32) -> i32 { -libc::ENOSYS } @@ -745,10 +731,7 @@ pub unsafe extern "C" fn krun_add_serial_console_default(_ctx_id: u32) -> i32 { } #[no_mangle] -pub unsafe extern "C" fn krun_set_kernel_console( - _ctx_id: u32, - _console_id: *const c_char, -) -> i32 { +pub unsafe extern "C" fn krun_set_kernel_console(_ctx_id: u32, _console_id: *const c_char) -> i32 { -libc::ENOSYS } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5ddb9deff..2b4671380 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -72,7 +72,7 @@ use device_manager::shm::ShmManager; use devices::virtio::display::DisplayInfo; #[cfg(feature = "gpu")] use devices::virtio::display::NoopDisplayBackend; -#[cfg(not(any(feature = "tee", feature = "nitro")))] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] use devices::virtio::{fs::ExportTable, VirtioShmRegion}; use flate2::read::GzDecoder; #[cfg(feature = "gpu")] @@ -90,7 +90,7 @@ use utils::eventfd::EventFd; use utils::worker_message::WorkerMessage; #[cfg(all(target_arch = "x86_64", not(feature = "efi"), not(feature = "tee")))] use vm_memory::mmap::MmapRegion; -#[cfg(not(any(feature = "tee", feature = "nitro")))] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] use vm_memory::Address; use vm_memory::Bytes; #[cfg(not(feature = "aws-nitro"))] diff --git a/src/vmm/src/windows/boot/acpi.rs b/src/vmm/src/windows/boot/acpi.rs index 59e9dbaa9..ba18fb1ce 100644 --- a/src/vmm/src/windows/boot/acpi.rs +++ b/src/vmm/src/windows/boot/acpi.rs @@ -156,7 +156,8 @@ pub fn build_acpi_tables(acpi_base: u64, num_vcpus: u8) -> Vec { assert!( MADT_OFFSET + madt_sz <= ACPI_REGION_SIZE as usize, "MADT ({} bytes for {} vCPUs) exceeds ACPI region", - madt_sz, num_vcpus, + madt_sz, + num_vcpus, ); build_madt(&mut region[MADT_OFFSET..MADT_OFFSET + madt_sz], num_vcpus); @@ -408,8 +409,7 @@ mod tests { #[test] fn test_madt_interrupt_source_override() { let region = build_acpi_tables(TEST_BASE, 1); - let off = - MADT_OFFSET + MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE + MADT_IOAPIC_ENTRY_SIZE; + let off = MADT_OFFSET + MADT_HEADER_SIZE + MADT_LAPIC_ENTRY_SIZE + MADT_IOAPIC_ENTRY_SIZE; assert_eq!(region[off], 2, "entry type: Interrupt Source Override"); assert_eq!(region[off + 1], 10, "entry length"); diff --git a/src/vmm/src/windows/boot/loader.rs b/src/vmm/src/windows/boot/loader.rs index c43a9afe8..7145ff5ae 100644 --- a/src/vmm/src/windows/boot/loader.rs +++ b/src/vmm/src/windows/boot/loader.rs @@ -7,7 +7,9 @@ use super::super::error::{Result, WkrunError}; use super::params::HDRS_MAGIC; #[cfg(any(target_os = "windows", test))] -use super::super::memory::{MMIO_REGION_SIZE, VIRTIO_MMIO_BASE}; +use super::super::memory::{ + IOAPIC_MMIO_BASE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE, MMIO_REGION_SIZE, VIRTIO_MMIO_BASE, +}; #[cfg(any(target_os = "windows", test))] use super::params::{E820Entry, E820_ACPI, E820_RAM, E820_RESERVED}; @@ -161,13 +163,20 @@ fn build_e820_map(ram_mib: u32, acpi_base: u64, acpi_size: u64) -> Vec 0x100000 { + let apic_start = IOAPIC_MMIO_BASE; + let apic_end = LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE; + if ram_bytes > VIRTIO_MMIO_BASE { let mmio_end = VIRTIO_MMIO_BASE + MMIO_REGION_SIZE; - // High memory below MMIO. + // High memory below Virtio MMIO. entries.push(E820Entry { addr: 0x100000, size: VIRTIO_MMIO_BASE - 0x100000, @@ -175,7 +184,7 @@ fn build_e820_map(ram_mib: u32, acpi_base: u64, acpi_size: u64) -> Vec Vec mmio_end { + if ram_bytes > apic_start { + // RAM extends past APIC region — add APIC hole. + // RAM between Virtio MMIO end and APIC start. + entries.push(E820Entry { + addr: mmio_end, + size: apic_start - mmio_end, + entry_type: E820_RAM, + _pad: 0, + }); + + // APIC MMIO region (reserved: IOAPIC + LAPIC). + entries.push(E820Entry { + addr: apic_start, + size: apic_end - apic_start, + entry_type: E820_RESERVED, + _pad: 0, + }); + + // RAM above APIC region. + if ram_bytes > apic_end { + entries.push(E820Entry { + addr: apic_end, + size: ram_bytes - apic_end, + entry_type: E820_RAM, + _pad: 0, + }); + } + } else if ram_bytes > mmio_end { + // RAM between Virtio MMIO and APIC — no APIC hole needed. entries.push(E820Entry { addr: mmio_end, size: ram_bytes - mmio_end, @@ -295,9 +331,17 @@ pub fn load_kernel_with_initrd( boot_params.set_cmdline_ptr(CMDLINE_START as u32); boot_params.set_cmdline_size(cmdline_bytes.len() as u32); - // Write ACPI tables to guest memory. + // Write ACPI tables to guest memory and tell the kernel where to find them. let acpi_data = acpi::build_acpi_tables(ACPI_START, num_vcpus); guest_mem.write_at_addr(ACPI_START, &acpi_data)?; + boot_params.set_acpi_rsdp_addr(ACPI_START); + + // Write Intel MP table for SMP CPU discovery. + // The kernel (CONFIG_X86_MPPARSE=y) scans 0x9FC00-0x9FFFF for the MP + // Floating Pointer Structure. This is needed because CONFIG_ACPI is not + // enabled in the kernel, so the MADT-based CPU discovery doesn't work. + let mp_data = super::mp_table::build_mp_tables(num_vcpus); + guest_mem.write_at_addr(super::mp_table::MP_FPS_ADDR, &mp_data)?; // Set E820 memory map. let e820_map = build_e820_map(ram_mib, ACPI_START, acpi::ACPI_REGION_SIZE); @@ -514,10 +558,16 @@ mod tests { } #[test] - fn test_build_e820_map_4096mb_has_mmio_hole() { + fn test_build_e820_map_4096mb_has_mmio_and_apic_holes() { let map = build_e820_map(4096, TEST_ACPI_BASE, TEST_ACPI_SIZE); - // Low + BIOS reserved + ACPI + high1 + MMIO reserved + high2 = 6 entries. - assert_eq!(map.len(), 6, "4GB RAM should have MMIO hole: {:?}", map); + // Low + BIOS reserved + ACPI + high1 + VIRTIO reserved + high2 + // + APIC reserved + high3 = 8 entries. + assert_eq!( + map.len(), + 8, + "4GB RAM should have VIRTIO + APIC holes: {:?}", + map + ); // Low memory. assert_eq!(map[0].addr, 0); @@ -532,21 +582,32 @@ mod tests { assert_eq!(map[2].size, TEST_ACPI_SIZE); assert_eq!(map[2].entry_type, E820_ACPI); - // High memory below MMIO. + // High memory below Virtio MMIO. assert_eq!(map[3].addr, 0x100000); assert_eq!(map[3].size, VIRTIO_MMIO_BASE - 0x100000); assert_eq!(map[3].entry_type, E820_RAM); - // MMIO reserved region. + // Virtio MMIO reserved region. assert_eq!(map[4].addr, VIRTIO_MMIO_BASE); assert_eq!(map[4].size, MMIO_REGION_SIZE); assert_eq!(map[4].entry_type, E820_RESERVED); - // High memory above MMIO. + // High memory between Virtio MMIO and APIC. let mmio_end = VIRTIO_MMIO_BASE + MMIO_REGION_SIZE; assert_eq!(map[5].addr, mmio_end); - assert_eq!(map[5].size, 4096 * 1024 * 1024 - mmio_end); + assert_eq!(map[5].size, IOAPIC_MMIO_BASE - mmio_end); assert_eq!(map[5].entry_type, E820_RAM); + + // APIC reserved region (IOAPIC + LAPIC). + let apic_end = LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE; + assert_eq!(map[6].addr, IOAPIC_MMIO_BASE); + assert_eq!(map[6].size, apic_end - IOAPIC_MMIO_BASE); + assert_eq!(map[6].entry_type, E820_RESERVED); + + // High memory above APIC. + assert_eq!(map[7].addr, apic_end); + assert_eq!(map[7].size, 4096 * 1024 * 1024 - apic_end); + assert_eq!(map[7].entry_type, E820_RAM); } #[test] diff --git a/src/vmm/src/windows/boot/mod.rs b/src/vmm/src/windows/boot/mod.rs index d999ad054..17fb4b465 100644 --- a/src/vmm/src/windows/boot/mod.rs +++ b/src/vmm/src/windows/boot/mod.rs @@ -2,5 +2,6 @@ pub mod acpi; pub mod loader; +pub mod mp_table; pub mod params; pub mod setup; diff --git a/src/vmm/src/windows/boot/mp_table.rs b/src/vmm/src/windows/boot/mp_table.rs new file mode 100644 index 000000000..c461069f3 --- /dev/null +++ b/src/vmm/src/windows/boot/mp_table.rs @@ -0,0 +1,272 @@ +//! Intel MultiProcessor Specification table generation. +//! +//! Generates the MP Floating Pointer Structure and MP Configuration Table +//! so the Linux kernel can discover multiple vCPUs when CONFIG_ACPI is +//! not enabled (CONFIG_X86_MPPARSE=y is sufficient). +//! +//! The kernel scans for the MP FPS in: +//! - First 1KB of EBDA +//! - Last 1KB of base memory (0x9FC00-0x9FFFF) +//! - BIOS ROM area (0xF0000-0xFFFFF) + +/// Guest physical address for the MP Floating Pointer Structure. +/// Placed at 0x9FC00 (start of the last 1KB of base memory). +pub const MP_FPS_ADDR: u64 = 0x9_FC00; + +/// Guest physical address for the MP Configuration Table. +/// Placed right after the 16-byte FPS. +const MP_TABLE_ADDR: u64 = MP_FPS_ADDR + 16; + +/// MP FPS size (always 16 bytes). +const FPS_SIZE: usize = 16; + +/// MP Configuration Table header size. +const MP_HEADER_SIZE: usize = 44; + +/// Processor entry size (type 0). +const PROC_ENTRY_SIZE: usize = 20; + +/// I/O APIC entry size (type 2). +const IOAPIC_ENTRY_SIZE: usize = 8; + +/// LAPIC base address (must match memory.rs and acpi.rs). +const LAPIC_BASE: u32 = 0xFEE0_0000; + +/// IOAPIC base address (must match memory.rs and acpi.rs). +const IOAPIC_BASE: u32 = 0xFEC0_0000; + +/// Total size needed for the MP table region. +pub fn mp_region_size(num_vcpus: u8) -> usize { + FPS_SIZE + MP_HEADER_SIZE + (num_vcpus as usize) * PROC_ENTRY_SIZE + IOAPIC_ENTRY_SIZE +} + +/// Build the MP Floating Pointer Structure (16 bytes). +/// +/// Placed at `fps_addr`, points to the MP Configuration Table at `table_addr`. +fn build_fps(fps: &mut [u8], table_addr: u32) { + // Signature "_MP_" + fps[0..4].copy_from_slice(b"_MP_"); + // Physical Address of MP Configuration Table + fps[4..8].copy_from_slice(&table_addr.to_le_bytes()); + // Length in 16-byte paragraphs (always 1) + fps[8] = 1; + // MP Specification revision (1.4) + fps[9] = 4; + // Checksum (computed below) + // fps[10] = checksum + // Feature bytes 1-5 (all 0 = use MP config table) + fps[11] = 0; + fps[12] = 0; + fps[13] = 0; + fps[14] = 0; + fps[15] = 0; + + mp_checksum(fps, 10); +} + +/// Build the MP Configuration Table. +/// +/// Contains: +/// - 44-byte header +/// - N processor entries (20 bytes each) +/// - 1 I/O APIC entry (8 bytes) +fn build_mp_config_table(table: &mut [u8], num_vcpus: u8) { + let entry_count = num_vcpus as u16 + 1; // N processors + 1 I/O APIC + let base_table_length = + MP_HEADER_SIZE + (num_vcpus as usize) * PROC_ENTRY_SIZE + IOAPIC_ENTRY_SIZE; + + // ---- Header (44 bytes) ---- + table[0..4].copy_from_slice(b"PCMP"); // Signature + table[4..6].copy_from_slice(&(base_table_length as u16).to_le_bytes()); // Base Table Length + table[6] = 4; // Spec Revision (1.4) + // table[7] = checksum (computed below) + table[8..16].copy_from_slice(b"BOXLTE\0\0"); // OEM ID (8 bytes) + table[16..28].copy_from_slice(b"BOXLITE-VM\0\0"); // Product ID (12 bytes) + // OEM Table Pointer (offset 28, 4 bytes) = 0 + // OEM Table Size (offset 32, 2 bytes) = 0 + table[34..36].copy_from_slice(&entry_count.to_le_bytes()); // Entry Count + table[36..40].copy_from_slice(&LAPIC_BASE.to_le_bytes()); // Local APIC Address + // Extended Table Length (offset 40, 2 bytes) = 0 + // Extended Table Checksum (offset 42, 1 byte) = 0 + // Reserved (offset 43, 1 byte) = 0 + + // ---- Processor entries (type 0, 20 bytes each) ---- + let mut off = MP_HEADER_SIZE; + for i in 0..num_vcpus { + table[off] = 0; // Entry type: Processor + table[off + 1] = i; // Local APIC ID + table[off + 2] = 0x14; // Local APIC Version + // CPU Flags: bit 0 = EN (usable), bit 1 = BP (bootstrap processor) + table[off + 3] = if i == 0 { 0x03 } else { 0x01 }; // BSP=3, AP=1 + // CPU Signature (4 bytes) — use a generic Family 6 Model signature. + // This doesn't need to match the host exactly; the kernel reads CPUID directly. + table[off + 4..off + 8].copy_from_slice(&0x0006_0000u32.to_le_bytes()); + // Feature Flags (4 bytes) — basic x86-64 features. + table[off + 8..off + 12].copy_from_slice(&0x0000_0000u32.to_le_bytes()); + // Reserved (8 bytes) = 0 + off += PROC_ENTRY_SIZE; + } + + // ---- I/O APIC entry (type 2, 8 bytes) ---- + table[off] = 2; // Entry type: I/O APIC + table[off + 1] = num_vcpus; // I/O APIC ID (after all LAPIC IDs) + table[off + 2] = 0x20; // I/O APIC Version + table[off + 3] = 0x01; // Flags: EN (enabled) + table[off + 4..off + 8].copy_from_slice(&IOAPIC_BASE.to_le_bytes()); // I/O APIC Address + + // Compute header checksum over the entire base table. + mp_checksum(&mut table[..base_table_length], 7); +} + +/// Build the complete MP table region (FPS + Configuration Table). +/// +/// Returns a `Vec` that should be written to guest memory at `MP_FPS_ADDR`. +pub fn build_mp_tables(num_vcpus: u8) -> Vec { + let total_size = mp_region_size(num_vcpus); + let mut region = vec![0u8; total_size]; + + // Build the FPS (first 16 bytes), pointing to the config table. + build_fps(&mut region[..FPS_SIZE], MP_TABLE_ADDR as u32); + + // Build the MP Configuration Table (after the FPS). + build_mp_config_table(&mut region[FPS_SIZE..], num_vcpus); + + region +} + +/// Compute MP checksum and store at `checksum_offset`. +/// +/// The checksum byte is set so the sum of all bytes equals zero (mod 256). +fn mp_checksum(data: &mut [u8], checksum_offset: usize) { + data[checksum_offset] = 0; + let sum: u8 = data.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + data[checksum_offset] = 0u8.wrapping_sub(sum); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fps_signature_and_checksum() { + let region = build_mp_tables(2); + let fps = ®ion[..FPS_SIZE]; + + assert_eq!(&fps[0..4], b"_MP_"); + let sum: u8 = fps.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "FPS checksum must be zero"); + } + + #[test] + fn test_fps_points_to_table() { + let region = build_mp_tables(2); + let fps = ®ion[..FPS_SIZE]; + + let table_addr = u32::from_le_bytes(fps[4..8].try_into().unwrap()); + assert_eq!(table_addr, MP_TABLE_ADDR as u32); + } + + #[test] + fn test_fps_revision() { + let region = build_mp_tables(2); + assert_eq!(region[8], 1, "FPS length must be 1 paragraph"); + assert_eq!(region[9], 4, "FPS revision must be 1.4"); + } + + #[test] + fn test_mp_table_signature_and_checksum() { + let region = build_mp_tables(2); + let table_start = FPS_SIZE; + let table_len = MP_HEADER_SIZE + 2 * PROC_ENTRY_SIZE + IOAPIC_ENTRY_SIZE; + let table = ®ion[table_start..table_start + table_len]; + + assert_eq!(&table[0..4], b"PCMP"); + + let length = u16::from_le_bytes(table[4..6].try_into().unwrap()); + assert_eq!(length, table_len as u16); + + let sum: u8 = table.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)); + assert_eq!(sum, 0, "MP table checksum must be zero"); + } + + #[test] + fn test_mp_table_processor_entries() { + let region = build_mp_tables(4); + let table_start = FPS_SIZE; + + // Check entry count + let entry_count = u16::from_le_bytes( + region[table_start + 34..table_start + 36] + .try_into() + .unwrap(), + ); + assert_eq!(entry_count, 5, "4 processors + 1 I/O APIC"); + + // Check processor entries + for i in 0..4u8 { + let off = table_start + MP_HEADER_SIZE + (i as usize) * PROC_ENTRY_SIZE; + assert_eq!(region[off], 0, "entry type: Processor for vCPU {}", i); + assert_eq!(region[off + 1], i, "APIC ID for vCPU {}", i); + assert_eq!(region[off + 2], 0x14, "APIC version for vCPU {}", i); + if i == 0 { + assert_eq!(region[off + 3], 0x03, "BSP flags (EN + BP)"); + } else { + assert_eq!(region[off + 3], 0x01, "AP flags (EN only)"); + } + } + } + + #[test] + fn test_mp_table_ioapic_entry() { + let region = build_mp_tables(2); + let table_start = FPS_SIZE; + let ioapic_off = table_start + MP_HEADER_SIZE + 2 * PROC_ENTRY_SIZE; + + assert_eq!(region[ioapic_off], 2, "entry type: I/O APIC"); + assert_eq!(region[ioapic_off + 1], 2, "I/O APIC ID"); + assert_eq!(region[ioapic_off + 3], 0x01, "enabled flag"); + + let addr = u32::from_le_bytes(region[ioapic_off + 4..ioapic_off + 8].try_into().unwrap()); + assert_eq!(addr, IOAPIC_BASE); + } + + #[test] + fn test_mp_table_lapic_address() { + let region = build_mp_tables(1); + let table_start = FPS_SIZE; + + let lapic_addr = u32::from_le_bytes( + region[table_start + 36..table_start + 40] + .try_into() + .unwrap(), + ); + assert_eq!(lapic_addr, LAPIC_BASE); + } + + #[test] + fn test_single_vcpu() { + let region = build_mp_tables(1); + let total = mp_region_size(1); + assert_eq!(region.len(), total); + + let table_start = FPS_SIZE; + let entry_count = u16::from_le_bytes( + region[table_start + 34..table_start + 36] + .try_into() + .unwrap(), + ); + assert_eq!(entry_count, 2, "1 processor + 1 I/O APIC"); + } + + #[test] + fn test_region_fits_in_base_memory() { + // MP tables for up to 16 vCPUs must fit in the scan area. + let max_size = mp_region_size(16); + // FPS at 0x9FC00, scan area is 0x9FC00-0x9FFFF (1024 bytes). + assert!( + max_size <= 1024, + "MP tables for 16 vCPUs ({} bytes) exceed scan area", + max_size, + ); + } +} diff --git a/src/vmm/src/windows/boot/params.rs b/src/vmm/src/windows/boot/params.rs index b1c410e6b..8562988ab 100644 --- a/src/vmm/src/windows/boot/params.rs +++ b/src/vmm/src/windows/boot/params.rs @@ -146,6 +146,15 @@ impl BootParams { self.data[0x218..0x21C].copy_from_slice(&addr.to_le_bytes()); self.data[0x21C..0x220].copy_from_slice(&size.to_le_bytes()); } + + /// Set the ACPI RSDP physical address (boot protocol 2.14+, offset 0x070). + /// + /// When set, the kernel uses this address directly instead of scanning + /// the BIOS ROM area (0xE0000-0xFFFFF) for the RSDP signature. + /// For older kernels (protocol < 2.14), this field is padding and ignored. + pub fn set_acpi_rsdp_addr(&mut self, addr: u64) { + self.data[0x070..0x078].copy_from_slice(&addr.to_le_bytes()); + } } #[cfg(test)] @@ -242,4 +251,20 @@ mod tests { assert_eq!(addr, 0x1000000); assert_eq!(size, 0x500000); } + + #[test] + fn test_boot_params_acpi_rsdp_addr() { + let mut params = BootParams::new(); + params.set_acpi_rsdp_addr(0xE0000); + + let addr = u64::from_le_bytes(params.data[0x070..0x078].try_into().unwrap()); + assert_eq!(addr, 0xE0000); + } + + #[test] + fn test_boot_params_acpi_rsdp_addr_default_zero() { + let params = BootParams::new(); + let addr = u64::from_le_bytes(params.data[0x070..0x078].try_into().unwrap()); + assert_eq!(addr, 0, "acpi_rsdp_addr should be 0 by default"); + } } diff --git a/src/vmm/src/windows/devices/ioapic.rs b/src/vmm/src/windows/devices/ioapic.rs index dc6779bcc..00fa4f5d3 100644 --- a/src/vmm/src/windows/devices/ioapic.rs +++ b/src/vmm/src/windows/devices/ioapic.rs @@ -207,8 +207,8 @@ impl IoApic { fn read_register(&self, reg: u8) -> u32 { match reg { 0x00 => (self.id as u32) << 24, // IOAPIC ID - 0x01 => IOAPIC_VERSION, // Version - 0x02 => 0, // Arbitration ID (not used) + 0x01 => IOAPIC_VERSION, // Version + 0x02 => 0, // Arbitration ID (not used) 0x10..=0x3F => { let pin = ((reg - 0x10) / 2) as usize; if pin < NUM_PINS { diff --git a/src/vmm/src/windows/devices/irq_chip.rs b/src/vmm/src/windows/devices/irq_chip.rs index a1d7726e9..c9b2b2cb4 100644 --- a/src/vmm/src/windows/devices/irq_chip.rs +++ b/src/vmm/src/windows/devices/irq_chip.rs @@ -12,10 +12,10 @@ use std::time::Instant; +use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; use super::ioapic::IoApic; use super::lapic::{IpiAction, LocalApic}; use super::pic::Pic; -use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; /// Result of an IrqChip MMIO write operation. #[derive(Debug)] @@ -56,7 +56,9 @@ impl Default for IrqChip { impl IrqChip { /// Create a new IrqChip in PIC mode (legacy boot) with N LAPICs. pub fn new(num_vcpus: u8) -> Self { - let lapics = (0..num_vcpus).map(|id| LocalApic::new_with_id(id)).collect(); + let lapics = (0..num_vcpus) + .map(|id| LocalApic::new_with_id(id)) + .collect(); Self { pic: Pic::new(), ioapic: IoApic::new(), @@ -117,7 +119,9 @@ impl IrqChip { /// Check if there are any pending interrupts for a specific vCPU. pub fn has_pending(&self, vcpu_id: u8) -> bool { if self.apic_mode { - self.lapics[vcpu_id as usize].get_highest_injectable().is_some() + self.lapics[vcpu_id as usize] + .get_highest_injectable() + .is_some() } else if vcpu_id == 0 { self.pic.has_pending() } else { @@ -199,7 +203,13 @@ impl IrqChip { /// /// Returns an `IrqChipWriteResult` indicating whether the address was handled /// and any IPI action from an ICR write. - pub fn handle_mmio_write(&mut self, vcpu_id: u8, addr: u64, _size: u8, data: u32) -> IrqChipWriteResult { + pub fn handle_mmio_write( + &mut self, + vcpu_id: u8, + addr: u64, + _size: u8, + data: u32, + ) -> IrqChipWriteResult { if addr >= IOAPIC_MMIO_BASE && addr < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { let offset = addr - IOAPIC_MMIO_BASE; self.ioapic.write_mmio(offset, data); @@ -256,9 +266,7 @@ impl IrqChip { } let any_lapic_enabled = self.lapics.iter().any(|l| l.is_enabled()); if any_lapic_enabled && self.ioapic.has_unmasked_entries() { - log::info!( - "APIC mode enabled — LAPIC active + IOAPIC has unmasked entries" - ); + log::info!("APIC mode enabled — LAPIC active + IOAPIC has unmasked entries"); self.apic_mode = true; } } @@ -414,9 +422,15 @@ mod tests { fn test_irq_chip_mmio_read_lapic_id_per_vcpu() { let chip = IrqChip::new(2); // vCPU 0 reads its own LAPIC ID. - assert_eq!(chip.handle_mmio_read(0, LAPIC_MMIO_BASE + 0x020, 4), Some(0 << 24)); + assert_eq!( + chip.handle_mmio_read(0, LAPIC_MMIO_BASE + 0x020, 4), + Some(0 << 24) + ); // vCPU 1 reads its own LAPIC ID. - assert_eq!(chip.handle_mmio_read(1, LAPIC_MMIO_BASE + 0x020, 4), Some(1 << 24)); + assert_eq!( + chip.handle_mmio_read(1, LAPIC_MMIO_BASE + 0x020, 4), + Some(1 << 24) + ); } #[test] diff --git a/src/vmm/src/windows/devices/lapic.rs b/src/vmm/src/windows/devices/lapic.rs index 8d09a0c76..e929b4a96 100644 --- a/src/vmm/src/windows/devices/lapic.rs +++ b/src/vmm/src/windows/devices/lapic.rs @@ -197,6 +197,26 @@ impl LocalApic { Some(highest) } + /// Compute the LAPIC timer current count register value. + /// + /// Returns the remaining count based on elapsed time since the timer was + /// armed. The kernel reads this during timer calibration and busy-waits. + fn current_count(&self) -> u32 { + if let Some(deadline) = self.timer_deadline { + let now = Instant::now(); + if now < deadline { + let remaining_ns = deadline.duration_since(now).as_nanos() as u64; + let tick_ns = 100 * self.timer_divisor as u64; + let remaining_ticks = remaining_ns / tick_ns; + (remaining_ticks as u32).min(self.timer_initial) + } else { + 0 + } + } else { + 0 + } + } + /// Tick the LAPIC timer. Returns the timer vector if it fired. pub fn tick_timer(&mut self, now: Instant) -> Option { if self.timer_masked || self.timer_initial == 0 { @@ -229,10 +249,10 @@ impl LocalApic { pub fn read_mmio(&self, offset: u64) -> u32 { match offset { 0x020 => (self.id as u32) << 24, // LAPIC ID - 0x030 => LAPIC_VERSION, // Version - 0x080 => self.tpr as u32, // TPR - 0x0B0 => 0, // EOI (write-only) - 0x0F0 => self.svr, // SVR + 0x030 => LAPIC_VERSION, // Version + 0x080 => self.tpr as u32, // TPR + 0x0B0 => 0, // EOI (write-only) + 0x0F0 => self.svr, // SVR // ISR: 0x100, 0x110, 0x120, ..., 0x170 0x100..=0x170 if offset & 0x0F == 0 => { let idx = ((offset - 0x100) / 0x10) as usize; @@ -255,7 +275,7 @@ impl LocalApic { 0x310 => self.icr_high, // ICR High 0x320 => self.read_lvt_timer(), // LVT Timer 0x380 => self.timer_initial, // Timer Initial Count - 0x390 => 0, // Timer Current Count (approximation) + 0x390 => self.current_count(), // Timer Current Count 0x3E0 => self.timer_divide_reg, // Timer Divide Configuration _ => 0, } @@ -337,7 +357,9 @@ impl LocalApic { // Fixed delivery. log::debug!( "LAPIC {} ICR: Fixed interrupt vector={:#X} → APIC {}", - self.id, vector, dest_apic_id + self.id, + vector, + dest_apic_id ); IpiAction::SendInterrupt { target_apic_id: dest_apic_id, @@ -346,10 +368,7 @@ impl LocalApic { } 0b101 => { // INIT delivery. - log::debug!( - "LAPIC {} ICR: INIT → APIC {}", - self.id, dest_apic_id - ); + log::debug!("LAPIC {} ICR: INIT → APIC {}", self.id, dest_apic_id); IpiAction::SendInit { target_apic_id: dest_apic_id, } @@ -358,7 +377,10 @@ impl LocalApic { // Startup IPI (SIPI). log::debug!( "LAPIC {} ICR: SIPI vector={:#X} → APIC {} (start at {:#X})", - self.id, vector, dest_apic_id, (vector as u32) * 0x1000 + self.id, + vector, + dest_apic_id, + (vector as u32) * 0x1000 ); IpiAction::SendSipi { target_apic_id: dest_apic_id, @@ -368,7 +390,9 @@ impl LocalApic { _ => { log::debug!( "LAPIC {} ICR: unsupported delivery mode {} → APIC {}", - self.id, delivery_mode, dest_apic_id + self.id, + delivery_mode, + dest_apic_id ); IpiAction::None } @@ -380,9 +404,7 @@ impl LocalApic { /// PPR = max(TPR, highest ISR priority class) — determines the minimum /// priority class that can be delivered. fn processor_priority(&self) -> u8 { - let isr_class = Self::highest_bit(&self.isr) - .map(|v| v & 0xF0) - .unwrap_or(0); + let isr_class = Self::highest_bit(&self.isr).map(|v| v & 0xF0).unwrap_or(0); let tpr_class = self.tpr & 0xF0; std::cmp::max(isr_class, tpr_class) } @@ -443,9 +465,9 @@ impl LocalApic { /// Write the Timer Divide Configuration register. fn write_divide_config(&mut self, value: u32) { self.timer_divide_reg = value & 0x0B; // Only bits 0,1,3 are used. - // Decode divisor: bits [3,1,0] encode the divisor. - // 0b000=2, 0b001=4, 0b010=8, 0b011=16, - // 0b100=32, 0b101=64, 0b110=128, 0b111=1 + // Decode divisor: bits [3,1,0] encode the divisor. + // 0b000=2, 0b001=4, 0b010=8, 0b011=16, + // 0b100=32, 0b101=64, 0b110=128, 0b111=1 let div_bits = ((value & 0x08) >> 1) | (value & 0x03); self.timer_divisor = match div_bits { 0b000 => 2, @@ -723,7 +745,7 @@ mod tests { assert_eq!(LocalApic::highest_bit(®), Some(255)); reg[3] = 1 << 16; // bit 112 - // Highest should still be 255. + // Highest should still be 255. assert_eq!(LocalApic::highest_bit(®), Some(255)); } diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 1936f5c39..43bad3af9 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -18,12 +18,14 @@ use super::irq_chip::IrqChip; use super::lapic::IpiAction; use super::pit::Pit; use super::serial::{Serial, COM1_BASE}; +use super::virtio::balloon::VirtioBalloon; use super::virtio::block::VirtioBlock; use super::virtio::disk::open_disk_backend; use super::virtio::mmio::VirtioMmioDevice; use super::virtio::net::VirtioNet; use super::virtio::p9::Virtio9p; use super::virtio::queue::GuestMemoryAccessor; +use super::virtio::rng::VirtioRng; use super::virtio::vsock::VirtioVsock; /// Shared console output buffer. @@ -215,6 +217,10 @@ pub struct DeviceManager { virtio_net: Option>, /// Second virtio-blk device (slot 4) — optional, for guest rootfs. virtio_blk2: Option>, + /// Virtio-rng device (slot 5) — always present. + virtio_rng: VirtioMmioDevice, + /// Virtio-balloon device (slot 6) — always present. + virtio_balloon: VirtioMmioDevice, /// Diagnostic: count QUEUE_NOTIFY writes to blk devices. blk_queue_notify_count: u64, @@ -340,6 +346,12 @@ impl DeviceManager { None }; + // Virtio-rng (slot 5) — always present. + let virtio_rng = VirtioMmioDevice::new(VirtioRng::new()); + + // Virtio-balloon (slot 6) — always present. + let virtio_balloon = VirtioMmioDevice::new(VirtioBalloon::new()); + // Build MMIO slots for kernel cmdline. let mmio_slots = vec![ MmioSlot { @@ -362,6 +374,14 @@ impl DeviceManager { index: 4, active: virtio_blk2.is_some(), }, + MmioSlot { + index: 5, + active: true, + }, // rng always active + MmioSlot { + index: 6, + active: true, + }, // balloon always active ]; let devices = DeviceManager { @@ -374,6 +394,8 @@ impl DeviceManager { virtio_9p, virtio_net, virtio_blk2, + virtio_rng, + virtio_balloon, blk_queue_notify_count: 0, blk_completion_count: 0, ioapic_mmio_count: 0, @@ -478,7 +500,9 @@ impl DeviceManager { pub fn handle_mmio_read(&mut self, vcpu_id: u8, address: u64, size: u8) -> u64 { // Check IOAPIC/LAPIC ranges first. if let Some(val) = self.irq_chip.handle_mmio_read(vcpu_id, address, size) { - use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; + use super::super::memory::{ + IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE, + }; if address >= IOAPIC_MMIO_BASE && address < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { self.ioapic_mmio_count += 1; } else if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { @@ -492,6 +516,8 @@ impl DeviceManager { let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); let net_offset = address.wrapping_sub(mmio_base_for_slot(3)); let blk2_offset = address.wrapping_sub(mmio_base_for_slot(4)); + let rng_offset = address.wrapping_sub(mmio_base_for_slot(5)); + let balloon_offset = address.wrapping_sub(mmio_base_for_slot(6)); if blk_offset < MMIO_SLOT_SIZE { if let Some(ref dev) = self.virtio_blk { @@ -519,6 +545,10 @@ impl DeviceManager { } else { 0 } + } else if rng_offset < MMIO_SLOT_SIZE { + self.virtio_rng.read(rng_offset, size) as u64 + } else if balloon_offset < MMIO_SLOT_SIZE { + self.virtio_balloon.read(balloon_offset, size) as u64 } else { 0 } @@ -537,9 +567,13 @@ impl DeviceManager { mem: &dyn GuestMemoryAccessor, ) -> IpiAction { // Check IOAPIC/LAPIC ranges first. - let result = self.irq_chip.handle_mmio_write(vcpu_id, address, size as u8, data as u32); + let result = self + .irq_chip + .handle_mmio_write(vcpu_id, address, size as u8, data as u32); if result.handled { - use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; + use super::super::memory::{ + IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE, + }; if address >= IOAPIC_MMIO_BASE && address < IOAPIC_MMIO_BASE + IOAPIC_MMIO_SIZE { self.ioapic_mmio_count += 1; } else if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { @@ -553,6 +587,8 @@ impl DeviceManager { let p9_offset = address.wrapping_sub(mmio_base_for_slot(2)); let net_offset = address.wrapping_sub(mmio_base_for_slot(3)); let blk2_offset = address.wrapping_sub(mmio_base_for_slot(4)); + let rng_offset = address.wrapping_sub(mmio_base_for_slot(5)); + let balloon_offset = address.wrapping_sub(mmio_base_for_slot(6)); if blk_offset < MMIO_SLOT_SIZE { if blk_offset == 0x050 { @@ -591,6 +627,17 @@ impl DeviceManager { self.irq_chip.raise_irq(irq_for_slot(4)); } } + } else if rng_offset < MMIO_SLOT_SIZE { + if self.virtio_rng.write(rng_offset, data as u32, size, mem) { + self.irq_chip.raise_irq(irq_for_slot(5)); + } + } else if balloon_offset < MMIO_SLOT_SIZE { + if self + .virtio_balloon + .write(balloon_offset, data as u32, size, mem) + { + self.irq_chip.raise_irq(irq_for_slot(6)); + } } IpiAction::None } @@ -637,8 +684,14 @@ impl DeviceManager { } } - // Tick LAPIC timer (only fires in APIC mode). - self.irq_chip.tick_timer(vcpu_id, now); + // Tick LAPIC timers for ALL vCPUs (only fires in APIC mode). + // Each AP's LAPIC timer must advance so the kernel scheduler can preempt + // tasks on all CPUs. Without this, AP LAPIC timer calibration hangs. + for i in 0..self.irq_chip.num_vcpus() { + self.irq_chip.tick_timer(i, now); + } + // Suppress unused variable — vcpu_id was the original single-vCPU target. + let _ = vcpu_id; // Drain async block I/O completions. if let Some(ref mut dev) = self.virtio_blk { @@ -741,6 +794,8 @@ pub fn device_manager_with_serial(serial: Serial) -> DeviceManager { virtio_9p: None, virtio_net: None, virtio_blk2: None, + virtio_rng: VirtioMmioDevice::new(VirtioRng::new()), + virtio_balloon: VirtioMmioDevice::new(VirtioBalloon::new()), blk_queue_notify_count: 0, blk_completion_count: 0, ioapic_mmio_count: 0, @@ -867,7 +922,11 @@ mod tests { // Year should be 2025–2099 in BCD (0x25..0x99). assert!(t.year >= 0x25, "year BCD too low: {:#04x}", t.year); // Month 1..12 in BCD (0x01..0x12). - assert!(t.month >= 0x01 && t.month <= 0x12, "month: {:#04x}", t.month); + assert!( + t.month >= 0x01 && t.month <= 0x12, + "month: {:#04x}", + t.month + ); // Day 1..31 in BCD. assert!(t.day_of_month >= 0x01 && t.day_of_month <= 0x31); // Hours 0..23 in BCD. @@ -1065,4 +1124,45 @@ mod tests { let mut dm = make_test_devices(); assert_eq!(dm.handle_io_in(0x604, 2), 0x00); } + + // --- virtio-rng (slot 5) --- + + #[test] + fn test_mmio_read_rng_magic() { + let dm = make_test_devices(); + let magic = dm.handle_mmio_read(0, mmio_base_for_slot(5), 4); + assert_eq!(magic, 0x7472_6976); // "virt" in LE. + } + + #[test] + fn test_mmio_read_rng_device_id() { + let dm = make_test_devices(); + let device_id = dm.handle_mmio_read(0, mmio_base_for_slot(5) + 0x008, 4); + assert_eq!(device_id, 4); // VIRTIO_ID_RNG + } + + // --- virtio-balloon (slot 6) --- + + #[test] + fn test_mmio_read_balloon_magic() { + let dm = make_test_devices(); + let magic = dm.handle_mmio_read(0, mmio_base_for_slot(6), 4); + assert_eq!(magic, 0x7472_6976); // "virt" in LE. + } + + #[test] + fn test_mmio_read_balloon_device_id() { + let dm = make_test_devices(); + let device_id = dm.handle_mmio_read(0, mmio_base_for_slot(6) + 0x008, 4); + assert_eq!(device_id, 5); // VIRTIO_ID_BALLOON + } + + #[test] + fn test_from_context_rng_and_balloon_always_active() { + let ctx = VmContext::default_for_test(); + let setup = DeviceManager::from_context(&ctx).unwrap(); + // Slots 5 (rng) and 6 (balloon) should always be active. + assert!(setup.mmio_slots[5].active, "rng slot should be active"); + assert!(setup.mmio_slots[6].active, "balloon slot should be active"); + } } diff --git a/src/vmm/src/windows/devices/pic.rs b/src/vmm/src/windows/devices/pic.rs index 834f2503a..8838848d1 100644 --- a/src/vmm/src/windows/devices/pic.rs +++ b/src/vmm/src/windows/devices/pic.rs @@ -424,7 +424,11 @@ mod tests { assert_eq!(vector, Some(0x21)); // 0x20 + 1 // IRQ 3 is blocked while IRQ 1 is in-service (lower priority). - assert_eq!(chip.pending_irq(), None, "IRQ 3 blocked while IRQ 1 in-service"); + assert_eq!( + chip.pending_irq(), + None, + "IRQ 3 blocked while IRQ 1 in-service" + ); // After EOI for IRQ 1, IRQ 3 becomes deliverable. chip.write_command(0x61); // Specific EOI for IRQ 1. @@ -444,7 +448,11 @@ mod tests { // With proper 8259A priority masking, IRQ 1 is blocked while // IRQ 0 is in-service (all equal-or-lower priority blocked). chip.raise_irq(1); - assert_eq!(chip.pending_irq(), None, "IRQ 1 must be blocked while IRQ 0 is in-service"); + assert_eq!( + chip.pending_irq(), + None, + "IRQ 1 must be blocked while IRQ 0 is in-service" + ); // After EOI for IRQ 0, IRQ 1 becomes deliverable. chip.write_command(0x60); // Specific EOI for IRQ 0. diff --git a/src/vmm/src/windows/devices/virtio/balloon.rs b/src/vmm/src/windows/devices/virtio/balloon.rs new file mode 100644 index 000000000..ec3aa1250 --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/balloon.rs @@ -0,0 +1,240 @@ +//! Virtio-balloon device (virtio spec v1.2 Section 5.5). +//! +//! Allows the host to request the guest to return or reclaim memory pages. +//! The guest driver inflates the balloon (returns pages) or deflates it +//! (reclaims pages) by sending page frame numbers on the respective queues. +//! +//! This implementation is protocol-only: inflate/deflate queues are processed +//! but no actual memory discard happens on the host side. Actual memory +//! reclamation would require extending `GuestMemoryAccessor` with a `discard()` +//! method (deferred to a future iteration). + +use super::mmio::VirtioDeviceBackend; +use super::queue::{GuestMemoryAccessor, Virtqueue}; + +/// Virtio device ID for balloon (spec 5.5). +const VIRTIO_ID_BALLOON: u32 = 5; + +/// VIRTIO_F_VERSION_1 — bit 32 (feature page 1, bit 0). +const VIRTIO_F_VERSION_1_PAGE1: u32 = 1; + +/// Maximum queue size for inflate/deflate queues. +const QUEUE_MAX_SIZE: u16 = 256; + +/// Inflate queue index (guest returns pages to host). +const INFLATE_QUEUE: u32 = 0; + +/// Deflate queue index (guest reclaims pages from host). +const DEFLATE_QUEUE: u32 = 1; + +/// Virtio-balloon backend. +/// +/// Config space layout (little-endian): +/// - offset 0: `num_pages` (u32) — target number of pages the balloon should hold. +/// - offset 4: `actual` (u32) — actual number of pages the balloon currently holds. +/// +/// The host sets `num_pages` to request inflation/deflation. +/// The guest writes `actual` to report current balloon size. +pub struct VirtioBalloon { + /// Target number of pages (set by host, read by guest). + num_pages: u32, + /// Actual number of pages (set by guest via config write). + actual: u32, +} + +impl VirtioBalloon { + pub fn new() -> Self { + VirtioBalloon { + num_pages: 0, + actual: 0, + } + } + + /// Set the target number of balloon pages (host API). + pub fn set_target_pages(&mut self, pages: u32) { + self.num_pages = pages; + } + + /// Write to config space at the given byte offset. + /// + /// The guest writes `actual` to offset 4 to report the current balloon size. + pub fn write_config(&mut self, offset: u64, value: u32) { + if offset == 4 { + self.actual = value; + } + // Writes to other offsets are silently ignored. + } +} + +impl VirtioDeviceBackend for VirtioBalloon { + fn device_id(&self) -> u32 { + VIRTIO_ID_BALLOON + } + + fn device_features(&self, page: u32) -> u32 { + match page { + 0 => 0, + 1 => VIRTIO_F_VERSION_1_PAGE1, + _ => 0, + } + } + + fn read_config(&self, offset: u64) -> u32 { + match offset { + 0 => self.num_pages, + 4 => self.actual, + _ => 0, + } + } + + fn write_config(&mut self, offset: u64, value: u32) { + // Delegate to the inherent method. + VirtioBalloon::write_config(self, offset, value); + } + + fn num_queues(&self) -> usize { + 2 // inflate + deflate + } + + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + QUEUE_MAX_SIZE + } + + fn queue_notify( + &mut self, + queue_idx: u32, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + let mut raised = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(e) => { + log::warn!("virtio-balloon: failed to read descriptor chain: {}", e); + break; + } + }; + + // Count page frame numbers (PFNs) in the chain. + // Each PFN is a u32 (4 bytes). The guest sends arrays of PFNs. + let mut pfn_count = 0u32; + for desc in &chain { + if desc.is_write() { + continue; // PFN buffers are device-readable. + } + pfn_count += desc.len / 4; + } + + match queue_idx { + INFLATE_QUEUE => { + // Guest is returning pages. In a full implementation, we would + // call madvise(MADV_DONTNEED) or equivalent on the host pages. + // For now, just track the count. + self.actual = self.actual.saturating_add(pfn_count); + log::trace!( + "virtio-balloon: inflate {} pages, actual={}", + pfn_count, + self.actual + ); + } + DEFLATE_QUEUE => { + // Guest is reclaiming pages. + self.actual = self.actual.saturating_sub(pfn_count); + log::trace!( + "virtio-balloon: deflate {} pages, actual={}", + pfn_count, + self.actual + ); + } + _ => {} + } + + if let Err(e) = queue.add_used(head, 0, mem) { + log::warn!("virtio-balloon: failed to add used buffer: {}", e); + break; + } + raised = true; + } + + raised + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_device_id() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.device_id(), 5); + } + + #[test] + fn test_num_queues() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.num_queues(), 2); + } + + #[test] + fn test_features_page0() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.device_features(0), 0); + } + + #[test] + fn test_features_page1_version_1() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.device_features(1), 1); // VIRTIO_F_VERSION_1 + } + + #[test] + fn test_features_page2_zero() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.device_features(2), 0); + } + + #[test] + fn test_config_defaults() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.read_config(0), 0); // num_pages + assert_eq!(balloon.read_config(4), 0); // actual + } + + #[test] + fn test_set_target_pages() { + let mut balloon = VirtioBalloon::new(); + balloon.set_target_pages(100); + assert_eq!(balloon.read_config(0), 100); + } + + #[test] + fn test_write_config_actual() { + let mut balloon = VirtioBalloon::new(); + balloon.write_config(4, 50); + assert_eq!(balloon.read_config(4), 50); + } + + #[test] + fn test_write_config_ignores_other_offsets() { + let mut balloon = VirtioBalloon::new(); + balloon.write_config(0, 999); // Should not change num_pages. + assert_eq!(balloon.read_config(0), 0); + } + + #[test] + fn test_read_config_unknown_offset() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.read_config(8), 0); + assert_eq!(balloon.read_config(12), 0); + } + + #[test] + fn test_queue_max_size() { + let balloon = VirtioBalloon::new(); + assert_eq!(balloon.queue_max_size(0), 256); + assert_eq!(balloon.queue_max_size(1), 256); + } +} diff --git a/src/vmm/src/windows/devices/virtio/block.rs b/src/vmm/src/windows/devices/virtio/block.rs index ad649cf71..31b5817ee 100644 --- a/src/vmm/src/windows/devices/virtio/block.rs +++ b/src/vmm/src/windows/devices/virtio/block.rs @@ -512,8 +512,7 @@ impl VirtioDeviceBackend for VirtioBlock { let _ = mem.write_at(status_desc.addr, &[status]); } - let total_written: u32 = - chain.iter().filter(|d| d.is_write()).map(|d| d.len).sum(); + let total_written: u32 = chain.iter().filter(|d| d.is_write()).map(|d| d.len).sum(); let _ = queue.add_used(head, total_written, mem); processed = true; } @@ -942,9 +941,24 @@ mod tests { mem.write_bytes(0x1000, &header); let chain = vec![ - Descriptor { addr: 0x1000, len: 16, flags: 0, next: 0 }, - Descriptor { addr: 0x2000, len: 512, flags: 2, next: 0 }, - Descriptor { addr: 0x3000, len: 1, flags: 2, next: 0 }, + Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }, + Descriptor { + addr: 0x2000, + len: 512, + flags: 2, + next: 0, + }, + Descriptor { + addr: 0x3000, + len: 1, + flags: 2, + next: 0, + }, ]; let req = VirtioBlock::parse_request(&chain, 10, &mem).unwrap(); @@ -959,9 +973,12 @@ mod tests { #[test] fn test_parse_request_short_chain_returns_none() { let mem = MockMem::new(0x10000); - let chain = vec![ - Descriptor { addr: 0x1000, len: 16, flags: 0, next: 0 }, - ]; + let chain = vec![Descriptor { + addr: 0x1000, + len: 16, + flags: 0, + next: 0, + }]; assert!(VirtioBlock::parse_request(&chain, 0, &mem).is_none()); } diff --git a/src/vmm/src/windows/devices/virtio/block_worker.rs b/src/vmm/src/windows/devices/virtio/block_worker.rs index 65ef5884a..30f46e536 100644 --- a/src/vmm/src/windows/devices/virtio/block_worker.rs +++ b/src/vmm/src/windows/devices/virtio/block_worker.rs @@ -256,7 +256,11 @@ impl BlockWorker { read_targets: vec![], }; } - if self.disk.write_at(offset, &write_data[data_offset..end]).is_err() { + if self + .disk + .write_at(offset, &write_data[data_offset..end]) + .is_err() + { return BlockCompletion { head_index: req.head_index, bytes_written: 0, @@ -338,7 +342,11 @@ mod tests { unsafe impl Send for MemDisk {} impl DiskBackend for MemDisk { - fn read_at(&mut self, offset: u64, buf: &mut [u8]) -> super::super::super::super::error::Result<()> { + fn read_at( + &mut self, + offset: u64, + buf: &mut [u8], + ) -> super::super::super::super::error::Result<()> { let start = offset as usize; let end = start + buf.len(); if end > self.data.len() { @@ -350,7 +358,11 @@ mod tests { Ok(()) } - fn write_at(&mut self, offset: u64, buf: &[u8]) -> super::super::super::super::error::Result<()> { + fn write_at( + &mut self, + offset: u64, + buf: &[u8], + ) -> super::super::super::super::error::Result<()> { if self.read_only { return Err(super::super::super::super::error::WkrunError::Device( "read-only disk".into(), diff --git a/src/vmm/src/windows/devices/virtio/mmio.rs b/src/vmm/src/windows/devices/virtio/mmio.rs index d30eaee01..c1cd8a4c3 100644 --- a/src/vmm/src/windows/devices/virtio/mmio.rs +++ b/src/vmm/src/windows/devices/virtio/mmio.rs @@ -72,6 +72,13 @@ pub trait VirtioDeviceBackend { /// Read a 32-bit value from the device config space at the given offset. fn read_config(&self, offset: u64) -> u32; + /// Write a 32-bit value to the device config space at the given offset. + /// + /// Default: no-op (most devices have read-only config space). + /// Devices with writable config fields (e.g., virtio-balloon `actual`) + /// should override this. + fn write_config(&mut self, _offset: u64, _value: u32) {} + /// Handle a queue notification (guest made buffers available). /// /// Returns `true` if the device processed buffers and an interrupt @@ -291,6 +298,10 @@ impl VirtioMmioDevice { QUEUE_USED_HIGH => { // High bits for used ring address (typically 0). } + off if off >= CONFIG_SPACE => { + let config_offset = off - CONFIG_SPACE; + self.backend.write_config(config_offset, value); + } _ => {} } false diff --git a/src/vmm/src/windows/devices/virtio/mod.rs b/src/vmm/src/windows/devices/virtio/mod.rs index 89c4a9e25..1a2a8d6b6 100644 --- a/src/vmm/src/windows/devices/virtio/mod.rs +++ b/src/vmm/src/windows/devices/virtio/mod.rs @@ -6,7 +6,10 @@ //! - virtio-vsock: socket transport (host TCP <-> guest AF_VSOCK) //! - virtio-9p: filesystem sharing (host directory <-> guest 9P mount) //! - virtio-net: network device (userspace proxy via passt/gvproxy) +//! - virtio-rng: entropy source (host OS random) +//! - virtio-balloon: dynamic memory management +pub mod balloon; pub mod block; pub mod block_worker; pub mod disk; @@ -14,4 +17,5 @@ pub mod mmio; pub mod net; pub mod p9; pub mod queue; +pub mod rng; pub mod vsock; diff --git a/src/vmm/src/windows/devices/virtio/rng.rs b/src/vmm/src/windows/devices/virtio/rng.rs new file mode 100644 index 000000000..e723895fa --- /dev/null +++ b/src/vmm/src/windows/devices/virtio/rng.rs @@ -0,0 +1,151 @@ +//! Virtio-rng device (virtio spec v1.2 Section 5.4). +//! +//! Provides entropy to the guest via `/dev/hwrng`. The guest driver +//! submits device-writable buffers; the device fills them with random +//! bytes and returns them on the used ring. + +use super::mmio::VirtioDeviceBackend; +use super::queue::{GuestMemoryAccessor, Virtqueue}; + +/// Virtio device ID for entropy source (spec 5.4). +const VIRTIO_ID_RNG: u32 = 4; + +/// VIRTIO_F_VERSION_1 — bit 32 (feature page 1, bit 0). +const VIRTIO_F_VERSION_1_PAGE1: u32 = 1; + +/// Maximum queue size for the request queue. +const QUEUE_MAX_SIZE: u16 = 256; + +/// Virtio-rng backend. +/// +/// Purely guest-initiated: the guest submits device-writable buffers, +/// the device fills them with random bytes from the host OS entropy pool. +/// No async worker or polling needed. +pub struct VirtioRng { + _priv: (), +} + +impl VirtioRng { + pub fn new() -> Self { + VirtioRng { _priv: () } + } +} + +impl VirtioDeviceBackend for VirtioRng { + fn device_id(&self) -> u32 { + VIRTIO_ID_RNG + } + + fn device_features(&self, page: u32) -> u32 { + match page { + 0 => 0, + 1 => VIRTIO_F_VERSION_1_PAGE1, + _ => 0, + } + } + + fn read_config(&self, _offset: u64) -> u32 { + 0 // No config space. + } + + fn num_queues(&self) -> usize { + 1 + } + + fn queue_max_size(&self, _queue_idx: u32) -> u16 { + QUEUE_MAX_SIZE + } + + fn queue_notify( + &mut self, + _queue_idx: u32, + queue: &mut Virtqueue, + mem: &dyn GuestMemoryAccessor, + ) -> bool { + let mut raised = false; + + while let Ok(Some(head)) = queue.pop_avail(mem) { + let chain = match queue.read_desc_chain(head, mem) { + Ok(c) => c, + Err(e) => { + log::warn!("virtio-rng: failed to read descriptor chain: {}", e); + break; + } + }; + + let mut total_written = 0u32; + for desc in &chain { + if !desc.is_write() { + continue; // Skip device-readable descriptors. + } + + // Fill with random bytes using ThreadRng (infallible, seeds from OS). + let len = desc.len as usize; + let mut buf = vec![0u8; len]; + rand::RngCore::fill_bytes(&mut rand::rng(), &mut buf); + + if let Err(e) = mem.write_at(desc.addr, &buf) { + log::warn!("virtio-rng: failed to write random bytes: {}", e); + break; + } + total_written += desc.len; + } + + if let Err(e) = queue.add_used(head, total_written, mem) { + log::warn!("virtio-rng: failed to add used buffer: {}", e); + break; + } + raised = true; + } + + raised + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_device_id() { + let rng = VirtioRng::new(); + assert_eq!(rng.device_id(), 4); + } + + #[test] + fn test_num_queues() { + let rng = VirtioRng::new(); + assert_eq!(rng.num_queues(), 1); + } + + #[test] + fn test_features_page0() { + let rng = VirtioRng::new(); + assert_eq!(rng.device_features(0), 0); + } + + #[test] + fn test_features_page1_version_1() { + let rng = VirtioRng::new(); + assert_eq!(rng.device_features(1), 1); // VIRTIO_F_VERSION_1 + } + + #[test] + fn test_features_page2_zero() { + let rng = VirtioRng::new(); + assert_eq!(rng.device_features(2), 0); + } + + #[test] + fn test_read_config_returns_zero() { + let rng = VirtioRng::new(); + assert_eq!(rng.read_config(0), 0); + assert_eq!(rng.read_config(4), 0); + } + + #[test] + fn test_queue_max_size() { + let rng = VirtioRng::new(); + assert_eq!(rng.queue_max_size(0), 256); + } +} diff --git a/src/vmm/src/windows/devices/virtio/vsock/connection.rs b/src/vmm/src/windows/devices/virtio/vsock/connection.rs index 5f22fe880..54fc06544 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/connection.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/connection.rs @@ -149,7 +149,9 @@ impl VsockConnection { self.state = ConnState::Connected; log::debug!( "vsock conn ({},{}) → Connected (guest REQUEST, buf_alloc={})", - self.local_port, self.peer_port, hdr.buf_alloc + self.local_port, + self.peer_port, + hdr.buf_alloc ); Some(VsockHeader::new_response( @@ -207,7 +209,10 @@ impl VsockConnection { if self.state != old_state { log::debug!( "vsock conn ({},{}) → {:?} (SHUTDOWN flags=0x{:x})", - self.local_port, self.peer_port, self.state, flags + self.local_port, + self.peer_port, + self.state, + flags ); } } @@ -216,7 +221,8 @@ impl VsockConnection { pub fn handle_rst(&mut self) { log::debug!( "vsock conn ({},{}) → Closed (RST)", - self.local_port, self.peer_port + self.local_port, + self.peer_port ); self.state = ConnState::Closed; } @@ -341,7 +347,9 @@ impl VsockConnection { self.state = ConnState::Connected; log::debug!( "vsock conn ({},{}) → Connected (guest RESPONSE, buf_alloc={})", - self.local_port, self.peer_port, hdr.buf_alloc + self.local_port, + self.peer_port, + hdr.buf_alloc ); } (None, None) diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index e0d1a1cdd..590b99df5 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -222,7 +222,11 @@ impl VirtioVsock { Err(e) => { log::warn!( "vsock write failed: {}/{} bytes, err={}, key=({},{})", - written, data.len(), e, key.0, key.1 + written, + data.len(), + e, + key.0, + key.1 ); break; } @@ -377,16 +381,10 @@ impl VirtioVsock { match listener.accept() { Ok((stream, addr)) => { if let Err(e) = stream.set_nonblocking(true) { - log::warn!( - "vsock set_nonblocking failed: {} (addr={:?})", - e, addr - ); + log::warn!("vsock set_nonblocking failed: {} (addr={:?})", e, addr); } if let Err(e) = stream.set_nodelay(true) { - log::warn!( - "vsock set_nodelay failed: {} (addr={:?})", - e, addr - ); + log::warn!("vsock set_nodelay failed: {} (addr={:?})", e, addr); } stream } @@ -406,7 +404,8 @@ impl VirtioVsock { if let Some(req) = conn.initiate_connect() { log::debug!( "host-initiated CONNECT: vsock_port={}, host_port={}, queuing REQUEST", - vsock_port, host_port + vsock_port, + host_port ); self.rx_pending.push((req, Vec::new())); self.connections.insert(key, conn); @@ -457,14 +456,15 @@ impl VirtioVsock { log::trace!("TCP read {} bytes, key=({},{})", n, key.0, key.1); Some(buf[..n].to_vec()) } - Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { - None - } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => None, Err(ref e) => { // I/O error on TCP stream. RST the vsock connection. log::warn!( "vsock TCP read error: {} (raw={:?}), key=({},{})", - e, e.raw_os_error(), key.0, key.1 + e, + e.raw_os_error(), + key.0, + key.1 ); if let Some(conn) = self.connections.get(&key) { let rst = conn.make_rst(); @@ -486,7 +486,11 @@ impl VirtioVsock { if enqueued < data.len() { log::debug!( "vsock enqueue_tx partial: {}/{} bytes, credit={}, key=({},{})", - enqueued, data.len(), conn.peer_credit(), key.0, key.1 + enqueued, + data.len(), + conn.peer_credit(), + key.0, + key.1 ); } } diff --git a/src/vmm/src/windows/memory.rs b/src/vmm/src/windows/memory.rs index 973e25f5b..3f682aced 100644 --- a/src/vmm/src/windows/memory.rs +++ b/src/vmm/src/windows/memory.rs @@ -217,27 +217,56 @@ mod imp { } impl GuestMemory { - /// Create guest memory, leaving a hole for the MMIO region if RAM exceeds it. + /// Create guest memory, leaving holes for device MMIO regions. /// - /// When `size_mib` is large enough that RAM overlaps `VIRTIO_MMIO_BASE`, - /// the memory is split into two regions with an unmapped gap so that WHPX - /// generates MMIO exits (instead of treating device accesses as RAM reads). + /// When guest RAM overlaps device MMIO addresses, the memory is split + /// into multiple regions with unmapped gaps so that WHPX generates MMIO + /// exits (instead of treating device accesses as RAM reads). + /// + /// Holes are created for: + /// - Virtio MMIO (0xD000_0000 .. 0xD020_0000) — virtio device registers + /// - APIC MMIO (0xFEC0_0000 .. 0xFEE0_1000) — IOAPIC + LAPIC registers pub fn new(size_mib: u32) -> Result { let size = (size_mib as u64) * 1024 * 1024; if size > super::VIRTIO_MMIO_BASE { - // RAM extends past MMIO region — split into two regions. - // Region 1: GPA 0 .. VIRTIO_MMIO_BASE - // (hole): VIRTIO_MMIO_BASE .. VIRTIO_MMIO_BASE + MMIO_REGION_SIZE - // Region 2: VIRTIO_MMIO_BASE + MMIO_REGION_SIZE .. ram_end let mmio_base = super::VIRTIO_MMIO_BASE; let mmio_end = mmio_base + super::MMIO_REGION_SIZE; let region1 = GuestMemoryRegion::new(0, mmio_base)?; - let region2 = GuestMemoryRegion::new(mmio_end, size - mmio_end)?; - Ok(GuestMemory { - regions: vec![region1, region2], - total_size: size, - }) + + // Check if RAM extends into the APIC MMIO region. + // IOAPIC at 0xFEC0_0000 and LAPIC at 0xFEE0_0000 must be + // unmapped so WHPX generates MMIO exits for APIC accesses. + let apic_start = super::IOAPIC_MMIO_BASE; + let apic_end = super::LAPIC_MMIO_BASE + super::LAPIC_MMIO_SIZE; + + if size > apic_start { + // RAM extends past APIC region — 3 regions with 2 holes. + // Region 1: 0 .. VIRTIO_MMIO_BASE + // (hole): VIRTIO MMIO + // Region 2: VIRTIO_MMIO_END .. IOAPIC_MMIO_BASE + // (hole): APIC MMIO (IOAPIC + LAPIC) + // Region 3: APIC_END .. ram_end + let region2 = GuestMemoryRegion::new(mmio_end, apic_start - mmio_end)?; + let mut regions = vec![region1, region2]; + + if size > apic_end { + let region3 = GuestMemoryRegion::new(apic_end, size - apic_end)?; + regions.push(region3); + } + + Ok(GuestMemory { + regions, + total_size: size, + }) + } else { + // RAM between VIRTIO and APIC — 2 regions with 1 hole. + let region2 = GuestMemoryRegion::new(mmio_end, size - mmio_end)?; + Ok(GuestMemory { + regions: vec![region1, region2], + total_size: size, + }) + } } else { // RAM fits below MMIO — single contiguous region. let region = GuestMemoryRegion::new(0, size)?; diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 723b18e5d..a7daa438a 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -308,7 +308,8 @@ mod imp { *canceller_slot.lock().unwrap() = Some(cancellers[0].clone()); // Create per-AP startup state (one per AP, indexed by ap_id - 1). - let ap_states: Vec = (1..num_vcpus).map(|_| ApStartupState::new()).collect(); + let ap_states: Vec = + (1..num_vcpus).map(|_| ApStartupState::new()).collect(); // Shared VM shutdown flag — set by any vCPU to signal all others to exit. let shutdown = Arc::new(AtomicBool::new(false)); @@ -317,11 +318,7 @@ mod imp { // Move diag_log into shared state for BSP diagnostics. let diag_log = Arc::new(Mutex::new(diag_log)); - log::info!( - "Starting VM with {} vCPU(s), ctx_id={}", - num_vcpus, - ctx_id, - ); + log::info!("Starting VM with {} vCPU(s), ctx_id={}", num_vcpus, ctx_id,); eprintln!("[WHPX] Starting {} vCPU(s)", num_vcpus); let mut exit_code = 1i32; @@ -368,6 +365,7 @@ mod imp { cancellers_ref, &ap_states_ref[ap_idx - 1], ctx_id, + diag_ref, ); }); } @@ -495,7 +493,18 @@ mod imp { devices: &mut DeviceManager, ap_states: &[ApStartupState], cancellers: &[VcpuCanceller], + diag_log: &Arc>>, ) { + macro_rules! ipi_diag { + ($($arg:tt)*) => { + if let Ok(mut guard) = diag_log.lock() { + if let Some(ref mut f) = *guard { + let _ = writeln!(f, $($arg)*); + let _ = f.flush(); + } + } + }; + } match action { IpiAction::None => {} IpiAction::SendInit { target_apic_id } => { @@ -504,7 +513,13 @@ mod imp { ap_states[ap_idx - 1] .init_received .store(true, Ordering::Release); - log::info!("INIT delivered to AP{}", target_apic_id); + ipi_diag!("IPI: INIT delivered to AP{}", target_apic_id); + } else { + ipi_diag!( + "IPI: INIT target AP{} out of range (max={})", + target_apic_id, + ap_states.len() + ); } } IpiAction::SendSipi { @@ -518,14 +533,15 @@ mod imp { *state.sipi_vector.lock().unwrap() = Some(vector); *state.started.lock().unwrap() = true; state.condvar.notify_one(); - log::info!( - "SIPI delivered to AP{}: start at {:#X}", + ipi_diag!( + "IPI: SIPI delivered to AP{}, vector={:#X}, start_addr={:#X}", target_apic_id, + vector, (vector as u64) * 0x1000 ); } else { - log::warn!( - "SIPI to AP{} ignored (no INIT received)", + ipi_diag!( + "IPI: SIPI to AP{} IGNORED (no INIT received)", target_apic_id, ); } @@ -542,8 +558,8 @@ mod imp { if idx < cancellers.len() { let _ = cancellers[idx].cancel(); } - log::debug!( - "IPI interrupt vector {:#X} delivered to vCPU{}", + ipi_diag!( + "IPI: interrupt vector={:#X} → vCPU{}", vector, target_apic_id, ); @@ -603,8 +619,15 @@ mod imp { let exit = match vcpu.run() { Ok(exit) => exit, Err(e) => { - log::error!("BSP vcpu.run() FAILED after {} exits: {:?}", stats.exit_count, e); - eprintln!("[WHPX] BSP vcpu.run() FAILED after {} exits: {:?}", stats.exit_count, e); + log::error!( + "BSP vcpu.run() FAILED after {} exits: {:?}", + stats.exit_count, + e + ); + eprintln!( + "[WHPX] BSP vcpu.run() FAILED after {} exits: {:?}", + stats.exit_count, e + ); return 1; } }; @@ -669,9 +692,26 @@ mod imp { diag!("{}", msg); } let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); + // Log LAPIC ICR writes for diagnostics. + if address >= crate::windows::memory::LAPIC_MMIO_BASE { + let offset = address - crate::windows::memory::LAPIC_MMIO_BASE; + if offset == 0x300 || offset == 0x310 { + diag!( + "LAPIC ICR write: offset={:#X} data={:#X}", + offset, + data + ); + } + } // Dispatch IPI if this was an ICR write. if !matches!(ipi_action, IpiAction::None) { - dispatch_ipi(ipi_action, &mut dm, ap_states, cancellers); + dispatch_ipi( + ipi_action, + &mut dm, + ap_states, + cancellers, + diag_log, + ); } drop(dm); if let Err(e) = vcpu.skip_instruction() { @@ -695,8 +735,7 @@ mod imp { let mut dm = devices.lock().unwrap(); dm.tick_and_poll(0, guest_mem); if dm.irq_chip.has_pending(0) { - let already_pending = - vcpu.has_pending_interruption().unwrap_or(false); + let already_pending = vcpu.has_pending_interruption().unwrap_or(false); if !already_pending { if let Some(vector) = dm.irq_chip.acknowledge(0) { let _ = vcpu.inject_interrupt(vector); @@ -870,7 +909,11 @@ mod imp { return -1; } VcpuExit::Unknown(reason) => { - log::error!("BSP: Unknown exit reason {} after {} exits", reason, stats.exit_count); + log::error!( + "BSP: Unknown exit reason {} after {} exits", + reason, + stats.exit_count + ); return -1; } } @@ -898,7 +941,21 @@ mod imp { cancellers: &[VcpuCanceller], startup: &ApStartupState, _ctx_id: u32, + diag_log: &Arc>>, ) { + macro_rules! diag { + ($($arg:tt)*) => { + if let Ok(mut guard) = diag_log.lock() { + if let Some(ref mut f) = *guard { + let _ = writeln!(f, $($arg)*); + let _ = f.flush(); + } + } + }; + } + + diag!("AP{}: thread started, waiting for SIPI", ap_id); + // Wait for SIPI from BSP. { let mut started = startup.started.lock().unwrap(); @@ -909,25 +966,41 @@ mod imp { // Check if we were woken for shutdown rather than SIPI. if shutdown.load(Ordering::Relaxed) { - log::info!("AP{}: woken for shutdown, not SIPI", ap_id); + diag!("AP{}: woken for shutdown, not SIPI", ap_id); return; } // Configure AP initial register state from SIPI vector. let sipi_vector = startup.sipi_vector.lock().unwrap().unwrap_or(0); - log::info!( - "AP{}: SIPI received, starting at vector={:#X} ({:#X})", + diag!( + "AP{}: SIPI received, vector={:#X}, CS:IP={:#X}:0000", ap_id, sipi_vector, (sipi_vector as u64) * 0x1000 ); + // Dump WHPX default registers before modification for diagnostics. + if let Ok(sregs) = vcpu.get_special_registers() { + diag!( + "AP{}: WHPX defaults TR=sel:{:#X}/base:{:#X}/lim:{:#X}/ar:{:#X} \ + LDT=sel:{:#X}/base:{:#X}/lim:{:#X}/ar:{:#X} \ + GDT=base:{:#X}/lim:{:#X} IDT=base:{:#X}/lim:{:#X} \ + CR0={:#X} CR4={:#X} EFER={:#X}", + ap_id, + sregs.tr.selector, sregs.tr.base, sregs.tr.limit, sregs.tr.access_rights, + sregs.ldt.selector, sregs.ldt.base, sregs.ldt.limit, sregs.ldt.access_rights, + sregs.gdt.base, sregs.gdt.limit, sregs.idt.base, sregs.idt.limit, + sregs.cr0, sregs.cr4, sregs.efer + ); + } + // AP starts in real mode: CS:IP = (sipi_vector * 0x100):0x0000 // The Linux kernel SMP trampoline is placed at sipi_vector * 0x1000. if let Err(e) = vcpu.set_ap_initial_regs(sipi_vector, ap_id) { - log::error!("AP{}: failed to set initial registers: {:?}", ap_id, e); + diag!("AP{}: FAILED to set initial registers: {:?}", ap_id, e); return; } + diag!("AP{}: initial regs set, entering run loop", ap_id); let mut stats = VcpuStats::new(); @@ -948,7 +1021,7 @@ mod imp { let exit = match vcpu.run() { Ok(exit) => exit, Err(e) => { - log::error!( + diag!( "AP{}: vcpu.run() FAILED after {} exits: {:?}", ap_id, stats.exit_count, @@ -959,6 +1032,29 @@ mod imp { }; stats.exit_count += 1; + // Log first few AP exits for diagnostics. + if stats.exit_count <= 10 { + let desc = match &exit { + VcpuExit::IoOut { port, .. } => format!("IoOut(port={:#X})", port), + VcpuExit::IoIn { port, .. } => format!("IoIn(port={:#X})", port), + VcpuExit::MmioRead { address, .. } => format!("MmioRead({:#X})", address), + VcpuExit::MmioWrite { address, .. } => format!("MmioWrite({:#X})", address), + VcpuExit::Halt => "Halt".into(), + VcpuExit::Cancelled => "Cancelled".into(), + VcpuExit::InterruptWindow => "InterruptWindow".into(), + VcpuExit::Shutdown => "Shutdown".into(), + VcpuExit::UnrecoverableException => "UnrecoverableException".into(), + VcpuExit::MsrAccess { + msr_number, + is_write, + .. + } => format!("MSR({:#X}, write={})", msr_number, is_write), + VcpuExit::CpuidAccess { rax, .. } => format!("CPUID({:#X})", rax), + VcpuExit::Unknown(r) => format!("Unknown({})", r), + }; + diag!("AP{}: exit #{} = {}", ap_id, stats.exit_count, desc); + } + match exit { VcpuExit::IoOut { port, size, data } => { stats.halt_count = 0; @@ -985,7 +1081,10 @@ mod imp { VcpuExit::MmioRead { address, size } => { stats.halt_count = 0; stats.mmio_count += 1; - let data = devices.lock().unwrap().handle_mmio_read(ap_id, address, size); + let data = devices + .lock() + .unwrap() + .handle_mmio_read(ap_id, address, size); let _ = vcpu.complete_mmio_read(data); } VcpuExit::MmioWrite { @@ -999,7 +1098,7 @@ mod imp { let ipi_action = dm.handle_mmio_write(ap_id, address, size, data, guest_mem); if !matches!(ipi_action, IpiAction::None) { // APs can send IPIs too (e.g., IPI to BSP for TLB shootdown). - dispatch_ipi(ipi_action, &mut dm, &[], cancellers); + dispatch_ipi(ipi_action, &mut dm, &[], cancellers, diag_log); } drop(dm); let _ = vcpu.skip_instruction(); @@ -1019,8 +1118,7 @@ mod imp { { let mut dm = devices.lock().unwrap(); if dm.irq_chip.has_pending(ap_id) { - let already_pending = - vcpu.has_pending_interruption().unwrap_or(false); + let already_pending = vcpu.has_pending_interruption().unwrap_or(false); if !already_pending { if let Some(vector) = dm.irq_chip.acknowledge(ap_id) { let _ = vcpu.inject_interrupt(vector); @@ -1122,19 +1220,23 @@ mod imp { } } VcpuExit::UnrecoverableException => { - log::error!( - "AP{}: triple fault after {} exits", + let regs = vcpu.get_registers().ok(); + let sregs = vcpu.get_special_registers().ok(); + diag!( + "AP{}: TRIPLE FAULT after {} exits, RIP={:#X}, CR0={:#X}, CR3={:#X}, EFER={:#X}", ap_id, - stats.exit_count + stats.exit_count, + regs.as_ref().map_or(0, |r| r.rip), + sregs.as_ref().map_or(0, |s| s.cr0), + sregs.as_ref().map_or(0, |s| s.cr3), + sregs.as_ref().map_or(0, |s| s.efer), ); return; } VcpuExit::Unknown(reason) => { - log::error!( + diag!( "AP{}: unknown exit reason {} after {} exits", - ap_id, - reason, - stats.exit_count + ap_id, reason, stats.exit_count ); return; } @@ -1469,7 +1571,11 @@ mod tests { super::handle_cpuid(0, 2, 1, 0x1234, 0x0000_0000_0000_5678, 0x8000_0001, 0xABCD); // EBX[23:16] = num_vcpus = 2, EBX[31:24] = vcpu_id = 0 assert_eq!(rbx & 0x00FF_0000, 0x0002_0000, "EBX[23:16] should be 2"); - assert_eq!(rbx & 0xFF00_0000, 0x0000_0000, "EBX[31:24] should be 0 for BSP"); + assert_eq!( + rbx & 0xFF00_0000, + 0x0000_0000, + "EBX[31:24] should be 0 for BSP" + ); // EBX[15:0] preserved from default assert_eq!(rbx & 0xFFFF, 0x5678, "EBX[15:0] should be preserved"); // ECX bit 31 (hypervisor present) must be cleared @@ -1482,18 +1588,9 @@ mod tests { #[test] fn test_cpuid_leaf1_topology_ap() { // AP (vcpu 3) with 4 vCPUs. - let (_, rbx, _, _) = - super::handle_cpuid(3, 4, 1, 0, 0, 0, 0); - assert_eq!( - (rbx >> 16) & 0xFF, - 4, - "EBX[23:16] should be num_vcpus=4" - ); - assert_eq!( - (rbx >> 24) & 0xFF, - 3, - "EBX[31:24] should be vcpu_id=3" - ); + let (_, rbx, _, _) = super::handle_cpuid(3, 4, 1, 0, 0, 0, 0); + assert_eq!((rbx >> 16) & 0xFF, 4, "EBX[23:16] should be num_vcpus=4"); + assert_eq!((rbx >> 24) & 0xFF, 3, "EBX[31:24] should be vcpu_id=3"); } #[test] @@ -1502,19 +1599,22 @@ mod tests { for leaf in [0x40000000u32, 0x40000001, 0x400000FF] { let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 1, leaf, 0xDEAD, 0xBEEF, 0xCAFE, 0xF00D); - assert_eq!((rax, rbx, rcx, rdx), (0, 0, 0, 0), "Hyper-V leaf 0x{:X} must be zeroed", leaf); + assert_eq!( + (rax, rbx, rcx, rdx), + (0, 0, 0, 0), + "Hyper-V leaf 0x{:X} must be zeroed", + leaf + ); } } #[test] fn test_cpuid_passthrough_other_leaves() { // Non-special leaves should pass through defaults unchanged. - let (rax, rbx, rcx, rdx) = - super::handle_cpuid(0, 2, 0, 0x1111, 0x2222, 0x3333, 0x4444); + let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 2, 0, 0x1111, 0x2222, 0x3333, 0x4444); assert_eq!((rax, rbx, rcx, rdx), (0x1111, 0x2222, 0x3333, 0x4444)); - let (rax, rbx, rcx, rdx) = - super::handle_cpuid(0, 2, 7, 0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD); + let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 2, 7, 0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD); assert_eq!((rax, rbx, rcx, rdx), (0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD)); } diff --git a/src/vmm/src/windows/whpx.rs b/src/vmm/src/windows/whpx.rs index 7e9a7901f..d4448655e 100644 --- a/src/vmm/src/windows/whpx.rs +++ b/src/vmm/src/windows/whpx.rs @@ -615,11 +615,7 @@ mod imp { let byte_count = mem_ctx.InstructionByteCount as usize; let insn_bytes = &mem_ctx.InstructionBytes[..byte_count.min(16)]; let regs = self.get_registers().map_err(|e| { - log::error!( - "MMIO get_registers FAILED at GPA 0x{:x}: {:?}", - address, - e - ); + log::error!("MMIO get_registers FAILED at GPA 0x{:x}: {:?}", address, e); e })?; let insn = match super::super::insn::decode_mmio_insn(insn_bytes, ®s) { @@ -1036,56 +1032,41 @@ mod imp { /// - DL = APIC ID (Linux convention for AP identification) /// - All other regs = 0 / default real mode values pub fn set_ap_initial_regs(&self, sipi_vector: u8, apic_id: u8) -> Result<()> { - use super::super::types::{SegmentRegister, SpecialRegisters, StandardRegisters}; + use super::super::types::{SegmentRegister, StandardRegisters}; let cs_base = (sipi_vector as u64) * 0x1000; let cs_selector = (sipi_vector as u16) * 0x100; let regs = StandardRegisters { rdx: apic_id as u64, // Linux uses DL for APIC ID on AP startup + rflags: 0x2, // x86 requires RFLAGS bit 1 always set ..Default::default() }; - let sregs = SpecialRegisters { - cs: SegmentRegister { - base: cs_base, - limit: 0xFFFF, - selector: cs_selector, - access_rights: 0x9B, // present, code, readable, accessed - }, - ds: SegmentRegister { - base: 0, - limit: 0xFFFF, - selector: 0, - access_rights: 0x93, // present, data, writable, accessed - }, - es: SegmentRegister { - base: 0, - limit: 0xFFFF, - selector: 0, - access_rights: 0x93, - }, - fs: SegmentRegister { - base: 0, - limit: 0xFFFF, - selector: 0, - access_rights: 0x93, - }, - gs: SegmentRegister { - base: 0, - limit: 0xFFFF, - selector: 0, - access_rights: 0x93, - }, - ss: SegmentRegister { - base: 0, - limit: 0xFFFF, - selector: 0, - access_rights: 0x93, - }, - cr0: 0x10, // ET (Extension Type) — required for real mode on x86 - ..Default::default() + // Read existing special registers to preserve WHPX defaults for TR, LDT, + // GDT, IDT. WHPX requires valid access_rights for these even in real mode; + // overwriting them with zeros causes WHvRunVpExitReasonInvalidVpRegisterValue + // (exit reason 5). + let mut sregs = self.get_special_registers()?; + + sregs.cs = SegmentRegister { + base: cs_base, + limit: 0xFFFF, + selector: cs_selector, + access_rights: 0x9B, // present, code, readable, accessed + }; + let data_seg = SegmentRegister { + base: 0, + limit: 0xFFFF, + selector: 0, + access_rights: 0x93, // present, data, writable, accessed }; + sregs.ds = data_seg; + sregs.es = data_seg; + sregs.fs = data_seg; + sregs.gs = data_seg; + sregs.ss = data_seg; + sregs.cr0 = 0x10; // ET (Extension Type) — required for real mode on x86 self.set_registers(®s)?; self.set_special_registers(&sregs)?; From 0affe973f77f47e380a2ee65a8d1cb70a8f06f9b Mon Sep 17 00:00:00 2001 From: lile Date: Thu, 30 Apr 2026 10:14:17 +0800 Subject: [PATCH 23/33] feat(windows): HLT tiered sleep, structured logging, LAPIC timer throttle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Performance + production polish for Windows WHPX VMM: - HLT sleep: replace flat 1ms sleep with tiered strategy (50 spin-yield iterations then 200µs sleep), reducing interrupt delivery latency ~5x - Diagnostic logging: replace all eprintln! with structured log::info!/ log::warn!/log::debug! using "whpx::diag" target, respecting RUST_LOG - Diagnostic file: clean up temp whpx-diag.log on normal VM exit - IPI diagnostics: add elapsed timestamps to IPI dispatch log entries - LAPIC timer throttle: skip LAPIC timer ticks when <500µs elapsed, reducing wasted CPU (PIT timer still fires every tick) Warm exec improved: Win11 14→5.5ms (-61%), Win10 45→33ms (-27%). vm-bench 8/8 PASS, net-test 8/8 PASS on both platforms. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/manager.rs | 8 +- src/vmm/src/windows/runner.rs | 136 ++++++++++++++----------- 2 files changed, 80 insertions(+), 64 deletions(-) diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 43bad3af9..cd53fae34 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -687,8 +687,12 @@ impl DeviceManager { // Tick LAPIC timers for ALL vCPUs (only fires in APIC mode). // Each AP's LAPIC timer must advance so the kernel scheduler can preempt // tasks on all CPUs. Without this, AP LAPIC timer calibration hangs. - for i in 0..self.irq_chip.num_vcpus() { - self.irq_chip.tick_timer(i, now); + // Throttle: LAPIC timers fire at ~100Hz (10ms period), so checking more + // than every 500µs wastes CPU. PIT timer (IRQ 0) still fires every tick. + if elapsed_ns > 500_000 { + for i in 0..self.irq_chip.num_vcpus() { + self.irq_chip.tick_timer(i, now); + } } // Suppress unused variable — vcpu_id was the original single-vCPU target. let _ = vcpu_id; diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index a7daa438a..e4bce538f 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -10,7 +10,7 @@ #[cfg(target_os = "windows")] mod imp { use std::collections::HashMap; - use std::io::Write as IoWrite; + use std::io::Write; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Condvar, Mutex}; use std::time::{Duration, Instant}; @@ -55,6 +55,13 @@ mod imp { /// NOT indicate the VM is stuck. const MAX_HALTS: u64 = 50_000; + /// Number of spin-yield iterations before sleeping on HLT. + /// ~50µs of yielding to catch imminent timer interrupts. + const HLT_SPIN_ITERS: u32 = 50; + + /// Short sleep duration (µs) after spin phase completes without interrupt. + const HLT_SLEEP_US: u64 = 200; + /// Per-AP (Application Processor) startup state. /// /// Each AP thread waits on its condvar until the BSP delivers an @@ -276,15 +283,6 @@ mod imp { regs.rip, cmdline ); - // Also emit to stderr so shim can capture it (log::* may be silently dropped - // when no log→tracing bridge is installed). - eprintln!( - "[WHPX] Kernel loaded, RIP={:#X}, ram={}MB, mmio_hole={}, cmdline_len={}", - regs.rip, - ctx.ram_mib, - ctx.ram_mib as u64 * 1024 * 1024 > crate::windows::memory::VIRTIO_MMIO_BASE, - cmdline.len() - ); diag!("Kernel loaded, RIP={:#X}, ram={}MB", regs.rip, ctx.ram_mib); // Create all vCPUs. BSP (index 0) gets the boot registers. @@ -318,8 +316,7 @@ mod imp { // Move diag_log into shared state for BSP diagnostics. let diag_log = Arc::new(Mutex::new(diag_log)); - log::info!("Starting VM with {} vCPU(s), ctx_id={}", num_vcpus, ctx_id,); - eprintln!("[WHPX] Starting {} vCPU(s)", num_vcpus); + log::info!("Starting VM with {} vCPU(s), ctx_id={}", num_vcpus, ctx_id); let mut exit_code = 1i32; @@ -405,7 +402,11 @@ mod imp { } log::info!("VM exited with code {}", exit_code); - eprintln!("[WHPX] VM exited, code={}", exit_code); + + // Clean up diagnostic file on normal exit. + // Drop the file handle first, then remove the temp file. + drop(diag_log); + let _ = std::fs::remove_file(&diag_path); Ok(exit_code) } @@ -494,11 +495,13 @@ mod imp { ap_states: &[ApStartupState], cancellers: &[VcpuCanceller], diag_log: &Arc>>, + start_time: Instant, ) { macro_rules! ipi_diag { ($($arg:tt)*) => { if let Ok(mut guard) = diag_log.lock() { if let Some(ref mut f) = *guard { + let _ = write!(f, "[{:.3}s] ", start_time.elapsed().as_secs_f64()); let _ = writeln!(f, $($arg)*); let _ = f.flush(); } @@ -624,10 +627,6 @@ mod imp { stats.exit_count, e ); - eprintln!( - "[WHPX] BSP vcpu.run() FAILED after {} exits: {:?}", - stats.exit_count, e - ); return 1; } }; @@ -682,14 +681,13 @@ mod imp { if !blk_workers_started && !sync_block { dm.start_blk_workers(); blk_workers_started = true; - let msg = format!( + log::info!( + target: "whpx::diag", "Block workers started at exit={} mmio={} elapsed={:.1}ms", stats.exit_count, stats.mmio_count, stats.start_time.elapsed().as_secs_f64() * 1000.0 ); - eprintln!("[WHPX] {}", msg); - diag!("{}", msg); } let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); // Log LAPIC ICR writes for diagnostics. @@ -711,6 +709,7 @@ mod imp { ap_states, cancellers, diag_log, + stats.start_time, ); } drop(dm); @@ -758,28 +757,18 @@ mod imp { .map(|b| b.len()) .unwrap_or(0); let if_flag = vcpu.interrupts_enabled().unwrap_or(false); - eprintln!( - "[WHPX] BSP HLT stuck: consecutive={} total_halt={} halt_with_irq={} \ - exits={} RIP={:#X} RFLAGS={:#X} IF={} console={}B mmio={} vcpus={}", + log::warn!( + target: "whpx::diag", + "BSP HLT stuck: consecutive={} total_halt={} halt_w_irq={} \ + exits={} RIP={:#X} IF={} console={}B mmio={} vcpus={}", stats.halt_count, stats.total_halt_exits, stats.halt_with_irq, - stats.exit_count, regs.rip, regs.rflags, + stats.exit_count, regs.rip, if_flag, console_len, stats.mmio_count, num_vcpus ); } } if stats.halt_count > MAX_HALTS { - if let Ok(regs) = vcpu.get_registers() { - let console_len = devices::get_console_output(ctx_id) - .map(|b| b.len()) - .unwrap_or(0); - eprintln!( - "[WHPX] BSP HALT_MAX: consecutive={} total_halt={} halt_with_irq={} \ - exits={} RIP={:#X} console={}B mmio={}", - stats.halt_count, stats.total_halt_exits, stats.halt_with_irq, - stats.exit_count, regs.rip, console_len, stats.mmio_count - ); - } log::warn!( "BSP halted {} times consecutively after {} exits", stats.halt_count, @@ -788,7 +777,24 @@ mod imp { _last_exit_reason = "HALT_MAX_REACHED"; return 0; } - std::thread::sleep(Duration::from_millis(1)); + + // Tiered sleep: spin-yield phase to catch imminent interrupts, + // then short sleep if no interrupt arrived. + let mut woke_by_irq = false; + for i in 0..HLT_SPIN_ITERS { + std::thread::yield_now(); + if i % 10 == 9 { + let mut dm = devices.lock().unwrap(); + dm.tick_and_poll(0, guest_mem); + if dm.irq_chip.has_pending(0) { + woke_by_irq = true; + break; + } + } + } + if !woke_by_irq { + std::thread::sleep(Duration::from_micros(HLT_SLEEP_US)); + } } VcpuExit::Shutdown => { log::info!("BSP: VM shutdown after {} exits", stats.exit_count); @@ -807,29 +813,24 @@ mod imp { .map(|b| b.len()) .unwrap_or(0); let (qn, bc) = dm.blk_stats(); - let (ioapic_mmio, lapic_mmio) = dm.apic_mmio_stats(); - let (irr, isr, imr, vbase) = dm.irq_chip.pic_master_state(); - let (s_irr, s_isr, s_imr, s_vbase) = dm.irq_chip.pic_slave_state(); let apic_mode = dm.irq_chip.apic_mode(); - let msg = format!( - "BSP @ {:.1}s: exits={} RIP={:#X} console={}B io_out={} serial={} mmio={} blk_qn={} blk_comp={} halt={}/{} halt_w_irq={} inj={} ioapic_mmio={} lapic_mmio={} pic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} spic=irr:{:02X}/isr:{:02X}/imr:{:02X}/vb:{:02X} irq={} mode={} vcpus={}", + let blk_mode = if sync_block { "sync" } else if blk_workers_started { "async" } else { "pending" }; + drop(dm); + log::info!( + target: "whpx::diag", + "vCPU0 @ {:.1}s: exits={} RIP={:#X} console={}B mmio={} halt={}/{} inj={} blk_comp={} mode={}/{}", stats.start_time.elapsed().as_secs_f64(), stats.exit_count, regs.rip, console_len, - stats.io_out_count, stats.serial_out_count, - stats.mmio_count, qn, bc, - stats.halt_count, stats.total_halt_exits, - stats.halt_with_irq, stats.inject_count, - ioapic_mmio, lapic_mmio, - irr, isr, imr, vbase, - s_irr, s_isr, s_imr, s_vbase, - if apic_mode { "apic" } else { "pic" }, - if sync_block { "sync" } else if blk_workers_started { "async" } else { "pending" }, - num_vcpus, + stats.mmio_count, stats.halt_count, stats.total_halt_exits, + stats.inject_count, bc, + if apic_mode { "apic" } else { "pic" }, blk_mode, + ); + log::debug!( + target: "whpx::diag", + "vCPU0 detail: io_out={} serial={} blk_qn={} halt_w_irq={} vcpus={}", + stats.io_out_count, stats.serial_out_count, qn, + stats.halt_with_irq, num_vcpus, ); - drop(dm); - log::info!("{}", msg); - eprintln!("[WHPX] {}", msg); - diag!("{}", msg); } } } @@ -901,11 +902,6 @@ mod imp { sregs.as_ref().map_or(0, |s| s.cr4), sregs.as_ref().map_or(0, |s| s.efer), ); - eprintln!( - "[WHPX] BSP TRIPLE FAULT after {} exits, RIP={:#X}", - stats.exit_count, - regs.as_ref().map_or(0, |r| r.rip), - ); return -1; } VcpuExit::Unknown(reason) => { @@ -1098,7 +1094,7 @@ mod imp { let ipi_action = dm.handle_mmio_write(ap_id, address, size, data, guest_mem); if !matches!(ipi_action, IpiAction::None) { // APs can send IPIs too (e.g., IPI to BSP for TLB shootdown). - dispatch_ipi(ipi_action, &mut dm, &[], cancellers, diag_log); + dispatch_ipi(ipi_action, &mut dm, &[], cancellers, diag_log, stats.start_time); } drop(dm); let _ = vcpu.skip_instruction(); @@ -1145,7 +1141,23 @@ mod imp { // Don't exit, just keep waiting for interrupts. stats.halt_count = 0; } - std::thread::sleep(Duration::from_millis(1)); + + // Tiered sleep: spin-yield phase to catch imminent interrupts, + // then short sleep if no interrupt arrived. + let mut woke_by_irq = false; + for i in 0..HLT_SPIN_ITERS { + std::thread::yield_now(); + if i % 10 == 9 { + let mut dm = devices.lock().unwrap(); + if dm.irq_chip.has_pending(ap_id) { + woke_by_irq = true; + break; + } + } + } + if !woke_by_irq { + std::thread::sleep(Duration::from_micros(HLT_SLEEP_US)); + } } VcpuExit::Shutdown => { log::info!("AP{}: shutdown after {} exits", ap_id, stats.exit_count); From 55d032c44a062417bc9144d52059982e0254a2cc Mon Sep 17 00:00:00 2001 From: lile Date: Thu, 30 Apr 2026 22:41:36 +0800 Subject: [PATCH 24/33] feat(windows): add QCOW2 backing file support to disk backend Enable copy-on-write (COW) disk chains in the Windows VMM by supporting QCOW2 backing file references. Unallocated clusters now delegate reads to the backing file instead of returning zeros, enabling thin COW overlays that eliminate expensive ~256MB disk copies per box. Also includes formatting cleanup from cargo fmt. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/lapic.rs | 2 +- src/vmm/src/windows/devices/virtio/disk.rs | 262 +++++++++++++++++++-- src/vmm/src/windows/runner.rs | 46 +++- 3 files changed, 276 insertions(+), 34 deletions(-) diff --git a/src/vmm/src/windows/devices/lapic.rs b/src/vmm/src/windows/devices/lapic.rs index e929b4a96..eb0c71bec 100644 --- a/src/vmm/src/windows/devices/lapic.rs +++ b/src/vmm/src/windows/devices/lapic.rs @@ -275,7 +275,7 @@ impl LocalApic { 0x310 => self.icr_high, // ICR High 0x320 => self.read_lvt_timer(), // LVT Timer 0x380 => self.timer_initial, // Timer Initial Count - 0x390 => self.current_count(), // Timer Current Count + 0x390 => self.current_count(), // Timer Current Count 0x3E0 => self.timer_divide_reg, // Timer Divide Configuration _ => 0, } diff --git a/src/vmm/src/windows/devices/virtio/disk.rs b/src/vmm/src/windows/devices/virtio/disk.rs index 87586e142..f33a6f62a 100644 --- a/src/vmm/src/windows/devices/virtio/disk.rs +++ b/src/vmm/src/windows/devices/virtio/disk.rs @@ -5,7 +5,7 @@ use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; -use std::path::Path; +use std::path::{Path, PathBuf}; use super::super::super::error::{Result, WkrunError}; /// Disk format: raw file passthrough. @@ -106,6 +106,8 @@ const L2_OFFSET_MASK: u64 = 0x00FF_FFFF_FFFF_FE00; struct Qcow2Header { #[allow(dead_code)] version: u32, + backing_file_offset: u64, // File offset of the backing file path (0 = none). + backing_file_size: u32, // Length of the backing file path in bytes. cluster_bits: u32, size: u64, // Virtual disk size in bytes. l1_size: u32, // Number of entries in the L1 table. @@ -134,13 +136,10 @@ impl Qcow2Header { ))); } - // Backing file offset (u64 at 8) — must be zero (no backing files). + // Backing file offset and size (parsed but not validated here — + // the backend's open() method handles backing file resolution). let backing_file_offset = u64::from_be_bytes(buf[8..16].try_into().unwrap()); - if backing_file_offset != 0 { - return Err(WkrunError::Device( - "qcow2 backing files are not supported".into(), - )); - } + let backing_file_size = u32::from_be_bytes(buf[16..20].try_into().unwrap()); let cluster_bits = u32::from_be_bytes([buf[20], buf[21], buf[22], buf[23]]); if !(9..=21).contains(&cluster_bits) { @@ -179,6 +178,8 @@ impl Qcow2Header { Ok(Qcow2Header { version, + backing_file_offset, + backing_file_size, cluster_bits, size, l1_size, @@ -190,11 +191,36 @@ impl Qcow2Header { } } +/// Detect disk format by checking for QCOW2 magic bytes. +fn detect_disk_format(path: &Path) -> Result { + let mut f = File::open(path).map_err(|e| { + WkrunError::Device(format!( + "failed to open '{}' for format detection: {}", + path.display(), + e + )) + })?; + let mut magic = [0u8; 4]; + f.read_exact(&mut magic).map_err(|e| { + WkrunError::Device(format!( + "failed to read magic from '{}': {}", + path.display(), + e + )) + })?; + if u32::from_be_bytes(magic) == QCOW2_MAGIC { + Ok(DISK_FORMAT_QCOW2) + } else { + Ok(DISK_FORMAT_RAW) + } +} + /// qcow2 image backend with two-level L1/L2 table navigation. /// /// Supports reading and writing existing qcow2 images. New clusters /// are allocated by appending to the end of the file (append-only). -/// No compression, encryption, snapshots, or backing file support. +/// Unallocated clusters delegate to an optional backing file. +/// No compression, encryption, or snapshot support. struct Qcow2DiskBackend { file: File, header: Qcow2Header, @@ -204,6 +230,7 @@ struct Qcow2DiskBackend { refcount_table: Vec, next_free_cluster: u64, read_only: bool, + backing: Option>, } impl Qcow2DiskBackend { @@ -260,6 +287,37 @@ impl Qcow2DiskBackend { .map_err(|e| WkrunError::Device(format!("failed to get qcow2 file size: {}", e)))?; let next_free_cluster = file_len.div_ceil(cluster_size) * cluster_size; + // Open backing file if referenced in the header. + let backing = if header.backing_file_offset != 0 && header.backing_file_size > 0 { + file.seek(SeekFrom::Start(header.backing_file_offset)) + .map_err(|e| { + WkrunError::Device(format!("failed to seek to backing file path: {}", e)) + })?; + let mut path_buf = vec![0u8; header.backing_file_size as usize]; + file.read_exact(&mut path_buf).map_err(|e| { + WkrunError::Device(format!("failed to read backing file path: {}", e)) + })?; + let backing_path_str = String::from_utf8(path_buf).map_err(|e| { + WkrunError::Device(format!("invalid UTF-8 in backing file path: {}", e)) + })?; + + // Resolve relative paths against the parent directory of this qcow2 file. + let backing_path = { + let p = PathBuf::from(&backing_path_str); + if p.is_absolute() { + p + } else { + path.parent().unwrap_or_else(|| Path::new(".")).join(&p) + } + }; + + let backing_format = detect_disk_format(&backing_path)?; + let backend = open_disk_backend(&backing_path, backing_format, true)?; + Some(backend) + } else { + None + }; + Ok(Qcow2DiskBackend { file, header, @@ -269,6 +327,7 @@ impl Qcow2DiskBackend { refcount_table, next_free_cluster, read_only, + backing, }) } @@ -476,8 +535,13 @@ impl DiskBackend for Qcow2DiskBackend { .map_err(|e| WkrunError::Device(format!("qcow2: read failed: {}", e)))?; } None => { - // Unallocated cluster — return zeros. - buf[pos..pos + chunk_len].fill(0); + // Unallocated cluster — read from backing file or return zeros. + match self.backing { + Some(ref mut b) => { + b.read_at(guest_offset, &mut buf[pos..pos + chunk_len])? + } + None => buf[pos..pos + chunk_len].fill(0), + } } } @@ -848,17 +912,23 @@ mod tests { } #[test] - fn test_qcow2_header_backing_file_rejected() { - let mut tmp = NamedTempFile::new().unwrap(); - let mut data = vec![0u8; 512]; - data[0..4].copy_from_slice(&QCOW2_MAGIC.to_be_bytes()); - data[4..8].copy_from_slice(&2u32.to_be_bytes()); - data[8..16].copy_from_slice(&100u64.to_be_bytes()); // Backing file offset. - tmp.write_all(&data).unwrap(); - tmp.flush().unwrap(); - - let err = Qcow2DiskBackend::open(tmp.path(), false).err().unwrap(); - assert!(err.to_string().contains("backing"), "error was: {}", err); + fn test_qcow2_header_backing_file_parsed() { + // Verify that header parsing accepts backing_file_offset != 0. + let mut buf = [0u8; 104]; + buf[0..4].copy_from_slice(&QCOW2_MAGIC.to_be_bytes()); + buf[4..8].copy_from_slice(&2u32.to_be_bytes()); + buf[8..16].copy_from_slice(&100u64.to_be_bytes()); // Backing file offset. + buf[16..20].copy_from_slice(&10u32.to_be_bytes()); // Backing file size. + buf[20..24].copy_from_slice(&16u32.to_be_bytes()); // cluster_bits. + buf[24..32].copy_from_slice(&(1024u64 * 1024).to_be_bytes()); // size. + buf[36..40].copy_from_slice(&1u32.to_be_bytes()); // l1_size. + buf[40..48].copy_from_slice(&(65536u64).to_be_bytes()); // l1_table_offset. + buf[48..56].copy_from_slice(&(65536u64).to_be_bytes()); // refcount_table_offset. + buf[56..60].copy_from_slice(&1u32.to_be_bytes()); // refcount_table_clusters. + + let header = Qcow2Header::parse(&buf).unwrap(); + assert_eq!(header.backing_file_offset, 100); + assert_eq!(header.backing_file_size, 10); } #[test] @@ -1075,4 +1145,154 @@ mod tests { backend.read_at(0, &mut buf).unwrap(); assert!(buf.iter().all(|&b| b == 0x99)); } + + // ----------------------------------------------------------------------- + // Backing file support + // ----------------------------------------------------------------------- + + /// Create a minimal qcow2 v2 image with a backing file reference. + /// + /// Layout (cluster_size = 512): + /// Cluster 0: header + backing file path + /// Cluster 1: refcount table + /// Cluster 2: refcount block + /// Cluster 3: L1 table (all zeros — everything reads from backing) + fn create_test_qcow2_with_backing( + virtual_size: u64, + cluster_bits: u32, + backing_path: &Path, + ) -> NamedTempFile { + let cluster_size = 1u64 << cluster_bits; + let l2_entries = cluster_size / 8; + let l1_entries = virtual_size.div_ceil(cluster_size * l2_entries) as u32; + + let backing_path_bytes = backing_path.to_string_lossy().as_bytes().to_vec(); + let backing_path_len = backing_path_bytes.len() as u32; + // Store backing path right after the 104-byte header. + let backing_file_offset: u64 = 104; + + let refcount_table_offset = cluster_size; + let refcount_block_offset = cluster_size * 2; + let l1_table_offset = cluster_size * 3; + let total_clusters = 4u64; + let file_size = cluster_size * total_clusters; + + let mut f = NamedTempFile::new().unwrap(); + let mut image = vec![0u8; file_size as usize]; + + // --- Header (cluster 0) --- + image[0..4].copy_from_slice(&QCOW2_MAGIC.to_be_bytes()); + image[4..8].copy_from_slice(&2u32.to_be_bytes()); // version + image[8..16].copy_from_slice(&backing_file_offset.to_be_bytes()); + image[16..20].copy_from_slice(&backing_path_len.to_be_bytes()); + image[20..24].copy_from_slice(&cluster_bits.to_be_bytes()); + image[24..32].copy_from_slice(&virtual_size.to_be_bytes()); + image[32..36].copy_from_slice(&0u32.to_be_bytes()); // crypt_method + image[36..40].copy_from_slice(&l1_entries.to_be_bytes()); + image[40..48].copy_from_slice(&l1_table_offset.to_be_bytes()); + image[48..56].copy_from_slice(&refcount_table_offset.to_be_bytes()); + image[56..60].copy_from_slice(&1u32.to_be_bytes()); // refcount_table_clusters + image[60..64].copy_from_slice(&0u32.to_be_bytes()); // nb_snapshots + + // Backing file path (after header). + let start = backing_file_offset as usize; + image[start..start + backing_path_bytes.len()].copy_from_slice(&backing_path_bytes); + + // --- Refcount table (cluster 1) --- + let rt_off = refcount_table_offset as usize; + image[rt_off..rt_off + 8].copy_from_slice(&refcount_block_offset.to_be_bytes()); + + // --- Refcount block (cluster 2) --- + let rb_off = refcount_block_offset as usize; + for i in 0..total_clusters { + let entry_off = rb_off + (i as usize) * 2; + image[entry_off..entry_off + 2].copy_from_slice(&1u16.to_be_bytes()); + } + + // L1 table (cluster 3) — all zeros (everything unallocated → backing). + + f.write_all(&image).unwrap(); + f.flush().unwrap(); + f + } + + #[test] + fn test_qcow2_backing_file_read() { + // Create a raw base disk with a known pattern. + let base = create_raw_file_with_pattern(8); // 8 sectors = 4096 bytes + let base_path = base.path().to_path_buf(); + + // Create a QCOW2 child that references the base as backing. + let child = create_test_qcow2_with_backing(4096, 9, &base_path); + + let mut backend = Qcow2DiskBackend::open(child.path(), false).unwrap(); + + // Read sector 0 — should come from backing (pattern byte = 0x00). + let mut buf = [0u8; 512]; + backend.read_at(0, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0x00)); + + // Read sector 3 — should come from backing (pattern byte = 0x03). + backend.read_at(512 * 3, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0x03)); + + // Read sector 7 — should come from backing (pattern byte = 0x07). + backend.read_at(512 * 7, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0x07)); + } + + #[test] + fn test_qcow2_backing_file_cow_write() { + // Create a raw base disk with pattern. + let base = create_raw_file_with_pattern(8); + let base_path = base.path().to_path_buf(); + + let child = create_test_qcow2_with_backing(4096, 9, &base_path); + let mut backend = Qcow2DiskBackend::open(child.path(), false).unwrap(); + + // Write to sector 2 in the child. + backend.write_at(512 * 2, &[0xFF; 512]).unwrap(); + + // Read sector 2 — should reflect the child write (0xFF). + let mut buf = [0u8; 512]; + backend.read_at(512 * 2, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0xFF)); + + // Read sector 3 — should still come from backing (0x03). + backend.read_at(512 * 3, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0x03)); + + // Read sector 0 — should still come from backing (0x00). + backend.read_at(0, &mut buf).unwrap(); + assert!(buf.iter().all(|&b| b == 0x00)); + } + + #[test] + fn test_qcow2_backing_file_missing_errors() { + let missing_path = Path::new("/nonexistent/backing/file.raw"); + let child = create_test_qcow2_with_backing(4096, 9, missing_path); + + let result = Qcow2DiskBackend::open(child.path(), false); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("nonexistent") || err.contains("No such file"), + "error was: {}", + err + ); + } + + #[test] + fn test_detect_disk_format_raw() { + let tmp = create_raw_file(1024); + let fmt = detect_disk_format(tmp.path()).unwrap(); + assert_eq!(fmt, DISK_FORMAT_RAW); + } + + #[test] + fn test_detect_disk_format_qcow2() { + let tmp = create_test_qcow2(1024 * 1024, 9, &[]); + let fmt = detect_disk_format(tmp.path()).unwrap(); + assert_eq!(fmt, DISK_FORMAT_QCOW2); + } } diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index e4bce538f..f57395c26 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -694,11 +694,7 @@ mod imp { if address >= crate::windows::memory::LAPIC_MMIO_BASE { let offset = address - crate::windows::memory::LAPIC_MMIO_BASE; if offset == 0x300 || offset == 0x310 { - diag!( - "LAPIC ICR write: offset={:#X} data={:#X}", - offset, - data - ); + diag!("LAPIC ICR write: offset={:#X} data={:#X}", offset, data); } } // Dispatch IPI if this was an ICR write. @@ -814,7 +810,13 @@ mod imp { .unwrap_or(0); let (qn, bc) = dm.blk_stats(); let apic_mode = dm.irq_chip.apic_mode(); - let blk_mode = if sync_block { "sync" } else if blk_workers_started { "async" } else { "pending" }; + let blk_mode = if sync_block { + "sync" + } else if blk_workers_started { + "async" + } else { + "pending" + }; drop(dm); log::info!( target: "whpx::diag", @@ -983,10 +985,21 @@ mod imp { GDT=base:{:#X}/lim:{:#X} IDT=base:{:#X}/lim:{:#X} \ CR0={:#X} CR4={:#X} EFER={:#X}", ap_id, - sregs.tr.selector, sregs.tr.base, sregs.tr.limit, sregs.tr.access_rights, - sregs.ldt.selector, sregs.ldt.base, sregs.ldt.limit, sregs.ldt.access_rights, - sregs.gdt.base, sregs.gdt.limit, sregs.idt.base, sregs.idt.limit, - sregs.cr0, sregs.cr4, sregs.efer + sregs.tr.selector, + sregs.tr.base, + sregs.tr.limit, + sregs.tr.access_rights, + sregs.ldt.selector, + sregs.ldt.base, + sregs.ldt.limit, + sregs.ldt.access_rights, + sregs.gdt.base, + sregs.gdt.limit, + sregs.idt.base, + sregs.idt.limit, + sregs.cr0, + sregs.cr4, + sregs.efer ); } @@ -1094,7 +1107,14 @@ mod imp { let ipi_action = dm.handle_mmio_write(ap_id, address, size, data, guest_mem); if !matches!(ipi_action, IpiAction::None) { // APs can send IPIs too (e.g., IPI to BSP for TLB shootdown). - dispatch_ipi(ipi_action, &mut dm, &[], cancellers, diag_log, stats.start_time); + dispatch_ipi( + ipi_action, + &mut dm, + &[], + cancellers, + diag_log, + stats.start_time, + ); } drop(dm); let _ = vcpu.skip_instruction(); @@ -1248,7 +1268,9 @@ mod imp { VcpuExit::Unknown(reason) => { diag!( "AP{}: unknown exit reason {} after {} exits", - ap_id, reason, stats.exit_count + ap_id, + reason, + stats.exit_count ); return; } From 55fbee8409f90649bf4ff3129219ea48dd086a9f Mon Sep 17 00:00:00 2001 From: lile Date: Fri, 1 May 2026 08:17:48 +0800 Subject: [PATCH 25/33] feat(windows): per-vCPU LAPIC locking for 4+ vCPU support Change IrqChip.lapics from Vec to Vec>> to eliminate cross-vCPU lock contention during LAPIC MMIO access. Add fast path in runner that bypasses DeviceManager lock entirely for per-vCPU LAPIC reads (all offsets) and simple writes (TPR, ICR High, LVT Timer, Initial Count, Divide Config). Writes with cross-device side effects (EOI, SVR, ICR Low) still go through the DeviceManager path. Optimize AP interrupt injection and HLT spin-yield to check per-LAPIC first (cheap) before acquiring DeviceManager lock (expensive). This resolves the 4+ vCPU hang where SMP timer calibration caused all vCPU threads to contend the single DeviceManager mutex for CCR reads, starving BSP's tick_and_poll() and blocking I/O completions. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/irq_chip.rs | 102 +++++++++++-- src/vmm/src/windows/devices/manager.rs | 12 +- src/vmm/src/windows/runner.rs | 194 +++++++++++++++++------- 3 files changed, 236 insertions(+), 72 deletions(-) diff --git a/src/vmm/src/windows/devices/irq_chip.rs b/src/vmm/src/windows/devices/irq_chip.rs index c9b2b2cb4..929484c46 100644 --- a/src/vmm/src/windows/devices/irq_chip.rs +++ b/src/vmm/src/windows/devices/irq_chip.rs @@ -10,6 +10,7 @@ //! The APIC mode is auto-detected: when the guest writes to the LAPIC SVR //! register with the enable bit set, the IrqChip switches to APIC mode. +use std::sync::{Arc, Mutex}; use std::time::Instant; use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; @@ -42,7 +43,12 @@ pub struct IrqChip { /// I/O APIC for routing device interrupts to the LAPICs. ioapic: IoApic, /// Per-vCPU Local APICs (indexed by vCPU ID). - lapics: Vec, + /// + /// Each LAPIC is wrapped in its own Arc> to allow per-vCPU locking. + /// This eliminates cross-vCPU contention during LAPIC MMIO reads (esp. timer + /// CCR at 0x390), which is critical for 4+ vCPU support — without this, SMP + /// timer calibration causes BSP starvation on tick_and_poll(). + lapics: Vec>>, /// false = PIC mode (early boot), true = APIC mode. apic_mode: bool, } @@ -57,7 +63,7 @@ impl IrqChip { /// Create a new IrqChip in PIC mode (legacy boot) with N LAPICs. pub fn new(num_vcpus: u8) -> Self { let lapics = (0..num_vcpus) - .map(|id| LocalApic::new_with_id(id)) + .map(|id| Arc::new(Mutex::new(LocalApic::new_with_id(id)))) .collect(); Self { pic: Pic::new(), @@ -67,6 +73,14 @@ impl IrqChip { } } + /// Get a clone of the Arc> for a specific vCPU. + /// + /// Used by the runner to acquire per-vCPU LAPIC refs that can be locked + /// independently of the DeviceManager lock (fast path for MMIO reads). + pub fn get_lapic_ref(&self, vcpu_id: u32) -> Arc> { + self.lapics[vcpu_id as usize].clone() + } + /// Number of vCPUs (LAPICs). pub fn num_vcpus(&self) -> u8 { self.lapics.len() as u8 @@ -90,7 +104,7 @@ impl IrqChip { let gsi = if irq == 0 { 2 } else { irq }; if let Some((vector, dest)) = self.ioapic.service_irq(gsi, true) { let target = (dest as usize).min(self.lapics.len() - 1); - self.lapics[target].accept_interrupt(vector); + self.lapics[target].lock().unwrap().accept_interrupt(vector); } } else { self.pic.raise_irq(irq); @@ -102,7 +116,10 @@ impl IrqChip { /// Checks LAPIC (APIC mode) or PIC (legacy mode, only for BSP / vCPU 0). pub fn get_injectable_vector(&self, vcpu_id: u8) -> Option { if self.apic_mode { - self.lapics[vcpu_id as usize].get_highest_injectable() + self.lapics[vcpu_id as usize] + .lock() + .unwrap() + .get_highest_injectable() } else if vcpu_id == 0 { if self.pic.has_pending() { // PIC has pending, but we need to peek — can't acknowledge yet. @@ -120,6 +137,8 @@ impl IrqChip { pub fn has_pending(&self, vcpu_id: u8) -> bool { if self.apic_mode { self.lapics[vcpu_id as usize] + .lock() + .unwrap() .get_highest_injectable() .is_some() } else if vcpu_id == 0 { @@ -135,7 +154,10 @@ impl IrqChip { /// In APIC mode: returns the highest injectable from the vCPU's LAPIC. pub fn acknowledge(&mut self, vcpu_id: u8) -> Option { if self.apic_mode { - self.lapics[vcpu_id as usize].get_highest_injectable() + self.lapics[vcpu_id as usize] + .lock() + .unwrap() + .get_highest_injectable() } else if vcpu_id == 0 { self.pic.acknowledge() } else { @@ -149,7 +171,10 @@ impl IrqChip { /// In PIC mode: no-op (PIC acknowledge already moved to ISR). pub fn notify_injected(&mut self, vcpu_id: u8, vector: u8) { if self.apic_mode { - self.lapics[vcpu_id as usize].start_of_interrupt(vector); + self.lapics[vcpu_id as usize] + .lock() + .unwrap() + .start_of_interrupt(vector); } } @@ -162,7 +187,10 @@ impl IrqChip { // Pin still asserted — re-deliver using the correct IOAPIC pin. if let Some((new_vector, dest)) = self.ioapic.service_irq(pin, true) { let target = (dest as usize).min(self.lapics.len() - 1); - self.lapics[target].accept_interrupt(new_vector); + self.lapics[target] + .lock() + .unwrap() + .accept_interrupt(new_vector); } } // Suppress unused variable warning — vcpu_id is used for routing context. @@ -174,7 +202,7 @@ impl IrqChip { if !self.apic_mode { return None; } - let lapic = &mut self.lapics[vcpu_id as usize]; + let mut lapic = self.lapics[vcpu_id as usize].lock().unwrap(); if let Some(vector) = lapic.tick_timer(now) { lapic.accept_interrupt(vector); Some(vector) @@ -193,7 +221,7 @@ impl IrqChip { Some(self.ioapic.read_mmio(offset)) } else if addr >= LAPIC_MMIO_BASE && addr < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { let offset = addr - LAPIC_MMIO_BASE; - Some(self.lapics[vcpu_id as usize].read_mmio(offset)) + Some(self.lapics[vcpu_id as usize].lock().unwrap().read_mmio(offset)) } else { None } @@ -221,7 +249,10 @@ impl IrqChip { } } else if addr >= LAPIC_MMIO_BASE && addr < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { let offset = addr - LAPIC_MMIO_BASE; - let result = self.lapics[vcpu_id as usize].write_mmio(offset, data); + let result = self.lapics[vcpu_id as usize] + .lock() + .unwrap() + .write_mmio(offset, data); // LAPIC SVR may have been enabled — check transition. self.check_apic_transition(); @@ -247,7 +278,10 @@ impl IrqChip { /// are handled by the runner's AP startup logic). pub fn deliver_ipi_interrupt(&mut self, target_apic_id: u8, vector: u8) { if (target_apic_id as usize) < self.lapics.len() { - self.lapics[target_apic_id as usize].accept_interrupt(vector); + self.lapics[target_apic_id as usize] + .lock() + .unwrap() + .accept_interrupt(vector); } } @@ -264,7 +298,7 @@ impl IrqChip { if self.apic_mode { return; } - let any_lapic_enabled = self.lapics.iter().any(|l| l.is_enabled()); + let any_lapic_enabled = self.lapics.iter().any(|l| l.lock().unwrap().is_enabled()); if any_lapic_enabled && self.ioapic.has_unmasked_entries() { log::info!("APIC mode enabled — LAPIC active + IOAPIC has unmasked entries"); self.apic_mode = true; @@ -296,10 +330,10 @@ mod tests { fn test_irq_chip_multi_vcpu_creates_lapics() { let chip = IrqChip::new(4); assert_eq!(chip.num_vcpus(), 4); - assert_eq!(chip.lapics[0].id(), 0); - assert_eq!(chip.lapics[1].id(), 1); - assert_eq!(chip.lapics[2].id(), 2); - assert_eq!(chip.lapics[3].id(), 3); + assert_eq!(chip.lapics[0].lock().unwrap().id(), 0); + assert_eq!(chip.lapics[1].lock().unwrap().id(), 1); + assert_eq!(chip.lapics[2].lock().unwrap().id(), 2); + assert_eq!(chip.lapics[3].lock().unwrap().id(), 3); } #[test] @@ -531,4 +565,40 @@ mod tests { let chip = IrqChip::default(); assert_eq!(chip.num_vcpus(), 1); } + + #[test] + fn test_get_lapic_ref_returns_correct_lapic() { + let chip = IrqChip::new(4); + for i in 0..4u32 { + let lapic_ref = chip.get_lapic_ref(i); + assert_eq!(lapic_ref.lock().unwrap().id(), i as u8); + } + } + + #[test] + fn test_concurrent_lapic_access() { + // Verify that per-LAPIC locks allow concurrent access from multiple threads. + let chip = IrqChip::new(4); + let refs: Vec>> = (0..4).map(|i| chip.get_lapic_ref(i)).collect(); + + std::thread::scope(|s| { + for (vcpu_id, lapic_ref) in refs.iter().enumerate() { + let lapic = lapic_ref.clone(); + s.spawn(move || { + // Each thread reads/writes its own LAPIC 1000 times. + for _ in 0..1000 { + let mut l = lapic.lock().unwrap(); + // Read LAPIC ID register (offset 0x020). + let id_val = l.read_mmio(0x020); + assert_eq!(id_val >> 24, vcpu_id as u32); + // Write TPR (offset 0x080). + l.write_mmio(0x080, 0x10); + // Read TPR back. + let tpr = l.read_mmio(0x080); + assert_eq!(tpr, 0x10); + } + }); + } + }); + } } diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index cd53fae34..562944913 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -15,7 +15,7 @@ use super::super::context::VmContext; use super::super::error::{Result, WkrunError}; use super::super::vcpu::IoHandler; use super::irq_chip::IrqChip; -use super::lapic::IpiAction; +use super::lapic::{IpiAction, LocalApic}; use super::pit::Pit; use super::serial::{Serial, COM1_BASE}; use super::virtio::balloon::VirtioBalloon; @@ -783,6 +783,16 @@ impl DeviceManager { pub fn set_window_requested(&mut self, requested: bool) { self.window_requested = requested; } + + /// Get per-vCPU LAPIC references for the runner fast path. + /// + /// Each ref can be locked independently of the DeviceManager lock, + /// eliminating cross-vCPU contention on LAPIC MMIO reads. + pub fn get_lapic_refs(&self) -> Vec>> { + (0..self.irq_chip.num_vcpus()) + .map(|i| self.irq_chip.get_lapic_ref(i as u32)) + .collect() + } } /// Create a `DeviceManager` from explicit components (for testing). diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index f57395c26..a59735fc1 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -18,11 +18,11 @@ mod imp { use super::super::boot::loader::load_kernel_with_initrd; use super::super::cmdline::build_kernel_cmdline; use super::super::context::VmContext; - use super::super::devices::lapic::IpiAction; + use super::super::devices::lapic::{IpiAction, LocalApic}; use super::super::devices::manager::{self as devices, DeviceManager}; use super::super::devices::virtio::queue::GuestMemoryAccessor; use super::super::error::{Result, WkrunError}; - use super::super::memory::GuestMemory; + use super::super::memory::{GuestMemory, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; use super::super::types::VcpuExit; use super::super::vcpu::VcpuRunConfig; use super::super::whpx::{VcpuCanceller, WhpxPartition, WhpxVcpu}; @@ -311,6 +311,12 @@ mod imp { // Shared VM shutdown flag — set by any vCPU to signal all others to exit. let shutdown = Arc::new(AtomicBool::new(false)); + + // Extract per-vCPU LAPIC refs BEFORE wrapping in Arc>. + // These allow the runner fast path to bypass the DeviceManager lock + // for LAPIC MMIO reads/writes, eliminating cross-vCPU contention. + let lapic_refs: Vec>> = devices.get_lapic_refs(); + let devices = Arc::new(Mutex::new(devices)); // Move diag_log into shared state for BSP diagnostics. @@ -331,6 +337,7 @@ mod imp { let run_config_ref = &run_config; let guest_mem_ref: &GuestMemory = &guest_mem; let diag_ref = &diag_log; + let lapic_refs_ref = &lapic_refs; // Spawn timer thread — cancels ALL vCPUs every 1ms. let timer_flag = run_config.running.clone(); @@ -350,6 +357,7 @@ mod imp { // Spawn AP threads (vCPU 1..N-1). for ap_idx in 1..num_vcpus as usize { let vcpu = &vcpus[ap_idx]; + let my_lapic = &lapic_refs_ref[ap_idx]; s.spawn(move || { run_ap_loop( ap_idx as u8, @@ -363,6 +371,7 @@ mod imp { &ap_states_ref[ap_idx - 1], ctx_id, diag_ref, + my_lapic, ); }); } @@ -380,6 +389,7 @@ mod imp { ctx_id, diag_ref, num_vcpus, + &lapic_refs_ref[0], ); // BSP exited — signal all APs to exit. shutdown_ref.store(true, Ordering::Release); @@ -570,6 +580,52 @@ mod imp { } } + /// Fast path: read from LAPIC MMIO without acquiring DeviceManager lock. + /// + /// All LAPIC register reads are safe to handle via the per-vCPU LAPIC lock + /// since they only access the vCPU's own LAPIC state (IRR, ISR, TPR, CCR, etc). + /// Returns Some(value) if the address is in the LAPIC range, None otherwise. + fn handle_lapic_mmio_read_fast(lapic: &Mutex, address: u64) -> Option { + if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { + let offset = address - LAPIC_MMIO_BASE; + Some(lapic.lock().unwrap().read_mmio(offset) as u64) + } else { + None + } + } + + /// Fast path: write to LAPIC MMIO without acquiring DeviceManager lock. + /// + /// Only handles registers that have NO side effects requiring other devices + /// (no EOI → IOAPIC propagation, no SVR → APIC transition check, no ICR → IPI). + /// Returns Some(()) if handled, None if the write needs the DeviceManager path. + fn handle_lapic_mmio_write_fast(lapic: &Mutex, address: u64, data: u64) -> Option<()> { + if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { + let offset = address - LAPIC_MMIO_BASE; + match offset { + // TPR: no cross-device side effects. + 0x080 | + // ICR High: sets destination, no IPI dispatch until ICR Low write. + 0x310 | + // LVT Timer: configures timer mode/vector, no immediate side effects. + 0x320 | + // Timer Initial Count: starts/resets timer countdown. + 0x380 | + // Timer Divide Config: sets timer divider. + 0x3E0 => { + lapic.lock().unwrap().write_mmio(offset, data as u32); + Some(()) + } + // EOI (0x0B0): needs IOAPIC propagation → DeviceManager path. + // SVR (0x0F0): needs APIC transition check → DeviceManager path. + // ICR Low (0x300): needs IPI dispatch → DeviceManager path. + _ => None, + } + } else { + None + } + } + /// BSP (Bootstrap Processor, vCPU 0) main loop. /// /// Handles timer ticking, device polling, interrupt injection, block worker @@ -586,6 +642,7 @@ mod imp { ctx_id: u32, diag_log: &Arc>>, num_vcpus: u8, + my_lapic: &Arc>, ) -> i32 { macro_rules! diag { ($($arg:tt)*) => { @@ -664,7 +721,12 @@ mod imp { VcpuExit::MmioRead { address, size } => { stats.halt_count = 0; stats.mmio_count += 1; - let data = devices.lock().unwrap().handle_mmio_read(0, address, size); + // Fast path: LAPIC reads bypass DeviceManager lock. + let data = if let Some(val) = handle_lapic_mmio_read_fast(my_lapic, address) { + val + } else { + devices.lock().unwrap().handle_mmio_read(0, address, size) + }; if let Err(e) = vcpu.complete_mmio_read(data) { log::error!("BSP complete_mmio_read error: {:?}", e); return 1; @@ -677,38 +739,42 @@ mod imp { } => { stats.halt_count = 0; stats.mmio_count += 1; - let mut dm = devices.lock().unwrap(); - if !blk_workers_started && !sync_block { - dm.start_blk_workers(); - blk_workers_started = true; - log::info!( - target: "whpx::diag", - "Block workers started at exit={} mmio={} elapsed={:.1}ms", - stats.exit_count, - stats.mmio_count, - stats.start_time.elapsed().as_secs_f64() * 1000.0 - ); - } - let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); - // Log LAPIC ICR writes for diagnostics. - if address >= crate::windows::memory::LAPIC_MMIO_BASE { - let offset = address - crate::windows::memory::LAPIC_MMIO_BASE; - if offset == 0x300 || offset == 0x310 { - diag!("LAPIC ICR write: offset={:#X} data={:#X}", offset, data); + // Fast path: simple LAPIC writes bypass DeviceManager lock. + if handle_lapic_mmio_write_fast(my_lapic, address, data).is_none() { + // Slow path: needs DeviceManager for EOI/SVR/ICR or non-LAPIC devices. + let mut dm = devices.lock().unwrap(); + if !blk_workers_started && !sync_block { + dm.start_blk_workers(); + blk_workers_started = true; + log::info!( + target: "whpx::diag", + "Block workers started at exit={} mmio={} elapsed={:.1}ms", + stats.exit_count, + stats.mmio_count, + stats.start_time.elapsed().as_secs_f64() * 1000.0 + ); } + let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); + // Log LAPIC ICR writes for diagnostics. + if address >= LAPIC_MMIO_BASE { + let offset = address - LAPIC_MMIO_BASE; + if offset == 0x300 || offset == 0x310 { + diag!("LAPIC ICR write: offset={:#X} data={:#X}", offset, data); + } + } + // Dispatch IPI if this was an ICR write. + if !matches!(ipi_action, IpiAction::None) { + dispatch_ipi( + ipi_action, + &mut dm, + ap_states, + cancellers, + diag_log, + stats.start_time, + ); + } + drop(dm); } - // Dispatch IPI if this was an ICR write. - if !matches!(ipi_action, IpiAction::None) { - dispatch_ipi( - ipi_action, - &mut dm, - ap_states, - cancellers, - diag_log, - stats.start_time, - ); - } - drop(dm); if let Err(e) = vcpu.skip_instruction() { log::error!("BSP skip_instruction error: {:?}", e); return 1; @@ -780,6 +846,12 @@ mod imp { for i in 0..HLT_SPIN_ITERS { std::thread::yield_now(); if i % 10 == 9 { + // Fast check: per-LAPIC only (no DeviceManager lock). + if my_lapic.lock().unwrap().get_highest_injectable().is_some() { + woke_by_irq = true; + break; + } + // Slow check: tick PIT + poll devices. let mut dm = devices.lock().unwrap(); dm.tick_and_poll(0, guest_mem); if dm.irq_chip.has_pending(0) { @@ -940,6 +1012,7 @@ mod imp { startup: &ApStartupState, _ctx_id: u32, diag_log: &Arc>>, + my_lapic: &Arc>, ) { macro_rules! diag { ($($arg:tt)*) => { @@ -1020,7 +1093,8 @@ mod imp { } // Try to inject pending interrupt (no timer ticking for APs). - { + // Fast path: check per-LAPIC first — usually no pending → skip DeviceManager. + if my_lapic.lock().unwrap().get_highest_injectable().is_some() { let mut dm = devices.lock().unwrap(); if let Err(e) = try_inject_interrupt(vcpu, ap_id, &mut dm, &mut stats) { log::error!("AP{}: interrupt injection error: {:?}", ap_id, e); @@ -1090,10 +1164,15 @@ mod imp { VcpuExit::MmioRead { address, size } => { stats.halt_count = 0; stats.mmio_count += 1; - let data = devices - .lock() - .unwrap() - .handle_mmio_read(ap_id, address, size); + // Fast path: LAPIC reads bypass DeviceManager lock. + let data = if let Some(val) = handle_lapic_mmio_read_fast(my_lapic, address) { + val + } else { + devices + .lock() + .unwrap() + .handle_mmio_read(ap_id, address, size) + }; let _ = vcpu.complete_mmio_read(data); } VcpuExit::MmioWrite { @@ -1103,20 +1182,25 @@ mod imp { } => { stats.halt_count = 0; stats.mmio_count += 1; - let mut dm = devices.lock().unwrap(); - let ipi_action = dm.handle_mmio_write(ap_id, address, size, data, guest_mem); - if !matches!(ipi_action, IpiAction::None) { - // APs can send IPIs too (e.g., IPI to BSP for TLB shootdown). - dispatch_ipi( - ipi_action, - &mut dm, - &[], - cancellers, - diag_log, - stats.start_time, - ); + // Fast path: simple LAPIC writes bypass DeviceManager lock. + if handle_lapic_mmio_write_fast(my_lapic, address, data).is_none() { + // Slow path: needs DeviceManager for EOI/SVR/ICR or non-LAPIC devices. + let mut dm = devices.lock().unwrap(); + let ipi_action = + dm.handle_mmio_write(ap_id, address, size, data, guest_mem); + if !matches!(ipi_action, IpiAction::None) { + // APs can send IPIs too (e.g., IPI to BSP for TLB shootdown). + dispatch_ipi( + ipi_action, + &mut dm, + &[], + cancellers, + diag_log, + stats.start_time, + ); + } + drop(dm); } - drop(dm); let _ = vcpu.skip_instruction(); } VcpuExit::InterruptWindow => { @@ -1130,8 +1214,8 @@ mod imp { return; } - // Check for pending interrupts. - { + // Check for pending interrupts (fast path: per-LAPIC only). + if my_lapic.lock().unwrap().get_highest_injectable().is_some() { let mut dm = devices.lock().unwrap(); if dm.irq_chip.has_pending(ap_id) { let already_pending = vcpu.has_pending_interruption().unwrap_or(false); @@ -1164,12 +1248,12 @@ mod imp { // Tiered sleep: spin-yield phase to catch imminent interrupts, // then short sleep if no interrupt arrived. + // Fast path: per-LAPIC only (no DeviceManager lock). let mut woke_by_irq = false; for i in 0..HLT_SPIN_ITERS { std::thread::yield_now(); if i % 10 == 9 { - let mut dm = devices.lock().unwrap(); - if dm.irq_chip.has_pending(ap_id) { + if my_lapic.lock().unwrap().get_highest_injectable().is_some() { woke_by_irq = true; break; } From e812fa3b2dc6d448ad96d18509d620ff34128bb9 Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 2 May 2026 10:01:51 +0800 Subject: [PATCH 26/33] feat(windows): lock-free LAPIC + CPUID topology override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lock-free interrupt delivery: - SharedApicState with atomic pending_irr for cross-vCPU interrupt injection without DeviceManager lock contention - try_inject_interrupt_fast() bypasses lock for common case - LapicWriteFastResult for inline ICR/EOI handling in vCPU loop - pull_irr after tick_and_poll to catch device-raised interrupts CPUID topology override: - Intercept leaf 0xB/0x1F (Extended Topology Enumeration) to return correct guest topology instead of host passthrough - Intercept leaf 4 (Deterministic Cache Parameters) to match guest vCPU count in max_cores and max_threads_sharing fields - Add input_rcx parameter to handle_cpuid() — guest's original ECX (sub-leaf number) is distinct from WHPX's default_rcx (output) MMIO diagnostic counters for tight loop detection (BSP hang analysis). Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/cmdline.rs | 1 + src/vmm/src/windows/devices/irq_chip.rs | 105 ++++- src/vmm/src/windows/devices/lapic.rs | 133 +++++- src/vmm/src/windows/devices/manager.rs | 21 +- src/vmm/src/windows/runner.rs | 574 ++++++++++++++++++++---- 5 files changed, 705 insertions(+), 129 deletions(-) diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index bac75f617..bf3422625 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -464,4 +464,5 @@ mod tests { assert!(mmio_pos < sep_pos); assert!(!cmdline.contains("root=/dev/vda")); } + } diff --git a/src/vmm/src/windows/devices/irq_chip.rs b/src/vmm/src/windows/devices/irq_chip.rs index 929484c46..0b36ad1b6 100644 --- a/src/vmm/src/windows/devices/irq_chip.rs +++ b/src/vmm/src/windows/devices/irq_chip.rs @@ -15,7 +15,7 @@ use std::time::Instant; use super::super::memory::{IOAPIC_MMIO_BASE, IOAPIC_MMIO_SIZE, LAPIC_MMIO_BASE, LAPIC_MMIO_SIZE}; use super::ioapic::IoApic; -use super::lapic::{IpiAction, LocalApic}; +use super::lapic::{IpiAction, LocalApic, SharedApicState}; use super::pic::Pic; /// Result of an IrqChip MMIO write operation. @@ -49,6 +49,11 @@ pub struct IrqChip { /// CCR at 0x390), which is critical for 4+ vCPU support — without this, SMP /// timer calibration causes BSP starvation on tick_and_poll(). lapics: Vec>>, + /// Per-vCPU shared APIC state for lock-free cross-vCPU interrupt delivery. + /// + /// Source vCPUs atomically OR vector bits into the target's SharedApicState. + /// The owning vCPU pulls these into its local IRR via `pull_irr()`. + shared_states: Vec>, /// false = PIC mode (early boot), true = APIC mode. apic_mode: bool, } @@ -65,10 +70,14 @@ impl IrqChip { let lapics = (0..num_vcpus) .map(|id| Arc::new(Mutex::new(LocalApic::new_with_id(id)))) .collect(); + let shared_states = (0..num_vcpus) + .map(|_| Arc::new(SharedApicState::new())) + .collect(); Self { pic: Pic::new(), ioapic: IoApic::new(), lapics, + shared_states, apic_mode: false, } } @@ -81,6 +90,13 @@ impl IrqChip { self.lapics[vcpu_id as usize].clone() } + /// Get a clone of the Arc for a specific vCPU. + /// + /// Used by the runner for lock-free cross-vCPU interrupt delivery. + pub fn get_shared_state(&self, vcpu_id: u32) -> Arc { + self.shared_states[vcpu_id as usize].clone() + } + /// Number of vCPUs (LAPICs). pub fn num_vcpus(&self) -> u8 { self.lapics.len() as u8 @@ -104,7 +120,8 @@ impl IrqChip { let gsi = if irq == 0 { 2 } else { irq }; if let Some((vector, dest)) = self.ioapic.service_irq(gsi, true) { let target = (dest as usize).min(self.lapics.len() - 1); - self.lapics[target].lock().unwrap().accept_interrupt(vector); + // Lock-free: atomic OR into shared state instead of locking LAPIC. + self.shared_states[target].request_interrupt(vector); } } else { self.pic.raise_irq(irq); @@ -187,10 +204,8 @@ impl IrqChip { // Pin still asserted — re-deliver using the correct IOAPIC pin. if let Some((new_vector, dest)) = self.ioapic.service_irq(pin, true) { let target = (dest as usize).min(self.lapics.len() - 1); - self.lapics[target] - .lock() - .unwrap() - .accept_interrupt(new_vector); + // Lock-free: atomic OR into shared state. + self.shared_states[target].request_interrupt(new_vector); } } // Suppress unused variable warning — vcpu_id is used for routing context. @@ -277,11 +292,10 @@ impl IrqChip { /// that targets another LAPIC (SendInterrupt variant only — INIT and SIPI /// are handled by the runner's AP startup logic). pub fn deliver_ipi_interrupt(&mut self, target_apic_id: u8, vector: u8) { - if (target_apic_id as usize) < self.lapics.len() { - self.lapics[target_apic_id as usize] - .lock() - .unwrap() - .accept_interrupt(vector); + let idx = target_apic_id as usize; + if idx < self.shared_states.len() { + // Lock-free: atomic OR into shared state instead of locking LAPIC. + self.shared_states[idx].request_interrupt(vector); } } @@ -409,6 +423,8 @@ mod tests { assert!(chip.apic_mode()); chip.raise_irq(5); + // pull_irr: merge shared state into local IRR (lock-free delivery path). + chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); assert!(chip.has_pending(0)); let vector = chip.acknowledge(0); @@ -429,6 +445,8 @@ mod tests { // raise_irq(0) should remap to IOAPIC pin 2 and deliver vector 0x22. chip.raise_irq(0); + // pull_irr: merge shared state into local IRR (lock-free delivery path). + chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); assert!(chip.has_pending(0)); let vector = chip.acknowledge(0); @@ -494,6 +512,8 @@ mod tests { // Raise IRQ 3. chip.raise_irq(3); + // pull_irr: merge shared state into local IRR (lock-free delivery path). + chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); let vector = chip.acknowledge(0); assert_eq!(vector, Some(0x33)); @@ -503,7 +523,8 @@ mod tests { // Write EOI to LAPIC (offset 0x0B0). chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x0B0, 4, 0); - // After EOI, the pin is still asserted → re-injection. + // After EOI, the pin is still asserted → re-injection via shared state. + chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); assert!(chip.has_pending(0)); } @@ -544,6 +565,8 @@ mod tests { // Deliver IPI to vCPU 1. chip.deliver_ipi_interrupt(1, 0x40); + // pull_irr: merge shared state into local IRR (lock-free delivery path). + chip.lapics[1].lock().unwrap().pull_irr(&chip.shared_states[1]); assert!(chip.has_pending(1)); assert_eq!(chip.acknowledge(1), Some(0x40)); } @@ -601,4 +624,62 @@ mod tests { } }); } + + // ---- Lock-free SharedApicState integration tests ---- + + #[test] + fn test_raise_irq_uses_shared_state() { + use super::lapic::SharedApicState; + + let mut chip = IrqChip::new(2); + // Enable APIC mode: SVR on vCPU 0. + chip.lapics[0].lock().unwrap().write_mmio(0x0F0, 0x1FF); + // Unmask IOAPIC entry 1 → GSI 1, vector 49, dest = LAPIC 0. + chip.ioapic.set_entry(1, 49, 0, false); + chip.apic_mode = true; + + // raise_irq goes through shared state (lock-free). + chip.raise_irq(1); + + // Before pull_irr, LAPIC has nothing. + assert_eq!(chip.lapics[0].lock().unwrap().get_highest_injectable(), None); + + // After pull_irr, LAPIC sees vector 49. + let shared = chip.get_shared_state(0); + chip.lapics[0].lock().unwrap().pull_irr(&shared); + assert_eq!( + chip.lapics[0].lock().unwrap().get_highest_injectable(), + Some(49) + ); + } + + #[test] + fn test_deliver_ipi_lock_free() { + use super::lapic::SharedApicState; + + let mut chip = IrqChip::new(2); + // Enable both LAPICs. + chip.lapics[0].lock().unwrap().write_mmio(0x0F0, 0x1FF); + chip.lapics[1].lock().unwrap().write_mmio(0x0F0, 0x1FF); + chip.apic_mode = true; + + // Deliver IPI: vector 80 to LAPIC 1. + chip.deliver_ipi_interrupt(1, 80); + + // Before pull_irr, LAPIC 1 has nothing. + assert_eq!(chip.lapics[1].lock().unwrap().get_highest_injectable(), None); + + // After pull_irr, LAPIC 1 sees vector 80. + let shared = chip.get_shared_state(1); + chip.lapics[1].lock().unwrap().pull_irr(&shared); + assert_eq!( + chip.lapics[1].lock().unwrap().get_highest_injectable(), + Some(80) + ); + + // LAPIC 0 should be unaffected. + let shared0 = chip.get_shared_state(0); + chip.lapics[0].lock().unwrap().pull_irr(&shared0); + assert_eq!(chip.lapics[0].lock().unwrap().get_highest_injectable(), None); + } } diff --git a/src/vmm/src/windows/devices/lapic.rs b/src/vmm/src/windows/devices/lapic.rs index eb0c71bec..8ec0244e7 100644 --- a/src/vmm/src/windows/devices/lapic.rs +++ b/src/vmm/src/windows/devices/lapic.rs @@ -19,8 +19,45 @@ //! - 0x390: Timer Current Count //! - 0x3E0: Timer Divide Configuration +use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Instant; +/// Shared APIC state for lock-free cross-vCPU interrupt delivery. +/// +/// Other vCPUs atomically OR bits into `new_irr`. The owning vCPU +/// periodically calls `pull_irr()` to merge into its local IRR. +/// Inspired by OpenVMM's `virt_support_apic::SharedState`. +pub struct SharedApicState { + /// Remote interrupt requests (256 bits = 8 x AtomicU32). + /// Source vCPUs atomic-OR the vector bit here. + new_irr: [AtomicU32; 8], +} + +impl SharedApicState { + /// Create a new shared state with no pending interrupts. + pub fn new() -> Self { + Self { + new_irr: std::array::from_fn(|_| AtomicU32::new(0)), + } + } + + /// Atomically request an interrupt vector on this vCPU. + /// + /// Returns `true` if the bit was newly set (caller should wake target vCPU). + pub fn request_interrupt(&self, vector: u8) -> bool { + let (bank, mask) = bank_mask(vector); + let prev = self.new_irr[bank].fetch_or(mask, Ordering::Release); + prev & mask == 0 + } +} + +/// Compute the bank index and bit mask for a vector (0-255). +fn bank_mask(vector: u8) -> (usize, u32) { + let bank = (vector / 32) as usize; + let bit = vector % 32; + (bank, 1u32 << bit) +} + /// Action resulting from an ICR write (Inter-Processor Interrupt). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum IpiAction { @@ -151,6 +188,19 @@ impl LocalApic { self.svr & SVR_APIC_ENABLE != 0 } + /// Pull remote interrupt requests from the shared state into the local IRR. + /// + /// Atomically swaps each bank to 0 and ORs the bits into the local IRR. + /// Called at the top of each vCPU loop iteration (lock-free fast path). + pub fn pull_irr(&mut self, shared: &SharedApicState) { + for i in 0..8 { + let bits = shared.new_irr[i].swap(0, Ordering::Acquire); + if bits != 0 { + self.irr[i] |= bits; + } + } + } + /// Accept an interrupt vector into the IRR. pub fn accept_interrupt(&mut self, vector: u8) { let word = (vector / 32) as usize; @@ -338,15 +388,6 @@ impl LocalApic { } /// Parse the ICR low/high registers to produce an IPI action. - /// - /// ICR Low bits: - /// - [7:0] Vector - /// - [10:8] Delivery mode: 000=Fixed, 101=INIT, 110=Startup (SIPI) - /// - [11] Destination mode (ignored, always physical) - /// - [17:16] Destination shorthand (00=field, others unsupported) - /// - /// ICR High bits: - /// - [31:24] Destination APIC ID fn parse_icr(&self) -> IpiAction { let vector = (self.icr_low & 0xFF) as u8; let delivery_mode = (self.icr_low >> 8) & 0x7; @@ -872,4 +913,78 @@ mod tests { assert_eq!(result.ipi_action, IpiAction::None); assert_eq!(result.eoi_vector, None); } + + + // ---- SharedApicState tests ---- + + #[test] + fn test_shared_request_interrupt() { + let shared = SharedApicState::new(); + // Vector 32 → bank 1, bit 0. + assert!(shared.request_interrupt(32)); // first set → true + assert!(!shared.request_interrupt(32)); // already set → false + // Vector 33 → bank 1, bit 1. + assert!(shared.request_interrupt(33)); // different bit → true + } + + #[test] + fn test_shared_pull_irr() { + let shared = SharedApicState::new(); + let mut lapic = LocalApic::new(); + + shared.request_interrupt(48); // bank 1, bit 16 + shared.request_interrupt(100); // bank 3, bit 4 + + lapic.pull_irr(&shared); + + // After pull, shared should be cleared. + assert!(shared.request_interrupt(48)); // re-setting returns true (was cleared) + + // LAPIC should now have vector 100 injectable (highest). + // Enable LAPIC first (SVR bit 8). + lapic.write_mmio(0x0F0, 0x1FF); + assert_eq!(lapic.get_highest_injectable(), Some(100)); + } + + #[test] + fn test_shared_concurrent_ipi() { + use std::sync::Arc; + + let shared = Arc::new(SharedApicState::new()); + let num_threads = 8; + // Each thread sets a distinct vector: 32, 33, ..., 39. + std::thread::scope(|s| { + for t in 0..num_threads { + let sh = shared.clone(); + let vector = 32 + t as u8; + s.spawn(move || { + assert!(sh.request_interrupt(vector)); + }); + } + }); + + // Pull all into a LAPIC and verify all vectors present. + let mut lapic = LocalApic::new(); + lapic.pull_irr(&shared); + lapic.write_mmio(0x0F0, 0x1FF); // enable + // Highest should be 39. + assert_eq!(lapic.get_highest_injectable(), Some(39)); + } + + #[test] + fn test_pull_irr_priority() { + let shared = SharedApicState::new(); + let mut lapic = LocalApic::new(); + + shared.request_interrupt(64); // lower priority + shared.request_interrupt(200); // higher priority + + lapic.pull_irr(&shared); + lapic.write_mmio(0x0F0, 0x1FF); // enable + assert_eq!(lapic.get_highest_injectable(), Some(200)); + + // Acknowledge 200, next should be 64. + lapic.start_of_interrupt(200); + assert_eq!(lapic.get_highest_injectable(), Some(64)); + } } diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index 562944913..ced419eef 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -15,7 +15,7 @@ use super::super::context::VmContext; use super::super::error::{Result, WkrunError}; use super::super::vcpu::IoHandler; use super::irq_chip::IrqChip; -use super::lapic::{IpiAction, LocalApic}; +use super::lapic::{IpiAction, LocalApic, SharedApicState}; use super::pit::Pit; use super::serial::{Serial, COM1_BASE}; use super::virtio::balloon::VirtioBalloon; @@ -684,16 +684,8 @@ impl DeviceManager { } } - // Tick LAPIC timers for ALL vCPUs (only fires in APIC mode). - // Each AP's LAPIC timer must advance so the kernel scheduler can preempt - // tasks on all CPUs. Without this, AP LAPIC timer calibration hangs. - // Throttle: LAPIC timers fire at ~100Hz (10ms period), so checking more - // than every 500µs wastes CPU. PIT timer (IRQ 0) still fires every tick. - if elapsed_ns > 500_000 { - for i in 0..self.irq_chip.num_vcpus() { - self.irq_chip.tick_timer(i, now); - } - } + // LAPIC timers are now ticked per-vCPU in the runner loop (lock-free). + // This eliminates cross-vCPU contention on tick_and_poll(). // Suppress unused variable — vcpu_id was the original single-vCPU target. let _ = vcpu_id; @@ -793,6 +785,13 @@ impl DeviceManager { .map(|i| self.irq_chip.get_lapic_ref(i as u32)) .collect() } + + /// Get per-vCPU shared APIC states for lock-free cross-vCPU interrupt delivery. + pub fn get_shared_states(&self) -> Vec> { + (0..self.irq_chip.num_vcpus()) + .map(|i| self.irq_chip.get_shared_state(i as u32)) + .collect() + } } /// Create a `DeviceManager` from explicit components (for testing). diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index a59735fc1..682c6767b 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -18,7 +18,7 @@ mod imp { use super::super::boot::loader::load_kernel_with_initrd; use super::super::cmdline::build_kernel_cmdline; use super::super::context::VmContext; - use super::super::devices::lapic::{IpiAction, LocalApic}; + use super::super::devices::lapic::{IpiAction, LocalApic, SharedApicState}; use super::super::devices::manager::{self as devices, DeviceManager}; use super::super::devices::virtio::queue::GuestMemoryAccessor; use super::super::error::{Result, WkrunError}; @@ -317,6 +317,10 @@ mod imp { // for LAPIC MMIO reads/writes, eliminating cross-vCPU contention. let lapic_refs: Vec>> = devices.get_lapic_refs(); + // Extract per-vCPU shared APIC states for lock-free cross-vCPU interrupt delivery. + // Source vCPUs atomic-OR vector bits here; owning vCPU pulls into local IRR. + let shared_states: Vec> = devices.get_shared_states(); + let devices = Arc::new(Mutex::new(devices)); // Move diag_log into shared state for BSP diagnostics. @@ -338,6 +342,7 @@ mod imp { let guest_mem_ref: &GuestMemory = &guest_mem; let diag_ref = &diag_log; let lapic_refs_ref = &lapic_refs; + let shared_states_ref = &shared_states; // Spawn timer thread — cancels ALL vCPUs every 1ms. let timer_flag = run_config.running.clone(); @@ -358,6 +363,7 @@ mod imp { for ap_idx in 1..num_vcpus as usize { let vcpu = &vcpus[ap_idx]; let my_lapic = &lapic_refs_ref[ap_idx]; + let my_shared = &shared_states_ref[ap_idx]; s.spawn(move || { run_ap_loop( ap_idx as u8, @@ -372,6 +378,8 @@ mod imp { ctx_id, diag_ref, my_lapic, + my_shared, + shared_states_ref, ); }); } @@ -390,6 +398,8 @@ mod imp { diag_ref, num_vcpus, &lapic_refs_ref[0], + &shared_states_ref[0], + shared_states_ref, ); // BSP exited — signal all APs to exit. shutdown_ref.store(true, Ordering::Release); @@ -435,6 +445,10 @@ mod imp { last_progress: Instant, start_time: Instant, window_requested: bool, + /// Last MMIO read address (for tight loop detection). + last_mmio_read_addr: u64, + /// Consecutive MMIO reads to the same address. + consecutive_mmio_reads: u64, } impl VcpuStats { @@ -453,6 +467,8 @@ mod imp { last_progress: now, start_time: now, window_requested: false, + last_mmio_read_addr: u64::MAX, + consecutive_mmio_reads: 0, } } } @@ -498,6 +514,49 @@ mod imp { Ok(()) } + /// Lock-free interrupt injection for APIC mode — no DeviceManager lock. + /// + /// Checks the per-vCPU LAPIC for injectable vectors and injects directly + /// via the WHPX vCPU API. Only the owning vCPU's LAPIC mutex is acquired. + fn try_inject_interrupt_fast( + vcpu: &WhpxVcpu, + lapic: &Mutex, + stats: &mut VcpuStats, + ) -> Result<()> { + let has_injectable = lapic.lock().unwrap().get_highest_injectable().is_some(); + if !has_injectable { + return Ok(()); + } + + let already_pending = vcpu.has_pending_interruption().unwrap_or(false); + if already_pending { + return Ok(()); + } + + match vcpu.interrupts_enabled() { + Ok(true) => { + let mut guard = lapic.lock().unwrap(); + if let Some(vector) = guard.get_highest_injectable() { + guard.start_of_interrupt(vector); + drop(guard); + vcpu.inject_interrupt(vector)?; + stats.inject_count += 1; + stats.window_requested = false; + } + } + Ok(false) => { + if !stats.window_requested { + vcpu.request_interrupt_window()?; + stats.window_requested = true; + } + } + Err(ref e) => { + log::warn!("try_inject_interrupt_fast: interrupts_enabled() error: {:?}", e); + } + } + Ok(()) + } + /// Dispatch an IPI action from a LAPIC ICR write. fn dispatch_ipi( action: IpiAction, @@ -594,12 +653,27 @@ mod imp { } } + /// Result of a fast-path LAPIC MMIO write. + enum LapicWriteFastResult { + /// Write handled completely (no further action needed). + Handled, + /// ICR Low write: IPI action needs dispatching inline (lock-free). + IpiAction(IpiAction), + /// Not handled: needs DeviceManager slow path (EOI, SVR, non-LAPIC). + NotHandled, + } + /// Fast path: write to LAPIC MMIO without acquiring DeviceManager lock. /// - /// Only handles registers that have NO side effects requiring other devices - /// (no EOI → IOAPIC propagation, no SVR → APIC transition check, no ICR → IPI). - /// Returns Some(()) if handled, None if the write needs the DeviceManager path. - fn handle_lapic_mmio_write_fast(lapic: &Mutex, address: u64, data: u64) -> Option<()> { + /// Handles most LAPIC registers directly via per-vCPU lock. ICR Low writes + /// are parsed and returned as IpiAction for inline dispatch (lock-free IPI). + /// Only EOI (→ IOAPIC propagation) and SVR (→ APIC transition check) + /// require the DeviceManager slow path. + fn handle_lapic_mmio_write_fast( + lapic: &Mutex, + address: u64, + data: u64, + ) -> LapicWriteFastResult { if address >= LAPIC_MMIO_BASE && address < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { let offset = address - LAPIC_MMIO_BASE; match offset { @@ -614,15 +688,20 @@ mod imp { // Timer Divide Config: sets timer divider. 0x3E0 => { lapic.lock().unwrap().write_mmio(offset, data as u32); - Some(()) + LapicWriteFastResult::Handled + } + // ICR Low (0x300): parse ICR and return IPI action for inline dispatch. + // This eliminates the DeviceManager lock for ALL IPI dispatch. + 0x300 => { + let result = lapic.lock().unwrap().write_mmio(offset, data as u32); + LapicWriteFastResult::IpiAction(result.ipi_action) } // EOI (0x0B0): needs IOAPIC propagation → DeviceManager path. // SVR (0x0F0): needs APIC transition check → DeviceManager path. - // ICR Low (0x300): needs IPI dispatch → DeviceManager path. - _ => None, + _ => LapicWriteFastResult::NotHandled, } } else { - None + LapicWriteFastResult::NotHandled } } @@ -643,6 +722,8 @@ mod imp { diag_log: &Arc>>, num_vcpus: u8, my_lapic: &Arc>, + my_shared: &Arc, + all_shared: &[Arc], ) -> i32 { macro_rules! diag { ($($arg:tt)*) => { @@ -666,15 +747,38 @@ mod imp { return 0; } - // Tick PIT and poll devices (BSP only). + // 1. Pull remote interrupts (lock-free) + tick own LAPIC timer. + { + let mut lapic = my_lapic.lock().unwrap(); + lapic.pull_irr(my_shared); + if let Some(vector) = lapic.tick_timer(Instant::now()) { + lapic.accept_interrupt(vector); + } + } + // 2. Lock-free interrupt injection (APIC mode, per-vCPU only). + if let Err(e) = try_inject_interrupt_fast(vcpu, my_lapic, &mut stats) { + log::error!("BSP lock-free inject error: {:?}", e); + } + // 3. Tick PIT + poll devices (reduced lock time — no LAPIC timer loop). { let mut dm = devices.lock().unwrap(); dm.tick_and_poll(0, guest_mem); - // Try to inject pending interrupt. - if let Err(e) = try_inject_interrupt(vcpu, 0, &mut dm, &mut stats) { - log::error!("BSP interrupt injection error: {:?}", e); + // PIC mode fallback: inject via DeviceManager path. + if !dm.irq_chip.apic_mode() { + if let Err(e) = try_inject_interrupt(vcpu, 0, &mut dm, &mut stats) { + log::error!("BSP PIC inject error: {:?}", e); + } } } + // 4. Pull any interrupts raised by tick_and_poll (device completions, + // PIT timer) and inject before entering the guest. + { + let mut lapic = my_lapic.lock().unwrap(); + lapic.pull_irr(my_shared); + } + if let Err(e) = try_inject_interrupt_fast(vcpu, my_lapic, &mut stats) { + log::error!("BSP post-poll inject error: {:?}", e); + } let exit = match vcpu.run() { Ok(exit) => exit, @@ -721,6 +825,23 @@ mod imp { VcpuExit::MmioRead { address, size } => { stats.halt_count = 0; stats.mmio_count += 1; + // Detect tight MMIO read loops (same address read 10K+ times). + // This catches BSP hang during LAPIC timer calibration. + if address == stats.last_mmio_read_addr { + stats.consecutive_mmio_reads += 1; + if stats.consecutive_mmio_reads == 10_000 { + log::warn!( + "BSP: tight MMIO read loop: addr={:#X} count={} exit={}", + address, stats.consecutive_mmio_reads, stats.exit_count + ); + if let Ok(regs) = vcpu.get_registers() { + log::warn!("BSP: RIP={:#X} at tight MMIO loop", regs.rip); + } + } + } else { + stats.last_mmio_read_addr = address; + stats.consecutive_mmio_reads = 1; + } // Fast path: LAPIC reads bypass DeviceManager lock. let data = if let Some(val) = handle_lapic_mmio_read_fast(my_lapic, address) { val @@ -739,41 +860,63 @@ mod imp { } => { stats.halt_count = 0; stats.mmio_count += 1; - // Fast path: simple LAPIC writes bypass DeviceManager lock. - if handle_lapic_mmio_write_fast(my_lapic, address, data).is_none() { - // Slow path: needs DeviceManager for EOI/SVR/ICR or non-LAPIC devices. - let mut dm = devices.lock().unwrap(); - if !blk_workers_started && !sync_block { - dm.start_blk_workers(); - blk_workers_started = true; - log::info!( - target: "whpx::diag", - "Block workers started at exit={} mmio={} elapsed={:.1}ms", - stats.exit_count, - stats.mmio_count, - stats.start_time.elapsed().as_secs_f64() * 1000.0 - ); + match handle_lapic_mmio_write_fast(my_lapic, address, data) { + LapicWriteFastResult::Handled => { + // Fast path: handled via per-vCPU lock only. } - let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); - // Log LAPIC ICR writes for diagnostics. - if address >= LAPIC_MMIO_BASE { - let offset = address - LAPIC_MMIO_BASE; - if offset == 0x300 || offset == 0x310 { - diag!("LAPIC ICR write: offset={:#X} data={:#X}", offset, data); + LapicWriteFastResult::IpiAction(action) => { + // ICR fast path: dispatch IPI inline (lock-free). + match action { + IpiAction::SendInterrupt { target_apic_id, vector } => { + let idx = target_apic_id as usize; + if idx < all_shared.len() { + all_shared[idx].request_interrupt(vector); + if idx < cancellers.len() { + let _ = cancellers[idx].cancel(); + } + } + } + IpiAction::SendInit { .. } | IpiAction::SendSipi { .. } => { + // INIT/SIPI use existing condvar mechanism. + dispatch_ipi( + action, + &mut devices.lock().unwrap(), + ap_states, + cancellers, + diag_log, + stats.start_time, + ); + } + IpiAction::None => {} } } - // Dispatch IPI if this was an ICR write. - if !matches!(ipi_action, IpiAction::None) { - dispatch_ipi( - ipi_action, - &mut dm, - ap_states, - cancellers, - diag_log, - stats.start_time, - ); + LapicWriteFastResult::NotHandled => { + // Slow path: needs DeviceManager for EOI/SVR or non-LAPIC devices. + let mut dm = devices.lock().unwrap(); + if !blk_workers_started && !sync_block { + dm.start_blk_workers(); + blk_workers_started = true; + log::info!( + target: "whpx::diag", + "Block workers started at exit={} mmio={} elapsed={:.1}ms", + stats.exit_count, + stats.mmio_count, + stats.start_time.elapsed().as_secs_f64() * 1000.0 + ); + } + let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); + if !matches!(ipi_action, IpiAction::None) { + dispatch_ipi( + ipi_action, + &mut dm, + ap_states, + cancellers, + diag_log, + stats.start_time, + ); + } + drop(dm); } - drop(dm); } if let Err(e) = vcpu.skip_instruction() { log::error!("BSP skip_instruction error: {:?}", e); @@ -791,11 +934,28 @@ mod imp { return 0; } - // Poll devices before sleeping. + // Pull remote interrupts + tick timer before sleeping. + { + let mut lapic = my_lapic.lock().unwrap(); + lapic.pull_irr(my_shared); + if let Some(vector) = lapic.tick_timer(Instant::now()) { + lapic.accept_interrupt(vector); + } + if lapic.get_highest_injectable().is_some() { + drop(lapic); + if let Err(e) = try_inject_interrupt_fast(vcpu, my_lapic, &mut stats) { + log::error!("BSP HLT inject error: {:?}", e); + } + stats.halt_with_irq += 1; + stats.halt_count = 0; + continue; + } + } + // Also poll PIT + devices (PIC mode fallback + block I/O). { let mut dm = devices.lock().unwrap(); dm.tick_and_poll(0, guest_mem); - if dm.irq_chip.has_pending(0) { + if !dm.irq_chip.apic_mode() && dm.irq_chip.has_pending(0) { let already_pending = vcpu.has_pending_interruption().unwrap_or(false); if !already_pending { if let Some(vector) = dm.irq_chip.acknowledge(0) { @@ -846,18 +1006,17 @@ mod imp { for i in 0..HLT_SPIN_ITERS { std::thread::yield_now(); if i % 10 == 9 { - // Fast check: per-LAPIC only (no DeviceManager lock). - if my_lapic.lock().unwrap().get_highest_injectable().is_some() { + // Fast check: pull_irr + per-LAPIC (no DeviceManager lock). + let mut lapic = my_lapic.lock().unwrap(); + lapic.pull_irr(my_shared); + if lapic.get_highest_injectable().is_some() { woke_by_irq = true; break; } + drop(lapic); // Slow check: tick PIT + poll devices. let mut dm = devices.lock().unwrap(); dm.tick_and_poll(0, guest_mem); - if dm.irq_chip.has_pending(0) { - woke_by_irq = true; - break; - } } } if !woke_by_irq { @@ -947,6 +1106,7 @@ mod imp { 0, num_vcpus, rax as u32, + rcx, default_rax, default_rbx, default_rcx, @@ -1013,6 +1173,8 @@ mod imp { _ctx_id: u32, diag_log: &Arc>>, my_lapic: &Arc>, + my_shared: &Arc, + all_shared: &[Arc], ) { macro_rules! diag { ($($arg:tt)*) => { @@ -1092,14 +1254,18 @@ mod imp { return; } - // Try to inject pending interrupt (no timer ticking for APs). - // Fast path: check per-LAPIC first — usually no pending → skip DeviceManager. - if my_lapic.lock().unwrap().get_highest_injectable().is_some() { - let mut dm = devices.lock().unwrap(); - if let Err(e) = try_inject_interrupt(vcpu, ap_id, &mut dm, &mut stats) { - log::error!("AP{}: interrupt injection error: {:?}", ap_id, e); + // Pull remote interrupts (lock-free) + tick own LAPIC timer. + { + let mut lapic = my_lapic.lock().unwrap(); + lapic.pull_irr(my_shared); + if let Some(vector) = lapic.tick_timer(Instant::now()) { + lapic.accept_interrupt(vector); } } + // Lock-free interrupt injection (APIC mode, per-vCPU only). + if let Err(e) = try_inject_interrupt_fast(vcpu, my_lapic, &mut stats) { + log::error!("AP{}: lock-free inject error: {:?}", ap_id, e); + } let exit = match vcpu.run() { Ok(exit) => exit, @@ -1164,6 +1330,22 @@ mod imp { VcpuExit::MmioRead { address, size } => { stats.halt_count = 0; stats.mmio_count += 1; + // Detect tight MMIO read loops (same address read 10K+ times). + if address == stats.last_mmio_read_addr { + stats.consecutive_mmio_reads += 1; + if stats.consecutive_mmio_reads == 10_000 { + log::warn!( + "AP{}: tight MMIO read loop: addr={:#X} count={} exit={}", + ap_id, address, stats.consecutive_mmio_reads, stats.exit_count + ); + if let Ok(regs) = vcpu.get_registers() { + log::warn!("AP{}: RIP={:#X} at tight MMIO loop", ap_id, regs.rip); + } + } + } else { + stats.last_mmio_read_addr = address; + stats.consecutive_mmio_reads = 1; + } // Fast path: LAPIC reads bypass DeviceManager lock. let data = if let Some(val) = handle_lapic_mmio_read_fast(my_lapic, address) { val @@ -1182,24 +1364,52 @@ mod imp { } => { stats.halt_count = 0; stats.mmio_count += 1; - // Fast path: simple LAPIC writes bypass DeviceManager lock. - if handle_lapic_mmio_write_fast(my_lapic, address, data).is_none() { - // Slow path: needs DeviceManager for EOI/SVR/ICR or non-LAPIC devices. - let mut dm = devices.lock().unwrap(); - let ipi_action = - dm.handle_mmio_write(ap_id, address, size, data, guest_mem); - if !matches!(ipi_action, IpiAction::None) { - // APs can send IPIs too (e.g., IPI to BSP for TLB shootdown). - dispatch_ipi( - ipi_action, - &mut dm, - &[], - cancellers, - diag_log, - stats.start_time, - ); + match handle_lapic_mmio_write_fast(my_lapic, address, data) { + LapicWriteFastResult::Handled => { + // Fast path: handled via per-vCPU lock only. + } + LapicWriteFastResult::IpiAction(action) => { + // ICR fast path: dispatch IPI inline (lock-free). + match action { + IpiAction::SendInterrupt { target_apic_id, vector } => { + let idx = target_apic_id as usize; + if idx < all_shared.len() { + all_shared[idx].request_interrupt(vector); + if idx < cancellers.len() { + let _ = cancellers[idx].cancel(); + } + } + } + IpiAction::SendInit { .. } | IpiAction::SendSipi { .. } => { + dispatch_ipi( + action, + &mut devices.lock().unwrap(), + &[], + cancellers, + diag_log, + stats.start_time, + ); + } + IpiAction::None => {} + } + } + LapicWriteFastResult::NotHandled => { + // Slow path: needs DeviceManager for EOI/SVR or non-LAPIC devices. + let mut dm = devices.lock().unwrap(); + let ipi_action = + dm.handle_mmio_write(ap_id, address, size, data, guest_mem); + if !matches!(ipi_action, IpiAction::None) { + dispatch_ipi( + ipi_action, + &mut dm, + &[], + cancellers, + diag_log, + stats.start_time, + ); + } + drop(dm); } - drop(dm); } let _ = vcpu.skip_instruction(); } @@ -1214,23 +1424,22 @@ mod imp { return; } - // Check for pending interrupts (fast path: per-LAPIC only). + // Pull remote interrupts + tick timer before checking. + { + let mut lapic = my_lapic.lock().unwrap(); + lapic.pull_irr(my_shared); + if let Some(vector) = lapic.tick_timer(Instant::now()) { + lapic.accept_interrupt(vector); + } + } + // Lock-free check: inject from per-vCPU LAPIC only. if my_lapic.lock().unwrap().get_highest_injectable().is_some() { - let mut dm = devices.lock().unwrap(); - if dm.irq_chip.has_pending(ap_id) { - let already_pending = vcpu.has_pending_interruption().unwrap_or(false); - if !already_pending { - if let Some(vector) = dm.irq_chip.acknowledge(ap_id) { - let _ = vcpu.inject_interrupt(vector); - dm.irq_chip.notify_injected(ap_id, vector); - stats.window_requested = false; - stats.inject_count += 1; - } - } - stats.halt_with_irq += 1; - stats.halt_count = 0; - continue; + if let Err(e) = try_inject_interrupt_fast(vcpu, my_lapic, &mut stats) { + log::error!("AP{}: HLT inject error: {:?}", ap_id, e); } + stats.halt_with_irq += 1; + stats.halt_count = 0; + continue; } stats.halt_count += 1; @@ -1248,12 +1457,14 @@ mod imp { // Tiered sleep: spin-yield phase to catch imminent interrupts, // then short sleep if no interrupt arrived. - // Fast path: per-LAPIC only (no DeviceManager lock). + // Fast path: pull_irr + per-LAPIC only (no DeviceManager lock). let mut woke_by_irq = false; for i in 0..HLT_SPIN_ITERS { std::thread::yield_now(); if i % 10 == 9 { - if my_lapic.lock().unwrap().get_highest_injectable().is_some() { + let mut lapic = my_lapic.lock().unwrap(); + lapic.pull_irr(my_shared); + if lapic.get_highest_injectable().is_some() { woke_by_irq = true; break; } @@ -1318,6 +1529,7 @@ mod imp { ap_id, num_vcpus, rax as u32, + rcx, default_rax, default_rbx, default_rcx, @@ -1492,10 +1704,15 @@ pub fn stop(_ctx_id: u32) -> super::error::Result<()> { /// /// Injects CPU topology info into leaf 1 and masks Hyper-V leaves. /// This is a pure function (no side effects) for testability. +/// +/// `input_rcx` is the guest's original ECX value (sub-leaf number for leaves +/// 0xB/0x1F/4). This is distinct from `default_rcx` which is WHPX's computed +/// default OUTPUT for ECX. fn handle_cpuid( vcpu_id: u8, num_vcpus: u8, leaf: u32, + input_rcx: u64, default_rax: u64, default_rbx: u64, default_rcx: u64, @@ -1519,6 +1736,76 @@ fn handle_cpuid( default_rdx, ) } + // Leaf 0xB / 0x1F: Extended Topology Enumeration. + // + // WHPX passes through the HOST topology (e.g., 4C/8T on i5-1135G7), + // which confuses the guest kernel when num_vcpus differs from the host. + // The kernel's parse_topology_leaf() loops over sub-leaves calling + // cpuid_subleaf() until type==0 (INVALID). If the host topology reports + // more logical processors than the guest has, the kernel hangs in + // topology parsing (BSP stuck in parse_topology_leaf at 4+ vCPUs). + // + // We override to present a flat topology: 1 thread per core, num_vcpus + // cores, no HT. This matches what the MADT advertises. + 0xB | 0x1F => { + let subleaf = input_rcx & 0xFF; // guest's ECX input = sub-leaf number + match subleaf { + // Sub-leaf 0: SMT level — 1 thread per core (no hyperthreading). + 0 => { + let eax = 0u64; // shift = 0 (1 thread per core) + let ebx = 1u64; // 1 logical processor at this level + let ecx = (1u64 << 8) | subleaf; // type=1 (SMT), level=0 + let edx = vcpu_id as u64; // x2APIC ID + (eax, ebx, ecx, edx) + } + // Sub-leaf 1: Core level — num_vcpus cores total. + 1 => { + // shift = ceil(log2(num_vcpus)): bits to shift right to get + // package-level ID from x2APIC ID. + let shift = if num_vcpus <= 1 { + 0u64 + } else { + (num_vcpus as u64).next_power_of_two().trailing_zeros() as u64 + }; + let eax = shift; + let ebx = num_vcpus as u64; // total logical processors + let ecx = (2u64 << 8) | subleaf; // type=2 (Core), level=1 + let edx = vcpu_id as u64; // x2APIC ID + (eax, ebx, ecx, edx) + } + // Sub-leaf 2+: invalid — terminates the kernel's enumeration loop. + _ => { + let ecx = subleaf; // type=0 (INVALID), level=subleaf + (0, 0, ecx, vcpu_id as u64) + } + } + } + // Leaf 4: Deterministic Cache Parameters. + // + // Host reports max_cores_in_package (EAX[31:26]) and max_threads_sharing + // (EAX[25:14]) based on host topology. Override to match guest vCPU count + // so cache topology is consistent with leaf 0xB. + 4 => { + let cache_type = default_rax & 0x1F; + if cache_type == 0 { + // No more cache levels. + (default_rax, default_rbx, default_rcx, default_rdx) + } else { + let mut eax = default_rax; + // EAX[25:14] = max threads sharing this cache - 1. + // For L1/L2: 0 (not shared). For L3: num_vcpus - 1 (shared). + let max_sharing = if (default_rax & 0x1F) == 3 { + // Unified cache (L3): shared by all vCPUs. + (num_vcpus as u64).saturating_sub(1) + } else { + 0 // L1/L2: per-core, not shared. + }; + eax = (eax & !(0xFFF << 14)) | (max_sharing << 14); + // EAX[31:26] = max cores in package - 1. + eax = (eax & !(0x3F << 26)) | (((num_vcpus as u64).saturating_sub(1)) << 26); + (eax, default_rbx, default_rcx, default_rdx) + } + } // Hyper-V CPUID range: return zeros. 0x40000000..=0x400000FF => (0, 0, 0, 0), _ => (default_rax, default_rbx, default_rcx, default_rdx), @@ -1685,8 +1972,9 @@ mod tests { #[test] fn test_cpuid_leaf1_topology_bsp() { // BSP (vcpu 0) with 2 vCPUs. + // input_rcx=0 (leaf 1 doesn't use sub-leaves). let (rax, rbx, rcx, rdx) = - super::handle_cpuid(0, 2, 1, 0x1234, 0x0000_0000_0000_5678, 0x8000_0001, 0xABCD); + super::handle_cpuid(0, 2, 1, 0, 0x1234, 0x0000_0000_0000_5678, 0x8000_0001, 0xABCD); // EBX[23:16] = num_vcpus = 2, EBX[31:24] = vcpu_id = 0 assert_eq!(rbx & 0x00FF_0000, 0x0002_0000, "EBX[23:16] should be 2"); assert_eq!( @@ -1706,7 +1994,7 @@ mod tests { #[test] fn test_cpuid_leaf1_topology_ap() { // AP (vcpu 3) with 4 vCPUs. - let (_, rbx, _, _) = super::handle_cpuid(3, 4, 1, 0, 0, 0, 0); + let (_, rbx, _, _) = super::handle_cpuid(3, 4, 1, 0, 0, 0, 0, 0); assert_eq!((rbx >> 16) & 0xFF, 4, "EBX[23:16] should be num_vcpus=4"); assert_eq!((rbx >> 24) & 0xFF, 3, "EBX[31:24] should be vcpu_id=3"); } @@ -1716,7 +2004,7 @@ mod tests { // Hyper-V CPUID range should return all zeros. for leaf in [0x40000000u32, 0x40000001, 0x400000FF] { let (rax, rbx, rcx, rdx) = - super::handle_cpuid(0, 1, leaf, 0xDEAD, 0xBEEF, 0xCAFE, 0xF00D); + super::handle_cpuid(0, 1, leaf, 0, 0xDEAD, 0xBEEF, 0xCAFE, 0xF00D); assert_eq!( (rax, rbx, rcx, rdx), (0, 0, 0, 0), @@ -1729,13 +2017,104 @@ mod tests { #[test] fn test_cpuid_passthrough_other_leaves() { // Non-special leaves should pass through defaults unchanged. - let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 2, 0, 0x1111, 0x2222, 0x3333, 0x4444); + let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 2, 0, 0, 0x1111, 0x2222, 0x3333, 0x4444); assert_eq!((rax, rbx, rcx, rdx), (0x1111, 0x2222, 0x3333, 0x4444)); - let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 2, 7, 0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD); + let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 2, 7, 0, 0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD); assert_eq!((rax, rbx, rcx, rdx), (0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD)); } + // --- CPUID leaf 0xB tests --- + + #[test] + fn test_cpuid_leaf_0xb_smt_level() { + // Sub-leaf 0 = SMT: shift=0, np=1, type=1, edx=vcpu_id. + // input_rcx=0 (sub-leaf 0). + let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 4, 0xB, 0, 0, 0, 0, 0); + assert_eq!(rax & 0x1F, 0, "SMT shift should be 0 (no HT)"); + assert_eq!(rbx & 0xFFFF, 1, "SMT should report 1 logical proc"); + assert_eq!((rcx >> 8) & 0xFF, 1, "type should be 1 (SMT)"); + assert_eq!(rdx, 0, "x2APIC ID should be vcpu_id=0"); + + // Same for vcpu 3. + let (_, _, _, rdx) = super::handle_cpuid(3, 4, 0xB, 0, 0, 0, 0, 0); + assert_eq!(rdx, 3, "x2APIC ID should be vcpu_id=3"); + } + + #[test] + fn test_cpuid_leaf_0xb_core_level_4vcpus() { + // Sub-leaf 1 = Core: shift=ceil(log2(4))=2, np=4, type=2. + // input_rcx=1 (sub-leaf 1). + let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 4, 0xB, 1, 0, 0, 0, 0); + assert_eq!(rax & 0x1F, 2, "Core shift should be 2 for 4 vCPUs"); + assert_eq!(rbx & 0xFFFF, 4, "Core should report 4 logical procs"); + assert_eq!((rcx >> 8) & 0xFF, 2, "type should be 2 (Core)"); + assert_eq!(rdx, 0, "x2APIC ID should be vcpu_id=0"); + } + + #[test] + fn test_cpuid_leaf_0xb_core_level_2vcpus() { + let (rax, rbx, _, _) = super::handle_cpuid(1, 2, 0xB, 1, 0, 0, 0, 0); + assert_eq!(rax & 0x1F, 1, "Core shift should be 1 for 2 vCPUs"); + assert_eq!(rbx & 0xFFFF, 2, "Core should report 2 logical procs"); + } + + #[test] + fn test_cpuid_leaf_0xb_core_level_1vcpu() { + let (rax, rbx, _, _) = super::handle_cpuid(0, 1, 0xB, 1, 0, 0, 0, 0); + assert_eq!(rax & 0x1F, 0, "Core shift should be 0 for 1 vCPU"); + assert_eq!(rbx & 0xFFFF, 1, "Core should report 1 logical proc"); + } + + #[test] + fn test_cpuid_leaf_0xb_invalid_subleaf() { + // Sub-leaf 2+ should return type=0 (INVALID) to terminate kernel loop. + let (rax, rbx, rcx, _) = super::handle_cpuid(0, 4, 0xB, 2, 0, 0, 0, 0); + assert_eq!(rax, 0); + assert_eq!(rbx, 0); + assert_eq!((rcx >> 8) & 0xFF, 0, "type should be 0 (INVALID)"); + } + + #[test] + fn test_cpuid_leaf_0x1f_same_as_0xb() { + // Leaf 0x1F should produce identical results to 0xB. + for subleaf in 0..3u64 { + let r_b = super::handle_cpuid(0, 4, 0xB, subleaf, 0, 0, 0, 0); + let r_1f = super::handle_cpuid(0, 4, 0x1F, subleaf, 0, 0, 0, 0); + assert_eq!(r_b, r_1f, "Leaf 0xB and 0x1F should match for sub-leaf {}", subleaf); + } + } + + #[test] + fn test_cpuid_leaf4_cache_topology() { + // Leaf 4 with cache_type != 0 should override max_cores and max_threads. + // Simulate L1 data cache (type=1) with host values. + let host_eax: u64 = 1 // cache_type = 1 (data) + | (7 << 14) // max_threads_sharing = 8 (host value) + | (7 << 26); // max_cores = 8 (host value) + let (rax, _, _, _) = super::handle_cpuid(0, 4, 4, 0, host_eax, 0, 0, 0); + // For L1 (non-unified): max_threads_sharing should be 0 (per-core). + assert_eq!((rax >> 14) & 0xFFF, 0, "L1 max_threads_sharing should be 0"); + // max_cores should be num_vcpus - 1 = 3. + assert_eq!((rax >> 26) & 0x3F, 3, "max_cores should be num_vcpus-1=3"); + + // Simulate L3 unified cache (type=3). + let host_eax: u64 = 3 // cache_type = 3 (unified) + | (15 << 14) // max_threads_sharing = 16 (host) + | (7 << 26); // max_cores = 8 (host) + let (rax, _, _, _) = super::handle_cpuid(0, 4, 4, 2, host_eax, 0, 0, 0); + // L3: max_threads_sharing should be num_vcpus - 1 = 3. + assert_eq!((rax >> 14) & 0xFFF, 3, "L3 max_threads_sharing should be 3"); + assert_eq!((rax >> 26) & 0x3F, 3, "max_cores should be num_vcpus-1=3"); + } + + #[test] + fn test_cpuid_leaf4_no_cache_passthrough() { + // cache_type = 0 means no more caches — pass through unchanged. + let (rax, rbx, rcx, rdx) = super::handle_cpuid(0, 4, 4, 0, 0, 0xBEEF, 0xCAFE, 0xDEAD); + assert_eq!((rax, rbx, rcx, rdx), (0, 0xBEEF, 0xCAFE, 0xDEAD)); + } + // --- handle_msr_read tests --- #[test] @@ -1762,4 +2141,5 @@ mod tests { assert_eq!(super::handle_msr_read(0, 0x174), 0); assert_eq!(super::handle_msr_read(1, 0xC000_0080), 0); } + } From 0f5708785df1b73417462b1f3ada5b75d88a847e Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 2 May 2026 12:37:08 +0800 Subject: [PATCH 27/33] feat(windows): ICR broadcast shorthand + 4-vCPU AP diagnostic logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix 3+ vCPU BSP hang caused by missing ICR destination shorthand handling in parse_icr(). Linux kernel uses "All Excluding Self" (bits 19:18 = 0b11) to broadcast wakeup IPI to all APs during SMP init. Previously only single-target dispatch from ICR High was implemented, so APs 2+ never received the wakeup. Changes: - Add BroadcastInterrupt variant to IpiAction enum - Parse ICR destination shorthand (Self/All-Incl/All-Excl) in parse_icr() - Handle BroadcastInterrupt in BSP and AP fast paths (lock-free) - Handle BroadcastInterrupt in dispatch_ipi() slow path - Add vcpu_running flags to guard timer cancel (only cancel running vCPUs) - Add periodic AP progress logging (RIP + stats every 500 Cancelled exits) - Add cancelled_count and cpuid_count to VcpuStats - Add AP exit summary on shutdown/max_exits Tested: Win11 (T14) and Win10 (MBP 2014) — vm-bench 8/8, net-test 8/8 at 4 vCPUs. Co-Authored-By: Claude Opus 4.6 --- src/vmm/src/windows/devices/lapic.rs | 53 ++++++++++ src/vmm/src/windows/runner.rs | 144 +++++++++++++++++++++++---- 2 files changed, 179 insertions(+), 18 deletions(-) diff --git a/src/vmm/src/windows/devices/lapic.rs b/src/vmm/src/windows/devices/lapic.rs index 8ec0244e7..235dd6bcc 100644 --- a/src/vmm/src/windows/devices/lapic.rs +++ b/src/vmm/src/windows/devices/lapic.rs @@ -65,6 +65,8 @@ pub enum IpiAction { None, /// Fixed delivery: send interrupt vector to target LAPIC. SendInterrupt { target_apic_id: u8, vector: u8 }, + /// Broadcast fixed interrupt to all vCPUs except the sender. + BroadcastInterrupt { source_apic_id: u8, vector: u8 }, /// INIT delivery: reset target processor. SendInit { target_apic_id: u8 }, /// Startup IPI (SIPI): start target processor at vector * 0x1000. @@ -388,11 +390,62 @@ impl LocalApic { } /// Parse the ICR low/high registers to produce an IPI action. + /// + /// ICR Low bits: + /// - [7:0] Vector + /// - [10:8] Delivery mode (000=Fixed, 101=INIT, 110=SIPI) + /// - [11] Destination mode (0=physical, 1=logical) + /// - [17:12] Reserved/status + /// - [19:18] Destination shorthand (00=none, 01=self, 10=all-incl-self, 11=all-excl-self) fn parse_icr(&self) -> IpiAction { let vector = (self.icr_low & 0xFF) as u8; let delivery_mode = (self.icr_low >> 8) & 0x7; + let dest_shorthand = (self.icr_low >> 18) & 0x3; let dest_apic_id = ((self.icr_high >> 24) & 0xFF) as u8; + // Handle destination shorthand first. + match dest_shorthand { + 0b01 => { + // Self: send to own LAPIC (used for self-IPI). + log::debug!( + "LAPIC {} ICR: Self IPI vector={:#X}", + self.id, + vector + ); + return IpiAction::SendInterrupt { + target_apic_id: self.id, + vector, + }; + } + 0b10 | 0b11 => { + // All Including Self (0b10) or All Excluding Self (0b11). + // For fixed delivery, broadcast to all other vCPUs. + if delivery_mode == 0b000 { + log::debug!( + "LAPIC {} ICR: Broadcast vector={:#X} (shorthand={})", + self.id, + vector, + if dest_shorthand == 0b10 { "all-incl" } else { "all-excl" } + ); + return IpiAction::BroadcastInterrupt { + source_apic_id: self.id, + vector, + }; + } + // Non-fixed broadcast (e.g., INIT to all) — fallthrough to per-target. + // For now, treat as no-op (Linux doesn't broadcast INIT/SIPI with shorthand). + log::debug!( + "LAPIC {} ICR: Broadcast delivery_mode={} (unsupported, ignored)", + self.id, + delivery_mode + ); + return IpiAction::None; + } + _ => { + // 0b00: No shorthand — use destination field (normal path). + } + } + match delivery_mode { 0b000 => { // Fixed delivery. diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 682c6767b..586a9cb72 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -302,6 +302,14 @@ mod imp { // need to be able to wake any vCPU. let cancellers: Vec = vcpus.iter().map(|v| v.canceller()).collect(); + // Track which vCPUs have actually entered WHvRunVirtualProcessor. + // The timer thread only cancels running vCPUs — cancelling a VP that + // hasn't been run yet has undefined behavior on WHPX and may corrupt + // partition state (suspected cause of 4-vCPU BSP hang). + let vcpu_running: Vec> = (0..num_vcpus as usize) + .map(|_| Arc::new(AtomicBool::new(false))) + .collect(); + // Store BSP canceller so stop() can wake the VM. *canceller_slot.lock().unwrap() = Some(cancellers[0].clone()); @@ -343,17 +351,25 @@ mod imp { let diag_ref = &diag_log; let lapic_refs_ref = &lapic_refs; let shared_states_ref = &shared_states; + let vcpu_running_ref = &vcpu_running; - // Spawn timer thread — cancels ALL vCPUs every 1ms. + // Spawn timer thread — cancels only RUNNING vCPUs every 1ms. + // Previously this cancelled ALL vCPUs including APs that hadn't + // entered WHvRunVirtualProcessor yet. At 4 vCPUs, calling + // WHvCancelRunVirtualProcessor on 3 non-running APs every 1ms + // may corrupt WHPX partition state, causing BSP hang. let timer_flag = run_config.running.clone(); let timer_cancellers: Vec = cancellers.clone(); let timer_shutdown = shutdown.clone(); + let timer_vcpu_running: Vec> = vcpu_running.clone(); let timer_thread = std::thread::spawn(move || { while timer_flag.load(Ordering::Relaxed) && !timer_shutdown.load(Ordering::Relaxed) { std::thread::sleep(Duration::from_millis(1)); - for c in &timer_cancellers { - let _ = c.cancel(); + for (i, c) in timer_cancellers.iter().enumerate() { + if timer_vcpu_running[i].load(Ordering::Relaxed) { + let _ = c.cancel(); + } } } }); @@ -364,6 +380,7 @@ mod imp { let vcpu = &vcpus[ap_idx]; let my_lapic = &lapic_refs_ref[ap_idx]; let my_shared = &shared_states_ref[ap_idx]; + let my_running = &vcpu_running_ref[ap_idx]; s.spawn(move || { run_ap_loop( ap_idx as u8, @@ -380,11 +397,14 @@ mod imp { my_lapic, my_shared, shared_states_ref, + my_running, ); }); } // BSP runs on the current thread. + // Mark BSP as running before entering the loop so timer can cancel it. + vcpu_running_ref[0].store(true, Ordering::Release); let bsp_vcpu = &vcpus[0]; let bsp_code = run_bsp_loop( bsp_vcpu, @@ -442,6 +462,8 @@ mod imp { io_out_count: u64, io_in_count: u64, inject_count: u64, + cancelled_count: u64, + cpuid_count: u64, last_progress: Instant, start_time: Instant, window_requested: bool, @@ -464,6 +486,8 @@ mod imp { io_out_count: 0, io_in_count: 0, inject_count: 0, + cancelled_count: 0, + cpuid_count: 0, last_progress: now, start_time: now, window_requested: false, @@ -636,6 +660,25 @@ mod imp { target_apic_id, ); } + IpiAction::BroadcastInterrupt { + source_apic_id, + vector, + } => { + // Send to all vCPUs except the source. + let num = cancellers.len(); + for idx in 0..num { + if idx as u8 != source_apic_id { + devices.irq_chip.deliver_ipi_interrupt(idx as u8, vector); + let _ = cancellers[idx].cancel(); + } + } + ipi_diag!( + "IPI: broadcast vector={:#X} from vCPU{} → all-excl-self ({} targets)", + vector, + source_apic_id, + num - 1, + ); + } } } @@ -793,6 +836,22 @@ mod imp { }; stats.exit_count += 1; + // Periodic progress logging for early-boot diagnostics. + // Written to diag file (not log::info!) because shim's tracing + // subscriber doesn't capture log crate output from VMM. + if stats.exit_count % 50_000 == 0 { + let console_len = devices::get_console_output(ctx_id) + .map(|b| b.len()) + .unwrap_or(0); + diag!( + "BSP progress: exit={} serial={} mmio={} io_out={} io_in={} inj={} hlt={} console={}B elapsed={:.1}s", + stats.exit_count, stats.serial_out_count, stats.mmio_count, + stats.io_out_count, stats.io_in_count, stats.inject_count, + stats.total_halt_exits, console_len, + stats.start_time.elapsed().as_secs_f64(), + ); + } + match exit { VcpuExit::IoOut { port, size, data } => { stats.halt_count = 0; @@ -830,12 +889,12 @@ mod imp { if address == stats.last_mmio_read_addr { stats.consecutive_mmio_reads += 1; if stats.consecutive_mmio_reads == 10_000 { - log::warn!( + diag!( "BSP: tight MMIO read loop: addr={:#X} count={} exit={}", address, stats.consecutive_mmio_reads, stats.exit_count ); if let Ok(regs) = vcpu.get_registers() { - log::warn!("BSP: RIP={:#X} at tight MMIO loop", regs.rip); + diag!("BSP: RIP={:#X} at tight MMIO loop", regs.rip); } } } else { @@ -876,6 +935,17 @@ mod imp { } } } + IpiAction::BroadcastInterrupt { source_apic_id, vector } => { + // Broadcast to all vCPUs except source (lock-free). + for idx in 0..all_shared.len() { + if idx as u8 != source_apic_id { + all_shared[idx].request_interrupt(vector); + if idx < cancellers.len() { + let _ = cancellers[idx].cancel(); + } + } + } + } IpiAction::SendInit { .. } | IpiAction::SendSipi { .. } => { // INIT/SIPI use existing condvar mechanism. dispatch_ipi( @@ -979,8 +1049,7 @@ mod imp { .map(|b| b.len()) .unwrap_or(0); let if_flag = vcpu.interrupts_enabled().unwrap_or(false); - log::warn!( - target: "whpx::diag", + diag!( "BSP HLT stuck: consecutive={} total_halt={} halt_w_irq={} \ exits={} RIP={:#X} IF={} console={}B mmio={} vcpus={}", stats.halt_count, stats.total_halt_exits, stats.halt_with_irq, @@ -1049,20 +1118,14 @@ mod imp { "pending" }; drop(dm); - log::info!( - target: "whpx::diag", - "vCPU0 @ {:.1}s: exits={} RIP={:#X} console={}B mmio={} halt={}/{} inj={} blk_comp={} mode={}/{}", + diag!( + "vCPU0 @ {:.1}s: exits={} RIP={:#X} console={}B mmio={} halt={}/{} inj={} blk_comp={} mode={}/{} io_out={} serial={} blk_qn={} vcpus={}", stats.start_time.elapsed().as_secs_f64(), stats.exit_count, regs.rip, console_len, stats.mmio_count, stats.halt_count, stats.total_halt_exits, stats.inject_count, bc, if apic_mode { "apic" } else { "pic" }, blk_mode, - ); - log::debug!( - target: "whpx::diag", - "vCPU0 detail: io_out={} serial={} blk_qn={} halt_w_irq={} vcpus={}", - stats.io_out_count, stats.serial_out_count, qn, - stats.halt_with_irq, num_vcpus, + stats.io_out_count, stats.serial_out_count, qn, num_vcpus, ); } } @@ -1102,6 +1165,7 @@ mod imp { default_rdx, } => { stats.halt_count = 0; + stats.cpuid_count += 1; let (out_rax, out_rbx, out_rcx, out_rdx) = super::handle_cpuid( 0, num_vcpus, @@ -1175,6 +1239,7 @@ mod imp { my_lapic: &Arc>, my_shared: &Arc, all_shared: &[Arc], + vcpu_running_flag: &AtomicBool, ) { macro_rules! diag { ($($arg:tt)*) => { @@ -1246,11 +1311,19 @@ mod imp { } diag!("AP{}: initial regs set, entering run loop", ap_id); + // Mark AP as running so the timer thread can cancel it. + // This MUST happen after SIPI wake + register setup, just before first vcpu.run(). + vcpu_running_flag.store(true, Ordering::Release); + let mut stats = VcpuStats::new(); loop { if shutdown.load(Ordering::Relaxed) || !run_config.should_run() { - log::info!("AP{}: shutdown signal received", ap_id); + diag!( + "AP{}: EXIT (shutdown) exits={} cancelled={} halt={} cpuid={} mmio={}", + ap_id, stats.exit_count, stats.cancelled_count, + stats.total_halt_exits, stats.cpuid_count, stats.mmio_count, + ); return; } @@ -1380,6 +1453,17 @@ mod imp { } } } + IpiAction::BroadcastInterrupt { source_apic_id, vector } => { + // Broadcast to all vCPUs except source (lock-free). + for idx in 0..all_shared.len() { + if idx as u8 != source_apic_id { + all_shared[idx].request_interrupt(vector); + if idx < cancellers.len() { + let _ = cancellers[idx].cancel(); + } + } + } + } IpiAction::SendInit { .. } | IpiAction::SendSipi { .. } => { dispatch_ipi( action, @@ -1483,6 +1567,25 @@ mod imp { log::info!("AP{}: stop requested on Cancelled", ap_id); return; } + stats.cancelled_count += 1; + // Periodic AP progress logging (every 500 Cancelled exits ≈ every 500ms). + if stats.cancelled_count % 500 == 0 { + let rip = vcpu + .get_registers() + .map(|r| r.rip) + .unwrap_or(0xDEAD); + diag!( + "AP{} @ {:.1}s: exits={} cancelled={} halt={} cpuid={} mmio={} RIP={:#X}", + ap_id, + stats.start_time.elapsed().as_secs_f64(), + stats.exit_count, + stats.cancelled_count, + stats.total_halt_exits, + stats.cpuid_count, + stats.mmio_count, + rip, + ); + } } VcpuExit::MsrAccess { msr_number, @@ -1525,6 +1628,7 @@ mod imp { default_rdx, } => { stats.halt_count = 0; + stats.cpuid_count += 1; let (out_rax, out_rbx, out_rcx, out_rdx) = super::handle_cpuid( ap_id, num_vcpus, @@ -1573,7 +1677,11 @@ mod imp { } if stats.exit_count >= MAX_EXITS { - log::warn!("AP{}: reached {} exit limit", ap_id, MAX_EXITS); + diag!( + "AP{}: EXIT (max_exits) exits={} cancelled={} halt={} cpuid={} mmio={}", + ap_id, stats.exit_count, stats.cancelled_count, + stats.total_halt_exits, stats.cpuid_count, stats.mmio_count, + ); return; } } From 2bbabdf56ee6cbeb164086caadb147c4eab0f125 Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 2 May 2026 18:05:40 +0800 Subject: [PATCH 28/33] feat(windows): replace TCP with AF_UNIX sockets in VMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - vsock device: TcpListener/TcpStream → UnixListener/UnixStream (uds_windows on Windows, std::os::unix::net on Unix) - listen_on() now takes socket_path instead of host_port - connect_to() uses UnixStream::connect instead of TcpStream::connect - Remove set_nodelay() (not applicable to Unix sockets) - Rename poll_tcp_listeners/poll_tcp_streams → poll_listeners/poll_streams - manager.rs: simplify vsock config to pass socket paths directly - net.rs: replace TcpTransport with UdsTransport (Windows) - All 15+ vsock tests rewritten for Unix sockets Co-Authored-By: Claude Opus 4.6 --- src/vmm/Cargo.toml | 2 + src/vmm/src/windows/devices/manager.rs | 57 +-- src/vmm/src/windows/devices/virtio/net.rs | 21 +- .../src/windows/devices/virtio/vsock/mod.rs | 437 +++++++----------- 4 files changed, 194 insertions(+), 323 deletions(-) diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 131bb2f0f..3ec0b5be8 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -75,8 +75,10 @@ windows-sys = { version = "0.61", features = [ ] } zerocopy = { version = "0.8", features = ["derive"] } rand = "0.9" +uds_windows = "1.2" [target.'cfg(target_os = "windows")'.dev-dependencies] env_logger = "0.11" [dev-dependencies] +tempfile = "3" diff --git a/src/vmm/src/windows/devices/manager.rs b/src/vmm/src/windows/devices/manager.rs index ced419eef..9fe05f990 100644 --- a/src/vmm/src/windows/devices/manager.rs +++ b/src/vmm/src/windows/devices/manager.rs @@ -83,7 +83,6 @@ const PM1A_EVT_BLK: u16 = 0x600; const PM1A_CNT_BLK: u16 = 0x604; /// Default vsock listen ports (BoxLite: 2695=gRPC, 2696=ready signal). -const DEFAULT_VSOCK_PORTS: &[u32] = &[2695, 2696]; /// Convert a value to BCD (Binary-Coded Decimal). /// E.g. 26 → 0x26, 59 → 0x59. @@ -293,40 +292,14 @@ impl DeviceManager { // Virtio-vsock (slot 1) — always present. let mut vsock_backend = VirtioVsock::new(GUEST_CID); - // Configure ports: listen=true creates TCP listener (host→guest), + // Configure ports: listen=true creates Unix socket listener (host→guest), // listen=false registers outbound target (guest→host). - if ctx.vsock_ports.is_empty() { - for &port in DEFAULT_VSOCK_PORTS { - let _ = vsock_backend.listen(port); - } - } else { - for vp in &ctx.vsock_ports { - // Resolve the host TCP address from either: - // 1. Explicit host_tcp_port (set by boot_kernel CLI) - // 2. host_path as "host:port" string (set by krun_add_vsock_port2 API) - // 3. Fallback: vsock port number as TCP port - let host_addr = if let Some(tcp_port) = vp.host_tcp_port { - format!("127.0.0.1:{}", tcp_port) - } else { - let path_str = vp.host_path.to_string_lossy(); - if path_str.contains(':') { - // host_path is "host:port" format (e.g., "127.0.0.1:55008") - path_str.to_string() - } else { - format!("127.0.0.1:{}", vp.port) - } - }; - if vp.listen { - // Parse port from host_addr for listen_on - let port = host_addr - .rsplit(':') - .next() - .and_then(|s| s.parse::().ok()) - .unwrap_or(vp.port as u16); - let _ = vsock_backend.listen_on(vp.port, port); - } else { - vsock_backend.connect_to(vp.port, host_addr); - } + for vp in &ctx.vsock_ports { + let socket_path = vp.host_path.to_string_lossy(); + if vp.listen { + let _ = vsock_backend.listen_on(vp.port, &socket_path); + } else { + vsock_backend.connect_to(vp.port, socket_path.to_string()); } } let virtio_vsock = VirtioMmioDevice::new(vsock_backend); @@ -719,8 +692,7 @@ impl DeviceManager { /// Connect to the userspace networking proxy and return a transport. /// - /// On Unix: connects via Unix stream socket. - /// On Windows: parses "host:port" and connects via TCP. + /// Connects via Unix stream socket on all platforms. fn connect_net_transport( socket_path: &Path, ) -> Result>> { @@ -738,13 +710,16 @@ impl DeviceManager { })?; Ok(Some(Box::new(transport))) } - #[cfg(not(unix))] + #[cfg(windows)] { - let addr = socket_path.to_string_lossy(); - let stream = std::net::TcpStream::connect(addr.as_ref()).map_err(|e| { - WkrunError::Device(format!("failed to connect to net proxy '{}': {}", addr, e)) + let stream = uds_windows::UnixStream::connect(socket_path).map_err(|e| { + WkrunError::Device(format!( + "failed to connect to net socket '{}': {}", + socket_path.display(), + e + )) })?; - let transport = super::virtio::net::TcpTransport::new(stream).map_err(|e| { + let transport = super::virtio::net::UdsTransport::new(stream).map_err(|e| { WkrunError::Device(format!("failed to configure net socket: {}", e)) })?; Ok(Some(Box::new(transport))) diff --git a/src/vmm/src/windows/devices/virtio/net.rs b/src/vmm/src/windows/devices/virtio/net.rs index 950aa2030..7c78e4f8c 100644 --- a/src/vmm/src/windows/devices/virtio/net.rs +++ b/src/vmm/src/windows/devices/virtio/net.rs @@ -44,7 +44,7 @@ const VIRTIO_NET_S_LINK_UP: u16 = 1; /// Transport trait for pluggable networking backends. /// -/// Both Unix socket and TCP transports use the passt/gvproxy wire +/// Unix socket transports use the passt/gvproxy wire /// protocol: each frame is `[4-byte big-endian length][frame bytes]`. pub trait NetTransport: Send { /// Try to receive a complete Ethernet frame. Returns `None` if no @@ -106,24 +106,27 @@ impl NetTransport for UnixStreamTransport { } } -/// TCP transport (cross-platform, used on Windows). -pub struct TcpTransport { - stream: std::net::TcpStream, +/// Unix domain socket transport (Windows, via uds_windows crate). +#[cfg(windows)] +pub struct UdsTransport { + stream: uds_windows::UnixStream, state: RecvState, } -impl TcpTransport { - /// Wrap a non-blocking TCP stream. - pub fn new(stream: std::net::TcpStream) -> io::Result { +#[cfg(windows)] +impl UdsTransport { + /// Wrap a non-blocking Unix domain socket stream. + pub fn new(stream: uds_windows::UnixStream) -> io::Result { stream.set_nonblocking(true)?; - Ok(TcpTransport { + Ok(UdsTransport { stream, state: RecvState::default(), }) } } -impl NetTransport for TcpTransport { +#[cfg(windows)] +impl NetTransport for UdsTransport { fn recv_frame(&mut self) -> Option> { recv_frame_from(&mut self.stream, &mut self.state) } diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index 590b99df5..ed8403dbf 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -1,7 +1,7 @@ //! Virtio-vsock device backend (virtio spec v1.2 Section 5.10). //! -//! Provides a socket transport between guest (AF_VSOCK) and host (TCP). -//! The host side uses non-blocking TCP listeners on localhost for +//! Provides a socket transport between guest (AF_VSOCK) and host (Unix sockets). +//! The host side uses non-blocking Unix domain socket listeners for //! cross-platform compatibility (Windows + macOS + Linux). //! //! Queue layout: @@ -14,7 +14,11 @@ pub mod packet; use std::collections::HashMap; use std::io::{self, Read, Write}; -use std::net::{TcpListener, TcpStream}; + +#[cfg(unix)] +use std::os::unix::net::{UnixListener, UnixStream}; +#[cfg(windows)] +use uds_windows::{UnixListener, UnixStream}; use super::mmio::VirtioDeviceBackend; use super::queue::{GuestMemoryAccessor, Virtqueue}; @@ -44,22 +48,22 @@ type ConnKey = (u32, u32); /// Starting ephemeral port for host-initiated vsock connections. const EPHEMERAL_PORT_START: u32 = 49152; -/// Virtio-vsock device with TCP host-side bridge. +/// Virtio-vsock device with Unix socket host-side bridge. pub struct VirtioVsock { /// Guest CID (typically 3 for the first guest). guest_cid: u64, /// Active connections keyed by (guest_port, host_port). connections: HashMap, - /// TCP listeners on the host side, keyed by vsock port. - /// Used for host-initiated connections (host TCP → guest vsock). - listeners: HashMap, - /// Outbound TCP targets keyed by vsock port. - /// Used for guest-initiated connections (guest vsock → host TCP). + /// Unix socket listeners on the host side, keyed by vsock port. + /// Used for host-initiated connections (host UDS → guest vsock). + listeners: HashMap, + /// Outbound Unix socket targets keyed by vsock port. + /// Used for guest-initiated connections (guest vsock → host UDS). /// When the guest connects to a port in this map, the device makes - /// an outbound TCP connection to the specified address. + /// an outbound Unix socket connection to the specified path. connect_targets: HashMap, - /// Accepted TCP streams, keyed by (guest_port, host_port). - streams: HashMap, + /// Accepted Unix streams, keyed by (guest_port, host_port). + streams: HashMap, /// Pending response/control packets to inject into the RX queue. rx_pending: Vec<(VsockHeader, Vec)>, /// Next ephemeral port for host-initiated connections. @@ -80,34 +84,29 @@ impl VirtioVsock { } } - /// Register a TCP listener on `127.0.0.1:port` for the given host port. + /// Register a Unix socket listener on `socket_path` for the given vsock port. /// /// When a guest connects to this port via AF_VSOCK, the connection - /// is bridged to an accepted TCP client on this listener. - pub fn listen(&mut self, port: u32) -> io::Result<()> { - self.listen_on(port, port as u16) - } - - /// Register a TCP listener on `127.0.0.1:host_port` for the given vsock port. + /// is bridged to an accepted Unix socket client on this listener. /// - /// The guest connects to `vsock_port` via AF_VSOCK, and the bridge listens - /// on `host_port` on the host side. This allows multiple VMs to use - /// distinct host ports for the same guest vsock port number. - pub fn listen_on(&mut self, vsock_port: u32, host_port: u16) -> io::Result<()> { - let listener = TcpListener::bind(format!("127.0.0.1:{}", host_port))?; + /// Removes any stale socket file before binding. + pub fn listen_on(&mut self, vsock_port: u32, socket_path: &str) -> io::Result<()> { + // Remove stale socket file if it exists. + let _ = std::fs::remove_file(socket_path); + let listener = UnixListener::bind(socket_path)?; listener.set_nonblocking(true)?; self.listeners.insert(vsock_port, listener); Ok(()) } - /// Register an outbound TCP target for guest-initiated connections. + /// Register an outbound Unix socket target for guest-initiated connections. /// /// When the guest connects to `vsock_port`, the device makes an outbound - /// TCP connection to `host_addr` instead of accepting from a listener. + /// Unix socket connection to `host_path` instead of accepting from a listener. /// Used for notification channels where the guest initiates the connection /// and the host is already listening. - pub fn connect_to(&mut self, vsock_port: u32, host_addr: String) { - self.connect_targets.insert(vsock_port, host_addr); + pub fn connect_to(&mut self, vsock_port: u32, host_path: String) { + self.connect_targets.insert(vsock_port, host_path); } /// Get the guest CID. @@ -199,7 +198,7 @@ impl VirtioVsock { if let Some(conn) = self.connections.get_mut(&key) { let (resp_hdr, fwd_data) = conn.dispatch(hdr, payload); - // Forward data to host TCP socket. + // Forward data to host Unix socket. // Use retry loop for non-blocking sockets (write_all fails on WouldBlock). if let Some(data) = fwd_data { if let Some(stream) = self.streams.get_mut(&key) { @@ -257,22 +256,19 @@ impl VirtioVsock { fn handle_connect_request(&mut self, hdr: &VsockHeader) { let key = (hdr.src_port, hdr.dst_port); - // Try outbound connection first (guest-initiated → host TCP target). - if let Some(addr) = self.connect_targets.get(&hdr.dst_port).cloned() { - log::debug!("guest-initiated CONNECT: port={} → {}", hdr.dst_port, addr); - let stream = match TcpStream::connect(&addr) { + // Try outbound connection first (guest-initiated → host UDS target). + if let Some(path) = self.connect_targets.get(&hdr.dst_port).cloned() { + log::debug!("guest-initiated CONNECT: port={} → {}", hdr.dst_port, path); + let stream = match UnixStream::connect(&path) { Ok(stream) => { if let Err(e) = stream.set_nonblocking(true) { log::warn!("guest-connect: set_nonblocking failed: {}", e); } - if let Err(e) = stream.set_nodelay(true) { - log::warn!("guest-connect: set_nodelay failed: {}", e); - } - log::debug!("TCP connect OK to {}", addr); + log::debug!("UDS connect OK to {}", path); stream } Err(ref e) => { - log::warn!("TCP connect FAILED to {}: {}", addr, e); + log::warn!("UDS connect FAILED to {}: {}", path, e); let rst = VsockHeader::new_rst( VSOCK_CID_HOST, hdr.dst_port, @@ -311,7 +307,7 @@ impl VirtioVsock { return; } - // Try to accept a pending TCP connection on this listener. + // Try to accept a pending Unix socket connection on this listener. let stream = if let Some(listener) = self.listeners.get(&hdr.dst_port) { match listener.accept() { Ok((stream, _addr)) => { @@ -319,8 +315,8 @@ impl VirtioVsock { Some(stream) } Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { - // No pending TCP connection yet — still accept the vsock connection. - // Data will buffer until a TCP client connects. + // No pending connection yet — still accept the vsock connection. + // Data will buffer until a client connects. None } Err(_) => { @@ -365,26 +361,23 @@ impl VirtioVsock { port } - /// Poll TCP listeners for pending connections and initiate vsock handshakes. + /// Poll Unix socket listeners for pending connections and initiate vsock handshakes. /// - /// When a host TCP client connects to a listener, this method: - /// 1. Accepts the TCP connection + /// When a host client connects to a listener, this method: + /// 1. Accepts the Unix socket connection /// 2. Allocates an ephemeral host port for the vsock side /// 3. Creates a VsockConnection in Connecting state /// 4. Generates a REQUEST packet to send to the guest via RX queue - /// 5. Stores the TCP stream (data is NOT read until Connected) - fn poll_tcp_listeners(&mut self) { + /// 5. Stores the Unix stream (data is NOT read until Connected) + fn poll_listeners(&mut self) { let vsock_ports: Vec = self.listeners.keys().copied().collect(); for vsock_port in vsock_ports { let stream = if let Some(listener) = self.listeners.get(&vsock_port) { match listener.accept() { - Ok((stream, addr)) => { + Ok((stream, _addr)) => { if let Err(e) = stream.set_nonblocking(true) { - log::warn!("vsock set_nonblocking failed: {} (addr={:?})", e, addr); - } - if let Err(e) = stream.set_nodelay(true) { - log::warn!("vsock set_nodelay failed: {} (addr={:?})", e, addr); + log::warn!("vsock set_nonblocking failed: {}", e); } stream } @@ -414,14 +407,14 @@ impl VirtioVsock { } } - /// Poll TCP streams for incoming data and queue it for RX injection. - fn poll_tcp_streams(&mut self) { + /// Poll Unix streams for incoming data and queue it for RX injection. + fn poll_streams(&mut self) { // Collect keys first to avoid borrow issues. let keys: Vec = self.streams.keys().copied().collect(); for key in keys { // Skip streams whose vsock connection is still handshaking. - // TCP data stays in the kernel receive buffer until Connected. + // Data stays in the kernel receive buffer until Connected. if let Some(conn) = self.connections.get(&key) { if conn.state() != ConnState::Connected { continue; @@ -432,8 +425,8 @@ impl VirtioVsock { let data = if let Some(stream) = self.streams.get_mut(&key) { match stream.read(&mut buf) { Ok(0) => { - // TCP connection closed. Send SHUTDOWN to guest. - log::debug!("TCP EOF, key=({},{})", key.0, key.1); + // Unix socket connection closed. Send SHUTDOWN to guest. + log::debug!("UDS EOF, key=({},{})", key.0, key.1); if let Some(conn) = self.connections.get(&key) { let hdr = VsockHeader::new_shutdown( conn.local_cid, @@ -453,14 +446,14 @@ impl VirtioVsock { continue; } Ok(n) => { - log::trace!("TCP read {} bytes, key=({},{})", n, key.0, key.1); + log::trace!("UDS read {} bytes, key=({},{})", n, key.0, key.1); Some(buf[..n].to_vec()) } Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => None, Err(ref e) => { - // I/O error on TCP stream. RST the vsock connection. + // I/O error on Unix stream. RST the vsock connection. log::warn!( - "vsock TCP read error: {} (raw={:?}), key=({},{})", + "vsock UDS read error: {} (raw={:?}), key=({},{})", e, e.raw_os_error(), key.0, @@ -617,16 +610,16 @@ impl VirtioDeviceBackend for VirtioVsock { } fn poll(&mut self, queues: &mut [Virtqueue], mem: &dyn GuestMemoryAccessor) -> bool { - // Accept new TCP connections and initiate vsock handshakes. - self.poll_tcp_listeners(); + // Accept new Unix socket connections and initiate vsock handshakes. + self.poll_listeners(); - // Poll TCP streams for incoming data. + // Poll Unix streams for incoming data. let pending_before = self.rx_pending.len(); - self.poll_tcp_streams(); + self.poll_streams(); let new_data = self.rx_pending.len() - pending_before; if new_data > 0 { log::trace!( - "vsock poll: TCP produced {} new packets, total pending={}", + "vsock poll: UDS produced {} new packets, total pending={}", new_data, self.rx_pending.len() ); @@ -832,22 +825,19 @@ mod tests { assert_eq!(dev.rx_pending[0].0.op, VSOCK_OP_RST); } + /// Create a temporary socket path for tests. + fn temp_socket_path(name: &str) -> (std::path::PathBuf, tempfile::TempDir) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join(name); + (path, dir) + } + #[test] fn test_tx_request_with_listener_sends_response() { let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); // Port 0 = OS-assigned. - // Get the actual port. - let port = dev - .listeners - .values() - .next() - .unwrap() - .local_addr() - .unwrap() - .port() as u32; - // Re-register with correct port. - let listener = dev.listeners.remove(&0).unwrap(); - dev.listeners.insert(port, listener); + let (sock_path, _dir) = temp_socket_path("vsock-test.sock"); + let vsock_port = 2695u32; + dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); let mem = MockMem::new(0x10000); let mut tx_queue = setup_queue(128); @@ -856,7 +846,7 @@ mod tests { src_cid: 3, dst_cid: 2, src_port: 5000, - dst_port: port, + dst_port: vsock_port, len: 0, type_: 1, op: VSOCK_OP_REQUEST, @@ -1106,7 +1096,7 @@ mod tests { let mut dev = VirtioVsock::new(3); let mem = MockMem::new(0x10000); - // Manually create connection to test data flow without TCP. + // Manually create connection to test data flow without sockets. let req_hdr = VsockHeader { src_cid: 3, dst_cid: 2, @@ -1261,79 +1251,37 @@ mod tests { assert_eq!(hdr.op, packet::VSOCK_OP_CREDIT_UPDATE); } - // --- Listen and connect with TCP --- + // --- Listen and connect with Unix sockets --- #[test] fn test_listen_creates_listener() { let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); // Port 0 = OS-assigned. + let (sock_path, _dir) = temp_socket_path("listen-test.sock"); + dev.listen_on(2695, sock_path.to_str().unwrap()).unwrap(); assert_eq!(dev.listeners.len(), 1); } #[test] - fn test_listen_on_different_host_port() { + fn test_listen_on_two_vsock_ports() { let mut dev = VirtioVsock::new(3); - // vsock port 2695, host TCP port 0 (OS-assigned) - dev.listen_on(2695, 0).unwrap(); - assert_eq!(dev.listeners.len(), 1); - // Listener is keyed by vsock port, not host port + let (path1, _dir1) = temp_socket_path("listen1.sock"); + let (path2, _dir2) = temp_socket_path("listen2.sock"); + dev.listen_on(2695, path1.to_str().unwrap()).unwrap(); + dev.listen_on(2696, path2.to_str().unwrap()).unwrap(); + assert_eq!(dev.listeners.len(), 2); assert!(dev.listeners.contains_key(&2695)); - // The actual TCP port may differ from the vsock port - let actual_port = dev - .listeners - .get(&2695) - .unwrap() - .local_addr() - .unwrap() - .port(); - assert_ne!(actual_port, 2695); // OS assigned a different port + assert!(dev.listeners.contains_key(&2696)); } #[test] - fn test_listen_on_two_vsock_ports_different_host_ports() { + fn test_listen_with_uds_connect() { let mut dev = VirtioVsock::new(3); - dev.listen_on(2695, 0).unwrap(); - dev.listen_on(2696, 0).unwrap(); - assert_eq!(dev.listeners.len(), 2); - // Each vsock port has its own listener - let port1 = dev - .listeners - .get(&2695) - .unwrap() - .local_addr() - .unwrap() - .port(); - let port2 = dev - .listeners - .get(&2696) - .unwrap() - .local_addr() - .unwrap() - .port(); - assert_ne!(port1, port2); - } - - #[test] - fn test_listen_with_tcp_connect() { - use std::net::TcpStream; + let (sock_path, _dir) = temp_socket_path("listen-connect.sock"); + let vsock_port = 2695u32; + dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); - let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); - let port = dev - .listeners - .values() - .next() - .unwrap() - .local_addr() - .unwrap() - .port() as u32; - // Re-register with actual port. - let listener = dev.listeners.remove(&0).unwrap(); - dev.listeners.insert(port, listener); - - // Connect a TCP client before the guest sends REQUEST. - let _client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); - // Brief delay for the accept backlog to propagate. + // Connect a UDS client before the guest sends REQUEST. + let _client = UnixStream::connect(&sock_path).unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); let mem = MockMem::new(0x10000); @@ -1343,7 +1291,7 @@ mod tests { src_cid: 3, dst_cid: 2, src_port: 5000, - dst_port: port, + dst_port: vsock_port, len: 0, type_: 1, op: VSOCK_OP_REQUEST, @@ -1357,35 +1305,26 @@ mod tests { dev.process_tx(&mut tx_queue, &mem); - // Should have RESPONSE and a TCP stream. + // Should have RESPONSE and a stream. assert_eq!(dev.rx_pending.len(), 1); assert_eq!(dev.rx_pending[0].0.op, packet::VSOCK_OP_RESPONSE); assert_eq!(dev.connection_count(), 1); assert_eq!(dev.streams.len(), 1); } - // --- Poll with TCP data --- + // --- Poll with UDS data --- #[test] - fn test_poll_reads_tcp_data() { + fn test_poll_reads_uds_data() { use std::io::Write as IoWrite; - use std::net::TcpStream; let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); - let port = dev - .listeners - .values() - .next() - .unwrap() - .local_addr() - .unwrap() - .port() as u32; - let listener = dev.listeners.remove(&0).unwrap(); - dev.listeners.insert(port, listener); + let (sock_path, _dir) = temp_socket_path("poll-data.sock"); + let vsock_port = 2695u32; + dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); - // Connect TCP client. - let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + // Connect UDS client. + let mut client = UnixStream::connect(&sock_path).unwrap(); // Establish vsock connection. let mem = MockMem::new(0x10000); @@ -1395,7 +1334,7 @@ mod tests { src_cid: 3, dst_cid: 2, src_port: 5000, - dst_port: port, + dst_port: vsock_port, len: 0, type_: 1, op: VSOCK_OP_REQUEST, @@ -1409,23 +1348,20 @@ mod tests { dev.process_tx(&mut tx_queue, &mem); dev.rx_pending.clear(); - // Send data from TCP client to be picked up by poll. - client.write_all(b"tcp data").unwrap(); + // Send data from UDS client to be picked up by poll. + client.write_all(b"uds data").unwrap(); client.flush().unwrap(); - // Small delay to allow TCP data to arrive. std::thread::sleep(std::time::Duration::from_millis(50)); - // Poll should read TCP data and queue it. + // Poll should read UDS data and queue it. let mut queues = vec![ setup_queue(128), // RX setup_queue(128), // TX setup_queue(128), // Event ]; - // Set up an RX buffer. let rx_buf = BUF_BASE + 0x4000; - // Use separate addresses for the RX queue to avoid overlap. let rx_desc = 0x8000u64; let rx_avail = 0x8800u64; let rx_used = 0x9000u64; @@ -1433,26 +1369,23 @@ mod tests { queues[0].set_avail_ring(rx_avail); queues[0].set_used_ring(rx_used); - // Write descriptor for RX. mem.write_u64_at(rx_desc, rx_buf); mem.write_u32_at(rx_desc + 8, 256); mem.write_u16_at(rx_desc + 12, 2); // WRITE mem.write_u16_at(rx_desc + 14, 0); - // Push to avail ring. - mem.write_u16_at(rx_avail + 4, 0); // ring[0] = desc 0 - mem.write_u16_at(rx_avail + 2, 1); // avail idx = 1 + mem.write_u16_at(rx_avail + 4, 0); + mem.write_u16_at(rx_avail + 2, 1); let raised = dev.poll(&mut queues, &mem); assert!(raised); - // Check that data was injected. let hdr_bytes = mem.read_bytes(rx_buf, VSOCK_HEADER_SIZE); let rx_hdr = VsockHeader::from_bytes(&hdr_bytes.try_into().unwrap()); assert_eq!(rx_hdr.op, packet::VSOCK_OP_RW); assert_eq!(rx_hdr.len, 8); let payload = mem.read_bytes(rx_buf + VSOCK_HEADER_SIZE as u64, 8); - assert_eq!(payload, b"tcp data"); + assert_eq!(payload, b"uds data"); } // --- Guest-initiated outbound connection --- @@ -1460,24 +1393,23 @@ mod tests { #[test] fn test_connect_to_registers_target() { let mut dev = VirtioVsock::new(3); - dev.connect_to(2696, "127.0.0.1:9999".to_string()); + dev.connect_to(2696, "/tmp/nonexistent.sock".to_string()); assert_eq!(dev.connect_targets.len(), 1); assert!(dev.connect_targets.contains_key(&2696)); } #[test] fn test_connect_to_outbound_success() { - // Set up a host-side TCP listener to receive the outbound connection. - let host_listener = TcpListener::bind("127.0.0.1:0").unwrap(); - let host_port = host_listener.local_addr().unwrap().port(); + // Set up a host-side Unix listener to receive the outbound connection. + let (host_sock, _dir) = temp_socket_path("host-outbound.sock"); + let host_listener = UnixListener::bind(&host_sock).unwrap(); let mut dev = VirtioVsock::new(3); - dev.connect_to(2696, format!("127.0.0.1:{}", host_port)); + dev.connect_to(2696, host_sock.to_str().unwrap().to_string()); let mem = MockMem::new(0x10000); let mut tx_queue = setup_queue(128); - // Guest sends CONNECT to vsock port 2696. let hdr = VsockHeader { src_cid: 3, dst_cid: 2, @@ -1496,7 +1428,6 @@ mod tests { dev.process_tx(&mut tx_queue, &mem); - // Should get RESPONSE (not RST) and have a connection + stream. assert_eq!(dev.rx_pending.len(), 1); assert_eq!(dev.rx_pending[0].0.op, packet::VSOCK_OP_RESPONSE); assert_eq!(dev.connection_count(), 1); @@ -1505,14 +1436,14 @@ mod tests { // Host listener should have received the connection. host_listener.set_nonblocking(true).unwrap(); let accepted = host_listener.accept(); - assert!(accepted.is_ok(), "Host should have received TCP connection"); + assert!(accepted.is_ok(), "Host should have received UDS connection"); } #[test] fn test_connect_to_unreachable_sends_rst() { let mut dev = VirtioVsock::new(3); - // Port 1 is not listening — connection will fail. - dev.connect_to(2696, "127.0.0.1:1".to_string()); + // Nonexistent path — connection will fail. + dev.connect_to(2696, "/tmp/nonexistent-vsock-test-path.sock".to_string()); let mem = MockMem::new(0x10000); let mut tx_queue = setup_queue(128); @@ -1535,7 +1466,6 @@ mod tests { dev.process_tx(&mut tx_queue, &mem); - // Should get RST because target is unreachable. assert_eq!(dev.rx_pending.len(), 1); assert_eq!(dev.rx_pending[0].0.op, VSOCK_OP_RST); assert_eq!(dev.connection_count(), 0); @@ -1543,14 +1473,14 @@ mod tests { #[test] fn test_connect_to_preferred_over_listener() { - // If both connect_target and listener exist for same port, - // connect_target should be used (checked first). - let host_listener = TcpListener::bind("127.0.0.1:0").unwrap(); - let host_port = host_listener.local_addr().unwrap().port(); + let (host_sock, _dir) = temp_socket_path("preferred.sock"); + let _host_listener = UnixListener::bind(&host_sock).unwrap(); + + let (listen_sock, _dir2) = temp_socket_path("listen-fallback.sock"); let mut dev = VirtioVsock::new(3); - dev.connect_to(2696, format!("127.0.0.1:{}", host_port)); - dev.listen_on(2696, 0).unwrap(); // Also add a listener on same vsock port. + dev.connect_to(2696, host_sock.to_str().unwrap().to_string()); + dev.listen_on(2696, listen_sock.to_str().unwrap()).unwrap(); let mem = MockMem::new(0x10000); let mut tx_queue = setup_queue(128); @@ -1573,54 +1503,45 @@ mod tests { dev.process_tx(&mut tx_queue, &mem); - // Should get RESPONSE via outbound connection. assert_eq!(dev.rx_pending.len(), 1); assert_eq!(dev.rx_pending[0].0.op, packet::VSOCK_OP_RESPONSE); assert_eq!(dev.connection_count(), 1); assert_eq!(dev.streams.len(), 1); } - // --- Host-initiated connections (poll_tcp_listeners) --- + // --- Host-initiated connections (poll_listeners) --- #[test] - fn test_poll_tcp_listeners_accepts_and_sends_request() { + fn test_poll_listeners_accepts_and_sends_request() { let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); - let port = dev - .listeners - .values() - .next() - .unwrap() - .local_addr() - .unwrap() - .port() as u32; - let listener = dev.listeners.remove(&0).unwrap(); - dev.listeners.insert(port, listener); + let (sock_path, _dir) = temp_socket_path("poll-accept.sock"); + let vsock_port = 2695u32; + dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); - // Host TCP client connects BEFORE any guest action. - let _client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + // Host UDS client connects BEFORE any guest action. + let _client = UnixStream::connect(&sock_path).unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); - // poll_tcp_listeners should accept and generate a REQUEST. - dev.poll_tcp_listeners(); + dev.poll_listeners(); assert_eq!(dev.rx_pending.len(), 1); assert_eq!(dev.rx_pending[0].0.op, VSOCK_OP_REQUEST); assert_eq!(dev.rx_pending[0].0.src_cid, VSOCK_CID_HOST); - assert_eq!(dev.rx_pending[0].0.dst_cid, 3); // guest CID - assert_eq!(dev.rx_pending[0].0.dst_port, port); // guest vsock port + assert_eq!(dev.rx_pending[0].0.dst_cid, 3); + assert_eq!(dev.rx_pending[0].0.dst_port, vsock_port); assert!(dev.rx_pending[0].0.src_port >= EPHEMERAL_PORT_START); assert_eq!(dev.connection_count(), 1); assert_eq!(dev.streams.len(), 1); } #[test] - fn test_poll_tcp_listeners_no_pending_is_noop() { + fn test_poll_listeners_no_pending_is_noop() { let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); + let (sock_path, _dir) = temp_socket_path("poll-noop.sock"); + dev.listen_on(2695, sock_path.to_str().unwrap()).unwrap(); - // No TCP client connected. - dev.poll_tcp_listeners(); + // No client connected. + dev.poll_listeners(); assert!(dev.rx_pending.is_empty()); assert_eq!(dev.connection_count(), 0); @@ -1631,36 +1552,28 @@ mod tests { use std::io::Write as IoWrite; let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); - let port = dev - .listeners - .values() - .next() - .unwrap() - .local_addr() - .unwrap() - .port() as u32; - let listener = dev.listeners.remove(&0).unwrap(); - dev.listeners.insert(port, listener); + let (sock_path, _dir) = temp_socket_path("lifecycle.sock"); + let vsock_port = 2695u32; + dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); - // Step 1: Host TCP client connects. - let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + // Step 1: Host client connects. + let mut client = UnixStream::connect(&sock_path).unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); // Step 2: VMM accepts and sends REQUEST to guest. - dev.poll_tcp_listeners(); + dev.poll_listeners(); assert_eq!(dev.rx_pending.len(), 1); let req = &dev.rx_pending[0].0; assert_eq!(req.op, VSOCK_OP_REQUEST); let host_ephemeral = req.src_port; - let key = (port, host_ephemeral); + let key = (vsock_port, host_ephemeral); dev.rx_pending.clear(); - // Step 3: Guest sends RESPONSE (simulated via handle_guest_packet). + // Step 3: Guest sends RESPONSE. let resp = VsockHeader { src_cid: 3, dst_cid: VSOCK_CID_HOST, - src_port: port, + src_port: vsock_port, dst_port: host_ephemeral, len: 0, type_: 1, @@ -1671,18 +1584,17 @@ mod tests { }; dev.handle_guest_packet(&resp, &[]); - // Connection should now be Connected. assert_eq!( dev.connections.get(&key).unwrap().state(), ConnState::Connected ); - // Step 4: Host sends data via TCP -> forwarded to guest via vsock. + // Step 4: Host sends data → forwarded to guest via vsock. client.write_all(b"hello from host").unwrap(); client.flush().unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); - dev.poll_tcp_streams(); + dev.poll_streams(); let conn = dev.connections.get(&key).unwrap(); assert!(conn.tx_buf_len() > 0); } @@ -1692,28 +1604,18 @@ mod tests { use std::io::Write as IoWrite; let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); - let port = dev - .listeners - .values() - .next() - .unwrap() - .local_addr() - .unwrap() - .port() as u32; - let listener = dev.listeners.remove(&0).unwrap(); - dev.listeners.insert(port, listener); + let (sock_path, _dir) = temp_socket_path("handshake.sock"); + let vsock_port = 2695u32; + dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); - // Host connects and sends data before handshake completes. - let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + let mut client = UnixStream::connect(&sock_path).unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); - dev.poll_tcp_listeners(); + dev.poll_listeners(); let host_ephemeral = dev.rx_pending[0].0.src_port; - let key = (port, host_ephemeral); + let key = (vsock_port, host_ephemeral); dev.rx_pending.clear(); - // Connection is in Connecting state. assert_eq!( dev.connections.get(&key).unwrap().state(), ConnState::Connecting @@ -1724,16 +1626,15 @@ mod tests { client.flush().unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); - // poll_tcp_streams should SKIP this stream (not Connected yet). - dev.poll_tcp_streams(); - // Data stays in kernel buffer, connection tx_buf is empty. + // poll_streams should SKIP this stream (not Connected yet). + dev.poll_streams(); assert_eq!(dev.connections.get(&key).unwrap().tx_buf_len(), 0); - // Now complete the handshake. + // Complete the handshake. let resp = VsockHeader { src_cid: 3, dst_cid: VSOCK_CID_HOST, - src_port: port, + src_port: vsock_port, dst_port: host_ephemeral, len: 0, type_: 1, @@ -1748,8 +1649,8 @@ mod tests { ConnState::Connected ); - // NOW poll_tcp_streams reads the data. - dev.poll_tcp_streams(); + // NOW poll_streams reads the data. + dev.poll_streams(); assert!(dev.connections.get(&key).unwrap().tx_buf_len() > 0); } @@ -1765,38 +1666,28 @@ mod tests { } #[test] - fn test_host_initiated_guest_data_to_host_tcp() { + fn test_host_initiated_guest_data_to_host_uds() { use std::io::Read as IoRead; let mut dev = VirtioVsock::new(3); - dev.listen(0).unwrap(); - let port = dev - .listeners - .values() - .next() - .unwrap() - .local_addr() - .unwrap() - .port() as u32; - let listener = dev.listeners.remove(&0).unwrap(); - dev.listeners.insert(port, listener); + let (sock_path, _dir) = temp_socket_path("guest-data.sock"); + let vsock_port = 2695u32; + dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); - // Host connects. - let mut client = TcpStream::connect(format!("127.0.0.1:{}", port)).unwrap(); + let mut client = UnixStream::connect(&sock_path).unwrap(); client.set_nonblocking(true).unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); - // Accept + REQUEST. - dev.poll_tcp_listeners(); + dev.poll_listeners(); let host_ephemeral = dev.rx_pending[0].0.src_port; - let key = (port, host_ephemeral); + let key = (vsock_port, host_ephemeral); dev.rx_pending.clear(); // Guest RESPONSE. let resp = VsockHeader { src_cid: 3, dst_cid: VSOCK_CID_HOST, - src_port: port, + src_port: vsock_port, dst_port: host_ephemeral, len: 0, type_: 1, @@ -1807,11 +1698,11 @@ mod tests { }; dev.handle_guest_packet(&resp, &[]); - // Guest sends data (RW) → should be forwarded to host TCP stream. + // Guest sends data (RW) → should be forwarded to host Unix stream. let rw_hdr = VsockHeader { src_cid: 3, dst_cid: VSOCK_CID_HOST, - src_port: port, + src_port: vsock_port, dst_port: host_ephemeral, len: 11, type_: 1, @@ -1822,7 +1713,7 @@ mod tests { }; dev.handle_guest_packet(&rw_hdr, b"hello guest"); - // Read from TCP client. + // Read from UDS client. std::thread::sleep(std::time::Duration::from_millis(50)); let mut buf = [0u8; 128]; let n = client.read(&mut buf).unwrap(); From 81d7316bde1c76e96c0612a6282206bd4eff447f Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 2 May 2026 23:40:00 +0800 Subject: [PATCH 29/33] fix(windows): gate Unix-only deps behind cfg(unix) for Windows compilation Move devices, polly, and utils crates to [target.'cfg(unix)'.dependencies] in libkrun/Cargo.toml since these depend on nix, vm-memory, and other Unix-only crates that fail to compile on Windows. The Windows WHPX backend has its own self-contained device implementations and never uses these crates. Also fix cpuid dependency in vmm/Cargo.toml to include package name specifier (krun-cpuid) lost during rebase. Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 758 +++++++++++++++++++---------------------- src/libkrun/Cargo.toml | 12 +- src/vmm/Cargo.toml | 2 +- 3 files changed, 352 insertions(+), 420 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8b2b9a039..307f8b1ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -34,12 +34,12 @@ checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "annotate-snippets" -version = "0.9.2" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccaf7e9dfbb6ab22c82e473cd1a8a7bd313c19a5b7e40970f3d89ef5a5c9e81e" +checksum = "710e8eae58854cdc1790fcb56cca04d712a17be849eeb81da2a724bf4bae2bc4" dependencies = [ + "anstyle", "unicode-width", - "yansi-term", ] [[package]] @@ -98,25 +98,6 @@ version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" -[[package]] -name = "arch" -version = "0.1.0" -dependencies = [ - "arch_gen", - "kvm-bindings", - "kvm-ioctls", - "libc", - "smbios", - "tdx", - "utils", - "vm-memory", - "vmm-sys-util 0.14.0", -] - -[[package]] -name = "arch_gen" -version = "0.1.0" - [[package]] name = "async-trait" version = "0.1.88" @@ -174,33 +155,13 @@ dependencies = [ "virtue", ] -[[package]] -name = "bindgen" -version = "0.69.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" -dependencies = [ - "annotate-snippets", - "bitflags 2.10.0", - "cexpr", - "clang-sys", - "itertools", - "lazy_static", - "lazycell", - "proc-macro2", - "quote", - "regex", - "rustc-hash 1.1.0", - "shlex", - "syn", -] - [[package]] name = "bindgen" version = "0.72.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f72209734318d0b619a5e0f5129918b848c416e122a3c4ce054e03cb87b726f" dependencies = [ + "annotate-snippets", "bitflags 2.10.0", "cexpr", "clang-sys", @@ -208,7 +169,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash 2.1.1", + "rustc-hash", "shlex", "syn", ] @@ -306,14 +267,14 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom", + "nom 7.1.3", ] [[package]] name = "cfg-expr" -version = "0.15.8" +version = "0.20.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02" +checksum = "3c6b04e07d8080154ed4ac03546d9a2b303cc2fe1901ba0b35b301516e289368" dependencies = [ "smallvec", "target-lexicon", @@ -350,9 +311,9 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "convert_case" -version = "0.6.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca" +checksum = "baaaa0ecca5b51987b9423ccdc971514dd8b0bb7b4060b983d3664dad3f1f89f" dependencies = [ "unicode-segmentation", ] @@ -362,9 +323,6 @@ name = "cookie-factory" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9885fa71e26b8ab7855e2ec7cae6e9b380edff76cd052e07c683a0319d51b3a2" -dependencies = [ - "futures", -] [[package]] name = "cpufeatures" @@ -375,15 +333,6 @@ dependencies = [ "libc", ] -[[package]] -name = "cpuid" -version = "0.1.0" -dependencies = [ - "kvm-bindings", - "kvm-ioctls", - "vmm-sys-util 0.14.0", -] - [[package]] name = "crc32fast" version = "1.4.2" @@ -418,37 +367,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "devices" -version = "0.1.0" -dependencies = [ - "arch", - "bitflags 1.3.2", - "caps", - "crossbeam-channel", - "hvf", - "imago", - "krun_display", - "krun_input", - "kvm-bindings", - "kvm-ioctls", - "libc", - "libloading", - "log", - "lru", - "nix 0.30.1", - "pipewire", - "polly", - "rand", - "rutabaga_gfx", - "thiserror 2.0.12", - "utils", - "virtio-bindings", - "vm-fdt", - "vm-memory", - "zerocopy", -] - [[package]] name = "digest" version = "0.10.7" @@ -504,6 +422,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "filetime" version = "0.2.26" @@ -532,95 +456,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" -[[package]] -name = "futures" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" - -[[package]] -name = "futures-executor" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" - -[[package]] -name = "futures-macro" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" - -[[package]] -name = "futures-task" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" - -[[package]] -name = "futures-util" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - [[package]] name = "generic-array" version = "0.14.7" @@ -666,27 +501,23 @@ dependencies = [ "foldhash", ] +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hvf" -version = "0.1.0" -dependencies = [ - "arch", - "crossbeam-channel", - "libloading", - "log", -] - [[package]] name = "imago" -version = "0.2.1" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a46885ecbabe024b9e6e3ee9ee3a32ad05adf2b6269f45aa912793d9c54c805" +checksum = "ae7cfee876c698a1a2ed9c705ab18f21acbed82110f19b51cc458de73426fe2c" dependencies = [ "async-trait", "bincode", @@ -704,12 +535,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.9.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.17.0", ] [[package]] @@ -795,38 +626,208 @@ dependencies = [ "sha2", "sm3", "strum", - "thiserror 2.0.12", + "thiserror 2.0.18", ] [[package]] -name = "kernel" -version = "0.1.0" +name = "krun-arch" +version = "0.1.0-1.18.0" dependencies = [ - "utils", + "krun-arch-gen", + "krun-smbios", + "krun-utils", + "kvm-bindings", + "kvm-ioctls", + "libc", + "tdx", "vm-memory", + "vmm-sys-util 0.14.0", +] + +[[package]] +name = "krun-arch-gen" +version = "0.1.0-1.18.0" + +[[package]] +name = "krun-aws-nitro" +version = "0.1.0-1.18.0" +dependencies = [ + "krun-devices", + "libc", + "log", + "nitro-enclaves 0.6.1", + "nix 0.30.1", + "signal-hook", + "tar", + "vsock", ] [[package]] -name = "krun_display" +name = "krun-cpuid" +version = "0.1.0-1.18.0" +dependencies = [ + "kvm-bindings", + "kvm-ioctls", + "vmm-sys-util 0.14.0", +] + +[[package]] +name = "krun-devices" +version = "0.1.0-1.18.0" +dependencies = [ + "bitflags 1.3.2", + "caps", + "crossbeam-channel", + "imago", + "krun-arch", + "krun-display", + "krun-hvf", + "krun-input", + "krun-polly", + "krun-rutabaga-gfx", + "krun-utils", + "kvm-bindings", + "kvm-ioctls", + "libc", + "libloading", + "log", + "lru", + "nix 0.30.1", + "pipewire", + "rand", + "thiserror 2.0.18", + "virtio-bindings", + "vm-fdt", + "vm-memory", + "zerocopy", +] + +[[package]] +name = "krun-display" version = "0.1.0" dependencies = [ - "bindgen 0.72.0", + "bindgen", "bitflags 2.10.0", "log", "static_assertions", - "thiserror 2.0.12", + "thiserror 2.0.18", ] [[package]] -name = "krun_input" +name = "krun-hvf" +version = "0.1.0-1.18.0" +dependencies = [ + "crossbeam-channel", + "krun-arch", + "libloading", + "log", +] + +[[package]] +name = "krun-input" version = "0.1.0" dependencies = [ - "bindgen 0.72.0", + "bindgen", "bitflags 2.10.0", "libc", "log", "static_assertions", - "thiserror 2.0.12", + "thiserror 2.0.18", +] + +[[package]] +name = "krun-kernel" +version = "0.1.0-1.18.0" +dependencies = [ + "krun-utils", + "vm-memory", +] + +[[package]] +name = "krun-polly" +version = "0.1.0-1.18.0" +dependencies = [ + "krun-utils", + "libc", +] + +[[package]] +name = "krun-rutabaga-gfx" +version = "0.1.0-1.18.0" +dependencies = [ + "anyhow", + "cfg-if", + "libc", + "log", + "nix 0.30.1", + "pkg-config", + "remain", + "thiserror 1.0.69", + "vmm-sys-util 0.14.0", + "winapi", + "zerocopy", +] + +[[package]] +name = "krun-smbios" +version = "0.1.0-1.18.0" +dependencies = [ + "vm-memory", +] + +[[package]] +name = "krun-utils" +version = "0.1.0-1.18.0" +dependencies = [ + "bitflags 1.3.2", + "crossbeam-channel", + "kvm-bindings", + "libc", + "log", + "nix 0.30.1", + "vmm-sys-util 0.14.0", +] + +[[package]] +name = "krun-vmm" +version = "0.1.0-1.18.0" +dependencies = [ + "bitfield", + "bitflags 2.10.0", + "bzip2", + "crossbeam-channel", + "env_logger", + "flate2", + "iocuddle", + "kbs-types", + "krun-arch", + "krun-arch-gen", + "krun-cpuid", + "krun-devices", + "krun-display", + "krun-hvf", + "krun-input", + "krun-kernel", + "krun-polly", + "krun-utils", + "kvm-bindings", + "kvm-ioctls", + "libc", + "linux-loader", + "log", + "nix 0.30.1", + "rand", + "serde", + "serde_json", + "tdx", + "tempfile", + "thiserror 2.0.18", + "uds_windows", + "vm-memory", + "vmm-sys-util 0.14.0", + "windows-sys 0.61.2", + "zerocopy", + "zstd", ] [[package]] @@ -850,18 +851,6 @@ dependencies = [ "vmm-sys-util 0.14.0", ] -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "libc" version = "0.2.172" @@ -870,27 +859,27 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libkrun" -version = "1.16.0" +version = "1.18.0" dependencies = [ "crossbeam-channel", - "devices", "env_logger", - "hvf", - "krun_display", - "krun_input", + "krun-aws-nitro", + "krun-devices", + "krun-display", + "krun-hvf", + "krun-input", + "krun-polly", + "krun-utils", + "krun-vmm", "kvm-bindings", "kvm-ioctls", "libc", "libloading", "log", - "nitro", - "nitro-enclaves", + "nitro-enclaves 0.5.0", "once_cell", - "polly", "rand", - "utils", "vm-memory", - "vmm", ] [[package]] @@ -916,9 +905,9 @@ dependencies = [ [[package]] name = "libspa" -version = "0.8.0" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65f3a4b81b2a2d8c7f300643676202debd1b7c929dbf5c9bb89402ea11d19810" +checksum = "b6b8cfa2a7656627b4c92c6b9ef929433acd673d5ab3708cda1b18478ac00df4" dependencies = [ "bitflags 2.10.0", "cc", @@ -926,27 +915,27 @@ dependencies = [ "cookie-factory", "libc", "libspa-sys", - "nix 0.27.1", - "nom", + "nix 0.30.1", + "nom 8.0.0", "system-deps", ] [[package]] name = "libspa-sys" -version = "0.8.0" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf0d9716420364790e85cbb9d3ac2c950bde16a7dd36f3209b7dfdfc4a24d01f" +checksum = "901049455d2eb6decf9058235d745237952f4804bc584c5fcb41412e6adcc6e0" dependencies = [ - "bindgen 0.69.5", + "bindgen", "cc", "system-deps", ] [[package]] name = "linux-loader" -version = "0.13.0" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "870c3814345f050991f99869417779f6062542bcf4ed81db7a1b926ad1306638" +checksum = "de72cb02c55ecffcf75fe78295926f872eb6eb0a58d629c58a8c324dc26380f6" dependencies = [ "vm-memory", ] @@ -969,7 +958,7 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f8cc7106155f10bdf99a6f379688f543ad6596a415375b36a59a054ceda1198" dependencies = [ - "hashbrown", + "hashbrown 0.15.3", ] [[package]] @@ -1013,27 +1002,27 @@ dependencies = [ ] [[package]] -name = "nitro" -version = "0.1.0" +name = "nitro-enclaves" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b5b539a76e3f555fb143c3e67d5e05fa1d5fece02a515f6ecf41b3f1a081f58" dependencies = [ - "devices", + "bitflags 2.10.0", "libc", - "nitro-enclaves", - "nix 0.30.1", - "tar", + "nix 0.26.4", + "rand", "vsock", ] [[package]] name = "nitro-enclaves" -version = "0.5.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b5b539a76e3f555fb143c3e67d5e05fa1d5fece02a515f6ecf41b3f1a081f58" +checksum = "c6436c562bcdb6f192e0e59f627bff5b0b88f2e1c48264079f4f1d6da42bec2d" dependencies = [ "bitflags 2.10.0", "libc", "nix 0.26.4", - "rand", "vsock", ] @@ -1050,17 +1039,6 @@ dependencies = [ "pin-utils", ] -[[package]] -name = "nix" -version = "0.27.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" -dependencies = [ - "bitflags 2.10.0", - "cfg-if", - "libc", -] - [[package]] name = "nix" version = "0.29.0" @@ -1097,6 +1075,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + [[package]] name = "object" version = "0.36.7" @@ -1142,28 +1129,28 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipewire" -version = "0.8.0" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08e645ba5c45109106d56610b3ee60eb13a6f2beb8b74f8dc8186cf261788dda" +checksum = "9688b89abf11d756499f7c6190711d6dbe5a3acdb30c8fbf001d6596d06a8d44" dependencies = [ "anyhow", "bitflags 2.10.0", "libc", "libspa", "libspa-sys", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pipewire-sys", - "thiserror 1.0.69", + "thiserror 2.0.18", ] [[package]] name = "pipewire-sys" -version = "0.8.0" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "849e188f90b1dda88fe2bfe1ad31fe5f158af2c98f80fb5d13726c44f3f01112" +checksum = "cb028afee0d6ca17020b090e3b8fa2d7de23305aef975c7e5192a5050246ea36" dependencies = [ - "bindgen 0.69.5", + "bindgen", "libspa-sys", "system-deps", ] @@ -1174,14 +1161,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "polly" -version = "0.0.1" -dependencies = [ - "libc", - "utils", -] - [[package]] name = "portable-atomic" version = "1.11.1" @@ -1314,12 +1293,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.1.1" @@ -1354,23 +1327,6 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" -[[package]] -name = "rutabaga_gfx" -version = "0.1.2" -dependencies = [ - "anyhow", - "cfg-if", - "libc", - "log", - "nix 0.30.1", - "pkg-config", - "remain", - "thiserror 1.0.69", - "vmm-sys-util 0.14.0", - "winapi", - "zerocopy", -] - [[package]] name = "ryu" version = "1.0.20" @@ -1385,18 +1341,28 @@ checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -1417,11 +1383,11 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.9" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" dependencies = [ - "serde", + "serde_core", ] [[package]] @@ -1442,20 +1408,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] -name = "simd-adler32" -version = "0.3.7" +name = "signal-hook" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +dependencies = [ + "libc", + "signal-hook-registry", +] [[package]] -name = "slab" -version = "0.4.9" +name = "signal-hook-registry" +version = "1.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" dependencies = [ - "autocfg", + "errno", + "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "sm3" version = "0.4.2" @@ -1471,13 +1448,6 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" -[[package]] -name = "smbios" -version = "0.1.0" -dependencies = [ - "vm-memory", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -1518,9 +1488,9 @@ dependencies = [ [[package]] name = "system-deps" -version = "6.2.2" +version = "7.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e535eb8dded36d55ec13eddacd30dec501792ff23a0b1682c38601b8cf2349" +checksum = "396a35feb67335377e0251fcbc1092fc85c484bd4e3a7a54319399da127796e7" dependencies = [ "cfg-expr", "heck", @@ -1542,9 +1512,9 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.16" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" [[package]] name = "tdx" @@ -1561,6 +1531,19 @@ dependencies = [ "vmm-sys-util 0.12.1", ] +[[package]] +name = "tempfile" +version = "3.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -1572,11 +1555,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.12" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.12", + "thiserror-impl 2.0.18", ] [[package]] @@ -1592,9 +1575,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.12" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -1613,38 +1596,43 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.23" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee" dependencies = [ - "serde", + "indexmap", + "serde_core", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_parser", + "toml_writer", + "winnow", ] [[package]] name = "toml_datetime" -version = "0.6.11" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ - "serde", + "serde_core", ] [[package]] -name = "toml_edit" -version = "0.22.27" +name = "toml_parser" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ - "indexmap", - "serde", - "serde_spanned", - "toml_datetime", "winnow", ] +[[package]] +name = "toml_writer" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" + [[package]] name = "tracing" version = "0.1.41" @@ -1682,6 +1670,17 @@ version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +[[package]] +name = "uds_windows" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" +dependencies = [ + "memoffset 0.9.1", + "tempfile", + "windows-sys 0.61.2", +] + [[package]] name = "unicode-ident" version = "1.0.18" @@ -1696,9 +1695,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.14" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] name = "unty" @@ -1712,19 +1711,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "utils" -version = "0.1.0" -dependencies = [ - "bitflags 1.3.2", - "crossbeam-channel", - "kvm-bindings", - "libc", - "log", - "nix 0.30.1", - "vmm-sys-util 0.14.0", -] - [[package]] name = "uuid" version = "1.17.0" @@ -1767,55 +1753,15 @@ checksum = "7e21282841a059bb62627ce8441c491f09603622cd5a21c43bfedc85a2952f23" [[package]] name = "vm-memory" -version = "0.16.2" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" +checksum = "48f1f33aee6ae648087fbed47c2944e2796f7877d4717a59edc8d7cb62f71061" dependencies = [ "libc", - "thiserror 1.0.69", + "thiserror 2.0.18", "winapi", ] -[[package]] -name = "vmm" -version = "0.1.0" -dependencies = [ - "arch", - "arch_gen", - "bitfield", - "bitflags 2.10.0", - "bzip2", - "cpuid", - "crossbeam-channel", - "devices", - "env_logger", - "flate2", - "hvf", - "iocuddle", - "kbs-types", - "kernel", - "krun_display", - "krun_input", - "kvm-bindings", - "kvm-ioctls", - "libc", - "linux-loader", - "log", - "nix 0.30.1", - "polly", - "rand", - "serde", - "serde_json", - "tdx", - "thiserror 2.0.12", - "utils", - "vm-memory", - "vmm-sys-util 0.14.0", - "windows-sys 0.61.2", - "zerocopy", - "zstd", -] - [[package]] name = "vmm-sys-util" version = "0.12.1" @@ -2099,12 +2045,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.10" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06928c8748d81b05c9be96aad92e1b6ff01833332f281e8cfca3be4b35fc9ec" -dependencies = [ - "memchr", -] +checksum = "2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0" [[package]] name = "wit-bindgen-rt" @@ -2125,15 +2068,6 @@ dependencies = [ "rustix", ] -[[package]] -name = "yansi-term" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe5c30ade05e61656247b2e334a031dfd0cc466fadef865bdcdea8d537951bf1" -dependencies = [ - "winapi", -] - [[package]] name = "zerocopy" version = "0.8.26" diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index 60ba04270..eebb7bb1f 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -21,25 +21,23 @@ virgl_resource_map2 = ["devices/virgl_resource_map2"] aws-nitro = ["vmm/aws-nitro", "devices/aws-nitro", "dep:aws-nitro", "dep:nitro-enclaves"] [dependencies] -# Cross-platform dependencies -log = "0.4.0" -vmm = { path = "../vmm" } - -# Unix-only dependencies (the upstream C API implementation) -[target.'cfg(unix)'.dependencies] crossbeam-channel = ">=0.5.15" env_logger = "0.11" libc = ">=0.2.39" libloading = "0.8" +log = "0.4.0" once_cell = "1.4.1" krun_display = { package = "krun-display", version = "0.1.0", path = "../display", optional = true, features = ["bindgen_clang_runtime"] } krun_input = { package = "krun-input", version = "0.1.0", path = "../input", optional = true, features = ["bindgen_clang_runtime"] } rand = "0.9.2" +vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } + +# Unix-only internal crates (not used by Windows WHPX backend) +[target.'cfg(unix)'.dependencies] devices = { package = "krun-devices", version = "=0.1.0-1.18.0", path = "../devices" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } -vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } [target.'cfg(target_os = "macos")'.dependencies] hvf = { package = "krun-hvf", version = "=0.1.0-1.18.0", path = "../hvf" } diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 3ec0b5be8..046da00f5 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -55,7 +55,7 @@ zstd = "0.13" # cpuid is needed on Unix x86_64 only (upstream VMM uses it) [target.'cfg(all(target_arch = "x86_64", unix))'.dependencies] -cpuid = { path = "../cpuid" } +cpuid = { package = "krun-cpuid", version = "=0.1.0-1.18.0", path = "../cpuid" } [target.'cfg(target_os = "linux")'.dependencies] tdx = { version = "0.1.0", optional = true } From 6140590d0d9124bcf8044bd4b7308b192c926aae Mon Sep 17 00:00:00 2001 From: lile Date: Sun, 3 May 2026 09:55:00 +0800 Subject: [PATCH 30/33] fix: update enable_tsi() to use tsi_flags after TsiFlags refactoring The enable_tsi field was replaced with tsi_flags: TsiFlags but the getter still referenced self.enable_tsi, causing infinite recursion. Co-Authored-By: Claude Opus 4.6 --- src/devices/src/virtio/vsock/muxer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/devices/src/virtio/vsock/muxer.rs b/src/devices/src/virtio/vsock/muxer.rs index 9421a9cc9..574f64a62 100644 --- a/src/devices/src/virtio/vsock/muxer.rs +++ b/src/devices/src/virtio/vsock/muxer.rs @@ -133,7 +133,7 @@ impl VsockMuxer { } pub fn enable_tsi(&self) -> bool { - self.enable_tsi + self.tsi_flags.tsi_enabled() } pub(crate) fn activate( From 540822a67637b219781a4ac3d50720e4fe82c3ec Mon Sep 17 00:00:00 2001 From: lile Date: Tue, 5 May 2026 21:12:02 +0800 Subject: [PATCH 31/33] fix: cargo fmt + declare efi feature for clippy - Run cargo fmt to fix import ordering (cpuid, muxer, lib.rs, windows/) - Add `efi = []` to vmm/Cargo.toml to resolve "unexpected cfg condition value" clippy error on builder.rs:91 Co-Authored-By: Claude Opus 4.6 --- src/cpuid/src/common.rs | 4 +- src/devices/src/virtio/vsock/muxer.rs | 2 +- src/libkrun/src/lib.rs | 4939 +++++++++-------- src/vmm/Cargo.toml | 1 + src/vmm/src/windows/cmdline.rs | 1 - src/vmm/src/windows/devices/irq_chip.rs | 47 +- src/vmm/src/windows/devices/lapic.rs | 17 +- .../src/windows/devices/virtio/vsock/mod.rs | 21 +- src/vmm/src/windows/runner.rs | 89 +- 9 files changed, 2613 insertions(+), 2508 deletions(-) diff --git a/src/cpuid/src/common.rs b/src/cpuid/src/common.rs index 950a8673d..d7474b89c 100644 --- a/src/cpuid/src/common.rs +++ b/src/cpuid/src/common.rs @@ -2,9 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 #[cfg(target_arch = "x86")] -use std::arch::x86::{__cpuid_count, __get_cpuid_max, CpuidResult}; +use std::arch::x86::{CpuidResult, __cpuid_count, __get_cpuid_max}; #[cfg(target_arch = "x86_64")] -use std::arch::x86_64::{__cpuid_count, __get_cpuid_max, CpuidResult}; +use std::arch::x86_64::{CpuidResult, __cpuid_count, __get_cpuid_max}; use crate::cpu_leaf::*; diff --git a/src/devices/src/virtio/vsock/muxer.rs b/src/devices/src/virtio/vsock/muxer.rs index 574f64a62..620c58fa1 100644 --- a/src/devices/src/virtio/vsock/muxer.rs +++ b/src/devices/src/virtio/vsock/muxer.rs @@ -18,7 +18,7 @@ use super::tsi_stream::TsiStreamProxy; use super::unix::UnixProxy; use super::TsiFlags; use super::VsockError; -use crossbeam_channel::{Sender, unbounded}; +use crossbeam_channel::{unbounded, Sender}; use utils::epoll::{ControlOperation, Epoll, EpollEvent, EventSet}; use vm_memory::GuestMemoryMmap; diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 61ae7d383..83bb6dee9 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -14,475 +14,486 @@ mod windows_api; #[cfg(not(target_os = "windows"))] mod unix_api { -use crossbeam_channel::unbounded; -#[cfg(feature = "blk")] -use devices::virtio::CacheType; -#[cfg(feature = "blk")] -use devices::virtio::block::{ImageType, SyncMode}; -#[cfg(feature = "gpu")] -use devices::virtio::gpu::display::DisplayInfo; -#[cfg(feature = "net")] -use devices::virtio::net::device::VirtioNetBackend; -use env_logger::{Env, Target}; -#[cfg(feature = "gpu")] -use krun_display::DisplayBackend; - -use libc::{c_char, c_int, size_t}; -use once_cell::sync::Lazy; -use polly::event_manager::EventManager; -#[cfg(all(feature = "blk", not(feature = "tee")))] -use rand::distr::{Alphanumeric, SampleString}; -use std::collections::HashMap; -use std::collections::hash_map::Entry; -use std::convert::TryInto; -use std::env; -#[cfg(target_os = "linux")] -use std::ffi::CString; -use std::ffi::{CStr, c_void}; -use std::fs::File; -use std::io::IsTerminal; -#[cfg(target_os = "linux")] -use std::os::fd::AsRawFd; -use std::os::fd::{BorrowedFd, FromRawFd, RawFd}; -use std::path::PathBuf; -use std::slice; -use std::sync::LazyLock; -use std::sync::Mutex; -use std::sync::atomic::{AtomicI32, Ordering}; -use utils::eventfd::EventFd; -use vmm::resources::{ - DefaultVirtioConsoleConfig, PortConfig, SerialConsoleConfig, TsiFlags, VirtioConsoleConfigMode, - VmResources, VsockConfig, -}; -#[cfg(feature = "blk")] -use vmm::vmm_config::block::{BlockDeviceConfig, BlockRootConfig}; -#[cfg(not(feature = "tee"))] -use vmm::vmm_config::external_kernel::{ExternalKernel, KernelFormat}; -#[cfg(not(feature = "tee"))] -use vmm::vmm_config::firmware::FirmwareConfig; -#[cfg(not(feature = "tee"))] -use vmm::vmm_config::fs::FsDeviceConfig; -use vmm::vmm_config::kernel_bundle::KernelBundle; -#[cfg(feature = "tee")] -use vmm::vmm_config::kernel_bundle::{InitrdBundle, QbootBundle}; -use vmm::vmm_config::kernel_cmdline::{DEFAULT_KERNEL_CMDLINE, KernelCmdlineConfig}; -use vmm::vmm_config::machine_config::VmConfig; -#[cfg(feature = "net")] -use vmm::vmm_config::net::NetworkInterfaceConfig; -use vmm::vmm_config::vsock::VsockDeviceConfig; - -#[cfg(feature = "aws-nitro")] -use aws_nitro::enclave::NitroEnclave; - -#[cfg(feature = "gpu")] -use devices::virtio::display::{DisplayInfoEdid, MAX_DISPLAYS, PhysicalSize}; -#[cfg(feature = "input")] -use krun_input::{InputConfigBackend, InputEventProviderBackend}; - -const KRUN_SUCCESS: i32 = 0; -const MAX_ARGS: usize = 4096; - -#[cfg(all(target_os = "linux", not(feature = "tee")))] -const KRUNFW_NAME: &str = "libkrunfw.so.5"; -#[cfg(all(target_os = "linux", feature = "amd-sev"))] -const KRUNFW_NAME: &str = "libkrunfw-sev.so.5"; -#[cfg(all(target_os = "linux", feature = "tdx"))] -const KRUNFW_NAME: &str = "libkrunfw-tdx.so.5"; -#[cfg(target_os = "macos")] -const KRUNFW_NAME: &str = "libkrunfw.5.dylib"; - -#[cfg(feature = "aws-nitro")] -static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); - -const INIT_PATH: &str = "/init.krun"; - -static KRUNFW: LazyLock> = - LazyLock::new(|| unsafe { libloading::Library::new(KRUNFW_NAME).ok() }); - -pub struct KrunfwBindings { - get_kernel: libloading::Symbol< - 'static, - unsafe extern "C" fn(*mut u64, *mut u64, *mut size_t) -> *mut c_char, - >, - #[cfg(feature = "tee")] - get_initrd: libloading::Symbol<'static, unsafe extern "C" fn(*mut size_t) -> *mut c_char>, - #[cfg(feature = "tee")] - get_qboot: libloading::Symbol<'static, unsafe extern "C" fn(*mut size_t) -> *mut c_char>, -} - -impl KrunfwBindings { - fn load_bindings() -> Result { - let krunfw = match KRUNFW.as_ref() { - Some(krunfw) => krunfw, - None => return Err(libloading::Error::DlOpenUnknown), - }; - Ok(unsafe { - KrunfwBindings { - get_kernel: krunfw.get(b"krunfw_get_kernel")?, - #[cfg(feature = "tee")] - get_initrd: krunfw.get(b"krunfw_get_initrd")?, - #[cfg(feature = "tee")] - get_qboot: krunfw.get(b"krunfw_get_qboot")?, - } - }) - } - - pub fn new() -> Option { - Self::load_bindings().ok() - } -} - -#[derive(Clone)] -#[cfg(feature = "net")] -enum LegacyNetworkConfig { - VirtioNetPasst(RawFd), - VirtioNetGvproxy(PathBuf), -} - -#[derive(Default)] -struct ContextConfig { - krunfw: Option, - vmr: VmResources, - workdir: Option, - exec_path: Option, - env: Option, - args: Option, - rlimits: Option, - #[cfg(feature = "net")] - legacy_net_cfg: Option, - #[cfg(feature = "net")] - legacy_mac: Option<[u8; 6]>, - #[cfg(feature = "net")] - disable_tsi: bool, - net_index: u8, - tsi_port_map: Option>, - vsock_config: VsockConfig, + use crossbeam_channel::unbounded; #[cfg(feature = "blk")] - block_cfgs: Vec, - #[cfg(feature = "blk")] - root_block_cfg: Option, + use devices::virtio::block::{ImageType, SyncMode}; + #[cfg(feature = "gpu")] + use devices::virtio::gpu::display::DisplayInfo; + #[cfg(feature = "net")] + use devices::virtio::net::device::VirtioNetBackend; #[cfg(feature = "blk")] - data_block_cfg: Option, + use devices::virtio::CacheType; + use env_logger::{Env, Target}; + #[cfg(feature = "gpu")] + use krun_display::DisplayBackend; + + use libc::{c_char, c_int, size_t}; + use once_cell::sync::Lazy; + use polly::event_manager::EventManager; + #[cfg(all(feature = "blk", not(feature = "tee")))] + use rand::distr::{Alphanumeric, SampleString}; + use std::collections::hash_map::Entry; + use std::collections::HashMap; + use std::convert::TryInto; + use std::env; + #[cfg(target_os = "linux")] + use std::ffi::CString; + use std::ffi::{c_void, CStr}; + use std::fs::File; + use std::io::IsTerminal; + #[cfg(target_os = "linux")] + use std::os::fd::AsRawFd; + use std::os::fd::{BorrowedFd, FromRawFd, RawFd}; + use std::path::PathBuf; + use std::slice; + use std::sync::atomic::{AtomicI32, Ordering}; + use std::sync::LazyLock; + use std::sync::Mutex; + use utils::eventfd::EventFd; + use vmm::resources::{ + DefaultVirtioConsoleConfig, PortConfig, SerialConsoleConfig, TsiFlags, + VirtioConsoleConfigMode, VmResources, VsockConfig, + }; #[cfg(feature = "blk")] - block_root: Option, + use vmm::vmm_config::block::{BlockDeviceConfig, BlockRootConfig}; + #[cfg(not(feature = "tee"))] + use vmm::vmm_config::external_kernel::{ExternalKernel, KernelFormat}; + #[cfg(not(feature = "tee"))] + use vmm::vmm_config::firmware::FirmwareConfig; + #[cfg(not(feature = "tee"))] + use vmm::vmm_config::fs::FsDeviceConfig; + use vmm::vmm_config::kernel_bundle::KernelBundle; #[cfg(feature = "tee")] - tee_config_file: Option, - unix_ipc_port_map: Option>, - shutdown_efd: Option, - gpu_virgl_flags: Option, - gpu_shm_size: Option, - enable_snd: bool, - console_output: Option, - vmm_uid: Option, - vmm_gid: Option, -} + use vmm::vmm_config::kernel_bundle::{InitrdBundle, QbootBundle}; + use vmm::vmm_config::kernel_cmdline::{KernelCmdlineConfig, DEFAULT_KERNEL_CMDLINE}; + use vmm::vmm_config::machine_config::VmConfig; + #[cfg(feature = "net")] + use vmm::vmm_config::net::NetworkInterfaceConfig; + use vmm::vmm_config::vsock::VsockDeviceConfig; -impl ContextConfig { - fn set_workdir(&mut self, workdir: String) { - self.workdir = Some(workdir); - } + #[cfg(feature = "aws-nitro")] + use aws_nitro::enclave::NitroEnclave; + + #[cfg(feature = "gpu")] + use devices::virtio::display::{DisplayInfoEdid, PhysicalSize, MAX_DISPLAYS}; + #[cfg(feature = "input")] + use krun_input::{InputConfigBackend, InputEventProviderBackend}; + + const KRUN_SUCCESS: i32 = 0; + const MAX_ARGS: usize = 4096; + + #[cfg(all(target_os = "linux", not(feature = "tee")))] + const KRUNFW_NAME: &str = "libkrunfw.so.5"; + #[cfg(all(target_os = "linux", feature = "amd-sev"))] + const KRUNFW_NAME: &str = "libkrunfw-sev.so.5"; + #[cfg(all(target_os = "linux", feature = "tdx"))] + const KRUNFW_NAME: &str = "libkrunfw-tdx.so.5"; + #[cfg(target_os = "macos")] + const KRUNFW_NAME: &str = "libkrunfw.5.dylib"; - fn get_workdir(&self) -> String { - match &self.workdir { - Some(workdir) => format!("KRUN_WORKDIR={workdir}"), - None => "".to_string(), - } - } + #[cfg(feature = "aws-nitro")] + static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); + + const INIT_PATH: &str = "/init.krun"; - fn set_exec_path(&mut self, exec_path: String) { - self.exec_path = Some(exec_path); + static KRUNFW: LazyLock> = + LazyLock::new(|| unsafe { libloading::Library::new(KRUNFW_NAME).ok() }); + + pub struct KrunfwBindings { + get_kernel: libloading::Symbol< + 'static, + unsafe extern "C" fn(*mut u64, *mut u64, *mut size_t) -> *mut c_char, + >, + #[cfg(feature = "tee")] + get_initrd: libloading::Symbol<'static, unsafe extern "C" fn(*mut size_t) -> *mut c_char>, + #[cfg(feature = "tee")] + get_qboot: libloading::Symbol<'static, unsafe extern "C" fn(*mut size_t) -> *mut c_char>, } - fn get_exec_path(&self) -> String { - match &self.exec_path { - Some(exec_path) => format!("KRUN_INIT={exec_path}"), - None => "".to_string(), + impl KrunfwBindings { + fn load_bindings() -> Result { + let krunfw = match KRUNFW.as_ref() { + Some(krunfw) => krunfw, + None => return Err(libloading::Error::DlOpenUnknown), + }; + Ok(unsafe { + KrunfwBindings { + get_kernel: krunfw.get(b"krunfw_get_kernel")?, + #[cfg(feature = "tee")] + get_initrd: krunfw.get(b"krunfw_get_initrd")?, + #[cfg(feature = "tee")] + get_qboot: krunfw.get(b"krunfw_get_qboot")?, + } + }) } - } - #[cfg(all(feature = "blk", not(feature = "tee")))] - fn set_block_root(&mut self, device: String, fstype: Option, options: Option) { - self.block_root = Some(BlockRootConfig { - device, - fstype, - options, - }); + pub fn new() -> Option { + Self::load_bindings().ok() + } } - fn get_block_root(&self) -> String { + #[derive(Clone)] + #[cfg(feature = "net")] + enum LegacyNetworkConfig { + VirtioNetPasst(RawFd), + VirtioNetGvproxy(PathBuf), + } + + #[derive(Default)] + struct ContextConfig { + krunfw: Option, + vmr: VmResources, + workdir: Option, + exec_path: Option, + env: Option, + args: Option, + rlimits: Option, + #[cfg(feature = "net")] + legacy_net_cfg: Option, + #[cfg(feature = "net")] + legacy_mac: Option<[u8; 6]>, + #[cfg(feature = "net")] + disable_tsi: bool, + net_index: u8, + tsi_port_map: Option>, + vsock_config: VsockConfig, #[cfg(feature = "blk")] - match &self.block_root { - Some(block_root) => { - let mut res = format!("KRUN_BLOCK_ROOT_DEVICE={}", block_root.device); - if let Some(fstype) = &block_root.fstype { - res += &format!(" KRUN_BLOCK_ROOT_FSTYPE={fstype}"); - } - if let Some(options) = &block_root.options { - res += &format!(" KRUN_BLOCK_ROOT_OPTIONS={options}"); - } - res + block_cfgs: Vec, + #[cfg(feature = "blk")] + root_block_cfg: Option, + #[cfg(feature = "blk")] + data_block_cfg: Option, + #[cfg(feature = "blk")] + block_root: Option, + #[cfg(feature = "tee")] + tee_config_file: Option, + unix_ipc_port_map: Option>, + shutdown_efd: Option, + gpu_virgl_flags: Option, + gpu_shm_size: Option, + enable_snd: bool, + console_output: Option, + vmm_uid: Option, + vmm_gid: Option, + } + + impl ContextConfig { + fn set_workdir(&mut self, workdir: String) { + self.workdir = Some(workdir); + } + + fn get_workdir(&self) -> String { + match &self.workdir { + Some(workdir) => format!("KRUN_WORKDIR={workdir}"), + None => "".to_string(), } - None => "".to_string(), } - #[cfg(not(feature = "blk"))] - "".to_string() - } - fn set_env(&mut self, env: String) { - self.env = Some(env); - } + fn set_exec_path(&mut self, exec_path: String) { + self.exec_path = Some(exec_path); + } - fn get_env(&self) -> String { - match &self.env { - Some(env) => env.clone(), - None => "".to_string(), + fn get_exec_path(&self) -> String { + match &self.exec_path { + Some(exec_path) => format!("KRUN_INIT={exec_path}"), + None => "".to_string(), + } } - } - fn set_args(&mut self, args: String) { - self.args = Some(args); - } + #[cfg(all(feature = "blk", not(feature = "tee")))] + fn set_block_root( + &mut self, + device: String, + fstype: Option, + options: Option, + ) { + self.block_root = Some(BlockRootConfig { + device, + fstype, + options, + }); + } - fn get_args(&self) -> String { - match &self.args { - Some(args) => args.clone(), - None => "".to_string(), + fn get_block_root(&self) -> String { + #[cfg(feature = "blk")] + match &self.block_root { + Some(block_root) => { + let mut res = format!("KRUN_BLOCK_ROOT_DEVICE={}", block_root.device); + if let Some(fstype) = &block_root.fstype { + res += &format!(" KRUN_BLOCK_ROOT_FSTYPE={fstype}"); + } + if let Some(options) = &block_root.options { + res += &format!(" KRUN_BLOCK_ROOT_OPTIONS={options}"); + } + res + } + None => "".to_string(), + } + #[cfg(not(feature = "blk"))] + "".to_string() } - } - fn set_rlimits(&mut self, rlimits: String) { - self.rlimits = Some(rlimits); - } + fn set_env(&mut self, env: String) { + self.env = Some(env); + } - fn get_rlimits(&self) -> String { - match &self.rlimits { - Some(rlimits) => format!("KRUN_RLIMITS={rlimits}"), - None => "".to_string(), + fn get_env(&self) -> String { + match &self.env { + Some(env) => env.clone(), + None => "".to_string(), + } } - } - #[cfg(feature = "blk")] - fn add_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { - self.block_cfgs.push(block_cfg); - } + fn set_args(&mut self, args: String) { + self.args = Some(args); + } - #[cfg(feature = "blk")] - fn set_root_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { - self.root_block_cfg = Some(block_cfg); - } + fn get_args(&self) -> String { + match &self.args { + Some(args) => args.clone(), + None => "".to_string(), + } + } - #[cfg(feature = "blk")] - fn set_data_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { - self.data_block_cfg = Some(block_cfg); - } + fn set_rlimits(&mut self, rlimits: String) { + self.rlimits = Some(rlimits); + } - #[cfg(feature = "blk")] - fn get_block_cfg(&self) -> Vec { - // For backwards compat, when cfgs is empty (the new API is not used), this needs to be - // root and then data, in that order. Also for backwards compat, root/data are setters and - // need to discard redundant calls. So we have simple setters above and fix up here. - // - // When the new API is used, this is simpler. - if self.block_cfgs.is_empty() { - [&self.root_block_cfg, &self.data_block_cfg] - .into_iter() - .filter_map(|cfg| cfg.clone()) - .collect() - } else { - self.block_cfgs.clone() + fn get_rlimits(&self) -> String { + match &self.rlimits { + Some(rlimits) => format!("KRUN_RLIMITS={rlimits}"), + None => "".to_string(), + } } - } - #[cfg(feature = "net")] - fn set_net_mac(&mut self, mac: [u8; 6]) { - self.legacy_mac = Some(mac); - } + #[cfg(feature = "blk")] + fn add_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { + self.block_cfgs.push(block_cfg); + } - fn set_port_map(&mut self, new_port_map: HashMap) -> Result<(), ()> { - if self.net_index != 0 { - return Err(()); + #[cfg(feature = "blk")] + fn set_root_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { + self.root_block_cfg = Some(block_cfg); } - self.tsi_port_map.replace(new_port_map); - Ok(()) - } + #[cfg(feature = "blk")] + fn set_data_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { + self.data_block_cfg = Some(block_cfg); + } - #[cfg(feature = "tee")] - fn set_tee_config_file(&mut self, filepath: PathBuf) { - self.tee_config_file = Some(filepath); - } + #[cfg(feature = "blk")] + fn get_block_cfg(&self) -> Vec { + // For backwards compat, when cfgs is empty (the new API is not used), this needs to be + // root and then data, in that order. Also for backwards compat, root/data are setters and + // need to discard redundant calls. So we have simple setters above and fix up here. + // + // When the new API is used, this is simpler. + if self.block_cfgs.is_empty() { + [&self.root_block_cfg, &self.data_block_cfg] + .into_iter() + .filter_map(|cfg| cfg.clone()) + .collect() + } else { + self.block_cfgs.clone() + } + } - #[cfg(feature = "tee")] - fn get_tee_config_file(&self) -> Option { - self.tee_config_file.clone() - } + #[cfg(feature = "net")] + fn set_net_mac(&mut self, mac: [u8; 6]) { + self.legacy_mac = Some(mac); + } - fn add_vsock_port(&mut self, port: u32, filepath: PathBuf, listen: bool) { - if let Some(ref mut map) = &mut self.unix_ipc_port_map { - map.insert(port, (filepath, listen)); - } else { - let mut map: HashMap = HashMap::new(); - map.insert(port, (filepath, listen)); - self.unix_ipc_port_map = Some(map); + fn set_port_map(&mut self, new_port_map: HashMap) -> Result<(), ()> { + if self.net_index != 0 { + return Err(()); + } + + self.tsi_port_map.replace(new_port_map); + Ok(()) } - } - fn set_gpu_virgl_flags(&mut self, virgl_flags: u32) { - self.gpu_virgl_flags = Some(virgl_flags); - } + #[cfg(feature = "tee")] + fn set_tee_config_file(&mut self, filepath: PathBuf) { + self.tee_config_file = Some(filepath); + } - fn set_gpu_shm_size(&mut self, shm_size: usize) { - self.gpu_shm_size = Some(shm_size); - } + #[cfg(feature = "tee")] + fn get_tee_config_file(&self) -> Option { + self.tee_config_file.clone() + } - fn set_vmm_uid(&mut self, vmm_uid: libc::uid_t) { - self.vmm_uid = Some(vmm_uid); - } + fn add_vsock_port(&mut self, port: u32, filepath: PathBuf, listen: bool) { + if let Some(ref mut map) = &mut self.unix_ipc_port_map { + map.insert(port, (filepath, listen)); + } else { + let mut map: HashMap = HashMap::new(); + map.insert(port, (filepath, listen)); + self.unix_ipc_port_map = Some(map); + } + } - fn set_vmm_gid(&mut self, vmm_gid: libc::gid_t) { - self.vmm_gid = Some(vmm_gid); - } -} + fn set_gpu_virgl_flags(&mut self, virgl_flags: u32) { + self.gpu_virgl_flags = Some(virgl_flags); + } -#[cfg(feature = "aws-nitro")] -impl TryFrom for NitroEnclave { - type Error = i32; + fn set_gpu_shm_size(&mut self, shm_size: usize) { + self.gpu_shm_size = Some(shm_size); + } - fn try_from(ctx: ContextConfig) -> Result { - let vm_config = ctx.vmr.vm_config(); + fn set_vmm_uid(&mut self, vmm_uid: libc::uid_t) { + self.vmm_uid = Some(vmm_uid); + } - let Some(mem_size_mib) = vm_config.mem_size_mib else { - error!("memory size not configured"); - return Err(-libc::EINVAL); - }; + fn set_vmm_gid(&mut self, vmm_gid: libc::gid_t) { + self.vmm_gid = Some(vmm_gid); + } + } - let Some(vcpus) = vm_config.vcpu_count else { - error!("vCPU count not configured"); - return Err(-libc::EINVAL); - }; + #[cfg(feature = "aws-nitro")] + impl TryFrom for NitroEnclave { + type Error = i32; - let rootfs = if let Some(path) = &ctx.vmr.fs.first() { - path.shared_dir.clone() - } else { - error!("rootfs path required"); - return Err(-libc::EINVAL); - }; + fn try_from(ctx: ContextConfig) -> Result { + let vm_config = ctx.vmr.vm_config(); - let Some(exec_path) = ctx.exec_path else { - error!("exec path not specified"); - return Err(-libc::EINVAL); - }; + let Some(mem_size_mib) = vm_config.mem_size_mib else { + error!("memory size not configured"); + return Err(-libc::EINVAL); + }; - let Some(exec_env) = ctx.env else { - error!("execution env not specified"); - return Err(-libc::EINVAL); - }; + let Some(vcpus) = vm_config.vcpu_count else { + error!("vCPU count not configured"); + return Err(-libc::EINVAL); + }; - let Some(exec_args) = ctx.args else { - error!("execution args not specified"); - return Err(-libc::EINVAL); - }; + let rootfs = if let Some(path) = &ctx.vmr.fs.first() { + path.shared_dir.clone() + } else { + error!("rootfs path required"); + return Err(-libc::EINVAL); + }; + + let Some(exec_path) = ctx.exec_path else { + error!("exec path not specified"); + return Err(-libc::EINVAL); + }; - let net_unixfd = { - let mut list = ctx.vmr.net.list; - let len = list.len(); - match len { - 0 => None, - 1 => { - let device = list.pop_front().unwrap(); - let device = device.lock().unwrap(); + let Some(exec_env) = ctx.env else { + error!("execution env not specified"); + return Err(-libc::EINVAL); + }; - let fd = match device.cfg_backend { - VirtioNetBackend::UnixstreamFd(fd) => RawFd::from(fd), - _ => return Err(libc::EINVAL), - }; + let Some(exec_args) = ctx.args else { + error!("execution args not specified"); + return Err(-libc::EINVAL); + }; - Some(fd) - } - _ => { - error!( + let net_unixfd = { + let mut list = ctx.vmr.net.list; + let len = list.len(); + match len { + 0 => None, + 1 => { + let device = list.pop_front().unwrap(); + let device = device.lock().unwrap(); + + let fd = match device.cfg_backend { + VirtioNetBackend::UnixstreamFd(fd) => RawFd::from(fd), + _ => return Err(libc::EINVAL), + }; + + Some(fd) + } + _ => { + error!( "more than one network interface configured (max 1 allowed, found {len})" ); - return Err(-libc::EINVAL); + return Err(-libc::EINVAL); + } } - } - }; - - let Some(output_path) = ctx.console_output else { - error!("console output path not specified"); - return Err(-libc::EINVAL); - }; + }; - let debug = KRUN_NITRO_DEBUG.lock().unwrap(); + let Some(output_path) = ctx.console_output else { + error!("console output path not specified"); + return Err(-libc::EINVAL); + }; - Ok(Self { - mem_size_mib, - vcpus, - rootfs, - exec_path, - exec_args, - exec_env, - net_unixfd, - output_path, - debug: *debug, - }) + let debug = KRUN_NITRO_DEBUG.lock().unwrap(); + + Ok(Self { + mem_size_mib, + vcpus, + rootfs, + exec_path, + exec_args, + exec_env, + net_unixfd, + output_path, + debug: *debug, + }) + } } -} -// TODO: Use this everywhere instead of the manual match -#[allow(dead_code)] -fn with_cfg(ctx_id: u32, f: impl FnOnce(&mut ContextConfig) -> i32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => f(ctx_cfg.get_mut()), - Entry::Vacant(_) => -libc::ENOENT, + // TODO: Use this everywhere instead of the manual match + #[allow(dead_code)] + fn with_cfg(ctx_id: u32, f: impl FnOnce(&mut ContextConfig) -> i32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => f(ctx_cfg.get_mut()), + Entry::Vacant(_) => -libc::ENOENT, + } } -} -static CTX_MAP: Lazy>> = Lazy::new(|| Mutex::new(HashMap::new())); -static CTX_IDS: AtomicI32 = AtomicI32::new(0); + static CTX_MAP: Lazy>> = + Lazy::new(|| Mutex::new(HashMap::new())); + static CTX_IDS: AtomicI32 = AtomicI32::new(0); -fn log_level_to_filter_str(level: u32) -> &'static str { - match level { - 0 => "off", - 1 => "error", - 2 => "warn", - 3 => "info", - 4 => "debug", - _ => "trace", + fn log_level_to_filter_str(level: u32) -> &'static str { + match level { + 0 => "off", + 1 => "error", + 2 => "warn", + 3 => "info", + 4 => "debug", + _ => "trace", + } } -} -#[no_mangle] -pub extern "C" fn krun_set_log_level(level: u32) -> i32 { - let filter = log_level_to_filter_str(level); - env_logger::Builder::from_env(Env::default().default_filter_or(filter)) - .format_timestamp_micros() - .init(); + #[no_mangle] + pub extern "C" fn krun_set_log_level(level: u32) -> i32 { + let filter = log_level_to_filter_str(level); + env_logger::Builder::from_env(Env::default().default_filter_or(filter)) + .format_timestamp_micros() + .init(); - #[cfg(feature = "aws-nitro")] - { - // Notify krun-awsnitro to enable debug for log level. - if level == 4 { - let mut debug = KRUN_NITRO_DEBUG.lock().unwrap(); + #[cfg(feature = "aws-nitro")] + { + // Notify krun-awsnitro to enable debug for log level. + if level == 4 { + let mut debug = KRUN_NITRO_DEBUG.lock().unwrap(); - *debug = true; + *debug = true; + } } - } - KRUN_SUCCESS -} + KRUN_SUCCESS + } -mod log_defs { - pub const KRUN_LOG_STYLE_AUTO: u32 = 0; - pub const KRUN_LOG_STYLE_ALWAYS: u32 = 1; - pub const KRUN_LOG_STYLE_NEVER: u32 = 2; - pub const KRUN_LOG_OPTION_NO_ENV: u32 = 1; -} + mod log_defs { + pub const KRUN_LOG_STYLE_AUTO: u32 = 0; + pub const KRUN_LOG_STYLE_ALWAYS: u32 = 1; + pub const KRUN_LOG_STYLE_NEVER: u32 = 2; + pub const KRUN_LOG_OPTION_NO_ENV: u32 = 1; + } -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_init_log(target: RawFd, level: u32, style: u32, options: u32) -> i32 { - let target = match target { + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_init_log( + target: RawFd, + level: u32, + style: u32, + options: u32, + ) -> i32 { + let target = match target { ..-1 => return -libc::EINVAL, -1 => Target::default(), 0 /* stdin */ => return -libc::EINVAL, @@ -491,803 +502,748 @@ pub unsafe extern "C" fn krun_init_log(target: RawFd, level: u32, style: u32, op fd => Target::Pipe(Box::new(File::from_raw_fd(fd))), }; - let filter = log_level_to_filter_str(level); + let filter = log_level_to_filter_str(level); - let write_style = match style { - log_defs::KRUN_LOG_STYLE_AUTO => "auto", - log_defs::KRUN_LOG_STYLE_ALWAYS => "always", - log_defs::KRUN_LOG_STYLE_NEVER => "never", - _ => return -libc::EINVAL, - }; - - let use_env = match options { - 0 => true, - log_defs::KRUN_LOG_OPTION_NO_ENV => false, - _ => return -libc::EINVAL, - }; - - let mut builder = if use_env { - env_logger::Builder::from_env( - Env::new() - .default_filter_or(filter) - .default_write_style_or(write_style), - ) - } else { - let mut builder = env_logger::Builder::new(); - builder.parse_filters(filter).parse_write_style(write_style); - builder - }; - builder.format_timestamp_micros().target(target).init(); - - KRUN_SUCCESS -} + let write_style = match style { + log_defs::KRUN_LOG_STYLE_AUTO => "auto", + log_defs::KRUN_LOG_STYLE_ALWAYS => "always", + log_defs::KRUN_LOG_STYLE_NEVER => "never", + _ => return -libc::EINVAL, + }; -#[no_mangle] -pub extern "C" fn krun_create_ctx() -> i32 { - let shutdown_efd = if cfg!(target_arch = "aarch64") && cfg!(target_os = "macos") { - Some(EventFd::new(utils::eventfd::EFD_NONBLOCK).unwrap()) - } else { - None - }; + let use_env = match options { + 0 => true, + log_defs::KRUN_LOG_OPTION_NO_ENV => false, + _ => return -libc::EINVAL, + }; - let ctx_cfg = { - ContextConfig { - krunfw: KrunfwBindings::new(), - shutdown_efd, - ..Default::default() - } - }; + let mut builder = if use_env { + env_logger::Builder::from_env( + Env::new() + .default_filter_or(filter) + .default_write_style_or(write_style), + ) + } else { + let mut builder = env_logger::Builder::new(); + builder.parse_filters(filter).parse_write_style(write_style); + builder + }; + builder.format_timestamp_micros().target(target).init(); - let ctx_id = CTX_IDS.fetch_add(1, Ordering::SeqCst); - if ctx_id == i32::MAX || CTX_MAP.lock().unwrap().contains_key(&(ctx_id as u32)) { - // libkrun is not intended to be used as a daemon for managing VMs. - panic!("Context ID namespace exhausted"); + KRUN_SUCCESS } - CTX_MAP.lock().unwrap().insert(ctx_id as u32, ctx_cfg); - ctx_id -} + #[no_mangle] + pub extern "C" fn krun_create_ctx() -> i32 { + let shutdown_efd = if cfg!(target_arch = "aarch64") && cfg!(target_os = "macos") { + Some(EventFd::new(utils::eventfd::EFD_NONBLOCK).unwrap()) + } else { + None + }; -#[no_mangle] -pub extern "C" fn krun_free_ctx(ctx_id: u32) -> i32 { - match CTX_MAP.lock().unwrap().remove(&ctx_id) { - Some(_) => KRUN_SUCCESS, - None => -libc::ENOENT, - } -} + let ctx_cfg = { + ContextConfig { + krunfw: KrunfwBindings::new(), + shutdown_efd, + ..Default::default() + } + }; -#[no_mangle] -pub extern "C" fn krun_set_vm_config(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32 { - let mem_size_mib: usize = match ram_mib.try_into() { - Ok(size) => size, - Err(e) => { - warn!("Error parsing the amount of RAM: {e:?}"); - return -libc::EINVAL; + let ctx_id = CTX_IDS.fetch_add(1, Ordering::SeqCst); + if ctx_id == i32::MAX || CTX_MAP.lock().unwrap().contains_key(&(ctx_id as u32)) { + // libkrun is not intended to be used as a daemon for managing VMs. + panic!("Context ID namespace exhausted"); } - }; + CTX_MAP.lock().unwrap().insert(ctx_id as u32, ctx_cfg); - let vm_config = VmConfig { - vcpu_count: Some(num_vcpus), - mem_size_mib: Some(mem_size_mib), - ht_enabled: Some(false), - cpu_template: None, - }; + ctx_id + } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - if ctx_cfg.get_mut().vmr.set_vm_config(&vm_config).is_err() { - return -libc::EINVAL; - } + #[no_mangle] + pub extern "C" fn krun_free_ctx(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().remove(&ctx_id) { + Some(_) => KRUN_SUCCESS, + None => -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, } - KRUN_SUCCESS -} + #[no_mangle] + pub extern "C" fn krun_set_vm_config(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32 { + let mem_size_mib: usize = match ram_mib.try_into() { + Ok(size) => size, + Err(e) => { + warn!("Error parsing the amount of RAM: {e:?}"); + return -libc::EINVAL; + } + }; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) -> i32 { - let root_path = match CStr::from_ptr(c_root_path).to_str() { - Ok(root) => root, - Err(_) => return -libc::EINVAL, - }; + let vm_config = VmConfig { + vcpu_count: Some(num_vcpus), + mem_size_mib: Some(mem_size_mib), + ht_enabled: Some(false), + cpu_template: None, + }; - let fs_id = "/dev/root".to_string(); - let shared_dir = root_path.to_string(); - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.add_fs_device(FsDeviceConfig { - fs_id, - shared_dir, - // Default to a conservative 512 MB window. - shm_size: Some(1 << 29), - allow_root_dir_delete: false, - read_only: false, - }); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + if ctx_cfg.get_mut().vmr.set_vm_config(&vm_config).is_err() { + return -libc::EINVAL; + } + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_add_virtiofs( - ctx_id: u32, - c_tag: *const c_char, - c_path: *const c_char, -) -> i32 { - krun_add_virtiofs3(ctx_id, c_tag, c_path, 0, false) -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_add_virtiofs2( - ctx_id: u32, - c_tag: *const c_char, - c_path: *const c_char, - shm_size: u64, -) -> i32 { - krun_add_virtiofs3(ctx_id, c_tag, c_path, shm_size, false) -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_add_virtiofs3( - ctx_id: u32, - c_tag: *const c_char, - c_path: *const c_char, - shm_size: u64, - read_only: bool, -) -> i32 { - if c_tag.is_null() || c_path.is_null() { - return -libc::EINVAL; - } - - let tag = match CStr::from_ptr(c_tag).to_str() { - Ok(tag) => tag, - Err(_) => return -libc::EINVAL, - }; - let path = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(_) => return -libc::EINVAL, - }; - let shm = if shm_size > 0 { - match shm_size.try_into() { - Ok(s) => Some(s), + KRUN_SUCCESS + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(not(feature = "tee"))] + pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) -> i32 { + let root_path = match CStr::from_ptr(c_root_path).to_str() { + Ok(root) => root, Err(_) => return -libc::EINVAL, - } - } else { - None - }; + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.add_fs_device(FsDeviceConfig { - fs_id: tag.to_string(), - shared_dir: path.to_string(), - shm_size: shm, - allow_root_dir_delete: false, - read_only, - }); + let fs_id = "/dev/root".to_string(); + let shared_dir = root_path.to_string(); + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.add_fs_device(FsDeviceConfig { + fs_id, + shared_dir, + // Default to a conservative 512 MB window. + shm_size: Some(1 << 29), + allow_root_dir_delete: false, + read_only: false, + }); + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(not(feature = "tee"))] -pub unsafe extern "C" fn krun_set_mapped_volumes( - _ctx_id: u32, - _c_mapped_volumes: *const *const c_char, -) -> i32 { - -libc::EINVAL -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_add_disk( - ctx_id: u32, - c_block_id: *const c_char, - c_disk_path: *const c_char, - read_only: bool, -) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; - - let block_id = match CStr::from_ptr(c_block_id).to_str() { - Ok(block_id) => block_id, - Err(_) => return -libc::EINVAL, - }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: block_id.to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: ImageType::Raw, - is_disk_read_only: read_only, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.add_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_add_disk2( - ctx_id: u32, - c_block_id: *const c_char, - c_disk_path: *const c_char, - disk_format: u32, - read_only: bool, -) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; - - let block_id = match CStr::from_ptr(c_block_id).to_str() { - Ok(block_id) => block_id, - Err(_) => return -libc::EINVAL, - }; - - let format = match ImageType::try_from(disk_format) { - Ok(format) => format, - Err(_) => return -libc::EINVAL, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: block_id.to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: format, - is_disk_read_only: read_only, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.add_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_add_disk3( - ctx_id: u32, - c_block_id: *const c_char, - c_disk_path: *const c_char, - disk_format: u32, - read_only: bool, - direct_io: bool, - sync_mode: u32, -) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; + KRUN_SUCCESS + } - let block_id = match CStr::from_ptr(c_block_id).to_str() { - Ok(block_id) => block_id, - Err(_) => return -libc::EINVAL, - }; + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(not(feature = "tee"))] + pub unsafe extern "C" fn krun_add_virtiofs( + ctx_id: u32, + c_tag: *const c_char, + c_path: *const c_char, + ) -> i32 { + krun_add_virtiofs3(ctx_id, c_tag, c_path, 0, false) + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(not(feature = "tee"))] + pub unsafe extern "C" fn krun_add_virtiofs2( + ctx_id: u32, + c_tag: *const c_char, + c_path: *const c_char, + shm_size: u64, + ) -> i32 { + krun_add_virtiofs3(ctx_id, c_tag, c_path, shm_size, false) + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(not(feature = "tee"))] + pub unsafe extern "C" fn krun_add_virtiofs3( + ctx_id: u32, + c_tag: *const c_char, + c_path: *const c_char, + shm_size: u64, + read_only: bool, + ) -> i32 { + if c_tag.is_null() || c_path.is_null() { + return -libc::EINVAL; + } - let format = match ImageType::try_from(disk_format) { - Ok(fmt) => fmt, - Err(_) => return -libc::EINVAL, - }; + let tag = match CStr::from_ptr(c_tag).to_str() { + Ok(tag) => tag, + Err(_) => return -libc::EINVAL, + }; + let path = match CStr::from_ptr(c_path).to_str() { + Ok(path) => path, + Err(_) => return -libc::EINVAL, + }; - let sync_mode = match SyncMode::try_from(sync_mode) { - Ok(mode) => mode, - Err(_) => return -libc::EINVAL, - }; + let shm = if shm_size > 0 { + match shm_size.try_into() { + Ok(s) => Some(s), + Err(_) => return -libc::EINVAL, + } + } else { + None + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: block_id.to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: format, - is_disk_read_only: read_only, - direct_io, - sync_mode, - }; - cfg.add_block_cfg(block_device_config); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.add_fs_device(FsDeviceConfig { + fs_id: tag.to_string(), + shared_dir: path.to_string(), + shm_size: shm, + allow_root_dir_delete: false, + read_only, + }); + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_set_root_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; + KRUN_SUCCESS + } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: "root".to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: ImageType::Raw, - is_disk_read_only: false, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.set_root_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(not(feature = "tee"))] + pub unsafe extern "C" fn krun_set_mapped_volumes( + _ctx_id: u32, + _c_mapped_volumes: *const *const c_char, + ) -> i32 { + -libc::EINVAL } - KRUN_SUCCESS -} + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "blk")] + pub unsafe extern "C" fn krun_add_disk( + ctx_id: u32, + c_block_id: *const c_char, + c_disk_path: *const c_char, + read_only: bool, + ) -> i32 { + let disk_path = match CStr::from_ptr(c_disk_path).to_str() { + Ok(disk) => disk, + Err(_) => return -libc::EINVAL, + }; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "blk")] -pub unsafe extern "C" fn krun_set_data_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { - let disk_path = match CStr::from_ptr(c_disk_path).to_str() { - Ok(disk) => disk, - Err(_) => return -libc::EINVAL, - }; + let block_id = match CStr::from_ptr(c_block_id).to_str() { + Ok(block_id) => block_id, + Err(_) => return -libc::EINVAL, + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let block_device_config = BlockDeviceConfig { - block_id: "data".to_string(), - cache_type: CacheType::auto(disk_path), - disk_image_path: disk_path.to_string(), - disk_image_format: ImageType::Raw, - is_disk_read_only: false, - direct_io: false, - #[cfg(not(target_os = "macos"))] - sync_mode: SyncMode::Full, - #[cfg(target_os = "macos")] - sync_mode: SyncMode::Relaxed, - }; - cfg.set_data_block_cfg(block_device_config); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -/* - * Send the VFKIT magic after establishing the connection, - * as required by gvproxy in vfkit mode. - */ -#[cfg(feature = "net")] -const NET_FLAG_VFKIT: u32 = 1 << 0; -#[cfg(feature = "net")] -const NET_FLAG_DHCP_CLIENT: u32 = 1 << 1; -#[cfg(feature = "net")] -const NET_FLAG_ALL: u32 = NET_FLAG_VFKIT | NET_FLAG_DHCP_CLIENT; - -/* Taken from uapi/linux/virtio_net.h */ -#[cfg(feature = "net")] -const NET_FEATURE_CSUM: u32 = 1 << 0; -#[cfg(feature = "net")] -const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; -#[cfg(feature = "net")] -const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; -#[cfg(feature = "net")] -const NET_FEATURE_GUEST_TSO6: u32 = 1 << 8; -#[cfg(feature = "net")] -const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; -#[cfg(feature = "net")] -const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; -#[cfg(feature = "net")] -const NET_FEATURE_HOST_TSO6: u32 = 1 << 12; -#[cfg(feature = "net")] -const NET_FEATURE_HOST_UFO: u32 = 1 << 14; -/* - * These are the flags enabled by default on each virtio-net instance - * before the introduction of "krun_add_net_*". They are now used in - * the legacy API ("krun_set_passt_fd" and "krun_set_gvproxy_path") - * for compatiblity reasons. - */ -#[cfg(feature = "net")] -const NET_COMPAT_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_UFO; -#[cfg(feature = "net")] -const NET_ALL_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_TSO6 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_TSO6 - | NET_FEATURE_HOST_UFO; - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_add_net_unixstream( - ctx_id: u32, - c_path: *const c_char, - fd: c_int, - c_mac: *const u8, - features: u32, - flags: u32, -) -> i32 { - let path = if !c_path.is_null() { - match CStr::from_ptr(c_path).to_str() { - Ok(path) => Some(PathBuf::from(path)), - Err(_) => None, - } - } else { - None - }; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let block_device_config = BlockDeviceConfig { + block_id: block_id.to_string(), + cache_type: CacheType::auto(disk_path), + disk_image_path: disk_path.to_string(), + disk_image_format: ImageType::Raw, + is_disk_read_only: read_only, + direct_io: false, + #[cfg(not(target_os = "macos"))] + sync_mode: SyncMode::Full, + #[cfg(target_os = "macos")] + sync_mode: SyncMode::Relaxed, + }; + cfg.add_block_cfg(block_device_config); + } + Entry::Vacant(_) => return -libc::ENOENT, + } - if fd >= 0 && path.is_some() { - return -libc::EINVAL; - } - if fd < 0 && path.is_none() { - return -libc::EINVAL; + KRUN_SUCCESS } - let backend = if let Some(path) = path { - VirtioNetBackend::UnixstreamPath(path) - } else { - VirtioNetBackend::UnixstreamFd(fd) - }; - let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { - Ok(m) => m, - Err(_) => return -libc::EINVAL, - }; + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "blk")] + pub unsafe extern "C" fn krun_add_disk2( + ctx_id: u32, + c_block_id: *const c_char, + c_disk_path: *const c_char, + disk_format: u32, + read_only: bool, + ) -> i32 { + let disk_path = match CStr::from_ptr(c_disk_path).to_str() { + Ok(disk) => disk, + Err(_) => return -libc::EINVAL, + }; - if (flags & !NET_FLAG_DHCP_CLIENT) != 0 { - return -libc::EINVAL; - } - let enable_dhcp_client: bool = flags & NET_FLAG_DHCP_CLIENT != 0; + let block_id = match CStr::from_ptr(c_block_id).to_str() { + Ok(block_id) => block_id, + Err(_) => return -libc::EINVAL, + }; - if (features & !NET_ALL_FEATURES) != 0 { - return -libc::EINVAL; - } + let format = match ImageType::try_from(disk_format) { + Ok(format) => format, + Err(_) => return -libc::EINVAL, + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - create_virtio_net(cfg, backend, mac, features); - if enable_dhcp_client { - cfg.vmr.dhcp_client = true; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let block_device_config = BlockDeviceConfig { + block_id: block_id.to_string(), + cache_type: CacheType::auto(disk_path), + disk_image_path: disk_path.to_string(), + disk_image_format: format, + is_disk_read_only: read_only, + direct_io: false, + #[cfg(not(target_os = "macos"))] + sync_mode: SyncMode::Full, + #[cfg(target_os = "macos")] + sync_mode: SyncMode::Relaxed, + }; + cfg.add_block_cfg(block_device_config); } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_add_net_unixgram( - ctx_id: u32, - c_path: *const c_char, - fd: c_int, - c_mac: *const u8, - features: u32, - flags: u32, -) -> i32 { - let path = if !c_path.is_null() { - match CStr::from_ptr(c_path).to_str() { - Ok(path) => Some(PathBuf::from(path)), - Err(_) => None, - } - } else { - None - }; - if fd >= 0 && path.is_some() { - return -libc::EINVAL; - } - if fd < 0 && path.is_none() { - return -libc::EINVAL; + KRUN_SUCCESS } - let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { - Ok(m) => m, - Err(_) => return -libc::EINVAL, - }; + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "blk")] + pub unsafe extern "C" fn krun_add_disk3( + ctx_id: u32, + c_block_id: *const c_char, + c_disk_path: *const c_char, + disk_format: u32, + read_only: bool, + direct_io: bool, + sync_mode: u32, + ) -> i32 { + let disk_path = match CStr::from_ptr(c_disk_path).to_str() { + Ok(disk) => disk, + Err(_) => return -libc::EINVAL, + }; - if (features & !NET_ALL_FEATURES) != 0 { - return -libc::EINVAL; - } + let block_id = match CStr::from_ptr(c_block_id).to_str() { + Ok(block_id) => block_id, + Err(_) => return -libc::EINVAL, + }; - if (flags & !NET_FLAG_ALL) != 0 { - return -libc::EINVAL; - } - let send_vfkit_magic: bool = flags & NET_FLAG_VFKIT != 0; - let enable_dhcp_client: bool = flags & NET_FLAG_DHCP_CLIENT != 0; + let format = match ImageType::try_from(disk_format) { + Ok(fmt) => fmt, + Err(_) => return -libc::EINVAL, + }; - let backend = if let Some(path) = path { - VirtioNetBackend::UnixgramPath(path, send_vfkit_magic) - } else { - VirtioNetBackend::UnixgramFd(fd) - }; + let sync_mode = match SyncMode::try_from(sync_mode) { + Ok(mode) => mode, + Err(_) => return -libc::EINVAL, + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - create_virtio_net(cfg, backend, mac, features); - if enable_dhcp_client { - cfg.vmr.dhcp_client = true; - } - } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_disable_tsi(ctx_id: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.disable_tsi = true; - } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(all(target_os = "linux", feature = "net"))] -pub unsafe extern "C" fn krun_add_net_tap( - ctx_id: u32, - c_tap_name: *const c_char, - c_mac: *const u8, - features: u32, - flags: u32, -) -> i32 { - let tap_name = match CStr::from_ptr(c_tap_name).to_str() { - Ok(tap_name) => tap_name.to_string(), - Err(e) => { - debug!("Error parsing tap_name: {e:?}"); - return -libc::EINVAL; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let block_device_config = BlockDeviceConfig { + block_id: block_id.to_string(), + cache_type: CacheType::auto(disk_path), + disk_image_path: disk_path.to_string(), + disk_image_format: format, + is_disk_read_only: read_only, + direct_io, + sync_mode, + }; + cfg.add_block_cfg(block_device_config); + } + Entry::Vacant(_) => return -libc::ENOENT, } - }; - - let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { - Ok(m) => m, - Err(_) => return -libc::EINVAL, - }; - if (features & !NET_ALL_FEATURES) != 0 { - return -libc::EINVAL; + KRUN_SUCCESS } - if features & (NET_FEATURE_GUEST_TSO4 | NET_FEATURE_GUEST_TSO6 | NET_FEATURE_GUEST_UFO) != 0 - && features & NET_FEATURE_GUEST_CSUM == 0 - { - debug!( - "Network tap backend requires GUEST_CSUM to be requested if any of GUEST_TSO4, GUEST_TSO6 and/or GUEST_UFO are required" - ); - return -libc::EINVAL; - } + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "blk")] + pub unsafe extern "C" fn krun_set_root_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { + let disk_path = match CStr::from_ptr(c_disk_path).to_str() { + Ok(disk) => disk, + Err(_) => return -libc::EINVAL, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let block_device_config = BlockDeviceConfig { + block_id: "root".to_string(), + cache_type: CacheType::auto(disk_path), + disk_image_path: disk_path.to_string(), + disk_image_format: ImageType::Raw, + is_disk_read_only: false, + direct_io: false, + #[cfg(not(target_os = "macos"))] + sync_mode: SyncMode::Full, + #[cfg(target_os = "macos")] + sync_mode: SyncMode::Relaxed, + }; + cfg.set_root_block_cfg(block_device_config); + } + Entry::Vacant(_) => return -libc::ENOENT, + } - if (flags & !NET_FLAG_DHCP_CLIENT) != 0 { - return -libc::EINVAL; + KRUN_SUCCESS } - let enable_dhcp_client: bool = flags & NET_FLAG_DHCP_CLIENT != 0; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - create_virtio_net(cfg, VirtioNetBackend::Tap(tap_name), mac, features); - if enable_dhcp_client { - cfg.vmr.dhcp_client = true; + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "blk")] + pub unsafe extern "C" fn krun_set_data_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { + let disk_path = match CStr::from_ptr(c_disk_path).to_str() { + Ok(disk) => disk, + Err(_) => return -libc::EINVAL, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let block_device_config = BlockDeviceConfig { + block_id: "data".to_string(), + cache_type: CacheType::auto(disk_path), + disk_image_path: disk_path.to_string(), + disk_image_format: ImageType::Raw, + is_disk_read_only: false, + direct_io: false, + #[cfg(not(target_os = "macos"))] + sync_mode: SyncMode::Full, + #[cfg(target_os = "macos")] + sync_mode: SyncMode::Relaxed, + }; + cfg.set_data_block_cfg(block_device_config); } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + + KRUN_SUCCESS } - KRUN_SUCCESS -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(all(not(target_os = "linux"), feature = "net"))] -pub unsafe extern "C" fn krun_add_net_tap( - _ctx_id: u32, - _c_tap_name: *const c_char, - _c_mac: *const u8, - _features: u32, - _flags: u32, -) -> i32 { - -libc::EINVAL -} + /* + * Send the VFKIT magic after establishing the connection, + * as required by gvproxy in vfkit mode. + */ + #[cfg(feature = "net")] + const NET_FLAG_VFKIT: u32 = 1 << 0; + #[cfg(feature = "net")] + const NET_FLAG_DHCP_CLIENT: u32 = 1 << 1; + #[cfg(feature = "net")] + const NET_FLAG_ALL: u32 = NET_FLAG_VFKIT | NET_FLAG_DHCP_CLIENT; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_set_passt_fd(ctx_id: u32, fd: c_int) -> i32 { - if fd < 0 { - return -libc::EINVAL; - } + /* Taken from uapi/linux/virtio_net.h */ + #[cfg(feature = "net")] + const NET_FEATURE_CSUM: u32 = 1 << 0; + #[cfg(feature = "net")] + const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; + #[cfg(feature = "net")] + const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; + #[cfg(feature = "net")] + const NET_FEATURE_GUEST_TSO6: u32 = 1 << 8; + #[cfg(feature = "net")] + const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; + #[cfg(feature = "net")] + const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; + #[cfg(feature = "net")] + const NET_FEATURE_HOST_TSO6: u32 = 1 << 12; + #[cfg(feature = "net")] + const NET_FEATURE_HOST_UFO: u32 = 1 << 14; + /* + * These are the flags enabled by default on each virtio-net instance + * before the introduction of "krun_add_net_*". They are now used in + * the legacy API ("krun_set_passt_fd" and "krun_set_gvproxy_path") + * for compatiblity reasons. + */ + #[cfg(feature = "net")] + const NET_COMPAT_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_UFO; + #[cfg(feature = "net")] + const NET_ALL_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_TSO6 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_TSO6 + | NET_FEATURE_HOST_UFO; + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "net")] + pub unsafe extern "C" fn krun_add_net_unixstream( + ctx_id: u32, + c_path: *const c_char, + fd: c_int, + c_mac: *const u8, + features: u32, + flags: u32, + ) -> i32 { + let path = if !c_path.is_null() { + match CStr::from_ptr(c_path).to_str() { + Ok(path) => Some(PathBuf::from(path)), + Err(_) => None, + } + } else { + None + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - // The legacy interface only supports a single network interface. - if cfg.net_index != 0 { - return -libc::EINVAL; + if fd >= 0 && path.is_some() { + return -libc::EINVAL; + } + if fd < 0 && path.is_none() { + return -libc::EINVAL; + } + let backend = if let Some(path) = path { + VirtioNetBackend::UnixstreamPath(path) + } else { + VirtioNetBackend::UnixstreamFd(fd) + }; + + let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { + Ok(m) => m, + Err(_) => return -libc::EINVAL, + }; + + if (flags & !NET_FLAG_DHCP_CLIENT) != 0 { + return -libc::EINVAL; + } + let enable_dhcp_client: bool = flags & NET_FLAG_DHCP_CLIENT != 0; + + if (features & !NET_ALL_FEATURES) != 0 { + return -libc::EINVAL; + } + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + create_virtio_net(cfg, backend, mac, features); + if enable_dhcp_client { + cfg.vmr.dhcp_client = true; + } } - cfg.legacy_net_cfg = Some(LegacyNetworkConfig::VirtioNetPasst(fd)); + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + KRUN_SUCCESS } - KRUN_SUCCESS -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_set_gvproxy_path(ctx_id: u32, c_path: *const c_char) -> i32 { - let path_str = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(e) => { - debug!("Error parsing gvproxy_path: {e:?}"); + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "net")] + pub unsafe extern "C" fn krun_add_net_unixgram( + ctx_id: u32, + c_path: *const c_char, + fd: c_int, + c_mac: *const u8, + features: u32, + flags: u32, + ) -> i32 { + let path = if !c_path.is_null() { + match CStr::from_ptr(c_path).to_str() { + Ok(path) => Some(PathBuf::from(path)), + Err(_) => None, + } + } else { + None + }; + + if fd >= 0 && path.is_some() { + return -libc::EINVAL; + } + if fd < 0 && path.is_none() { return -libc::EINVAL; } - }; - let path = PathBuf::from(path_str); + let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { + Ok(m) => m, + Err(_) => return -libc::EINVAL, + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - // The legacy interface only supports a single network interface. - if cfg.net_index != 0 { - return -libc::EINVAL; + if (features & !NET_ALL_FEATURES) != 0 { + return -libc::EINVAL; + } + + if (flags & !NET_FLAG_ALL) != 0 { + return -libc::EINVAL; + } + let send_vfkit_magic: bool = flags & NET_FLAG_VFKIT != 0; + let enable_dhcp_client: bool = flags & NET_FLAG_DHCP_CLIENT != 0; + + let backend = if let Some(path) = path { + VirtioNetBackend::UnixgramPath(path, send_vfkit_magic) + } else { + VirtioNetBackend::UnixgramFd(fd) + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + create_virtio_net(cfg, backend, mac, features); + if enable_dhcp_client { + cfg.vmr.dhcp_client = true; + } } - cfg.legacy_net_cfg = Some(LegacyNetworkConfig::VirtioNetGvproxy(path)); + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + KRUN_SUCCESS } - KRUN_SUCCESS -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "net")] -pub unsafe extern "C" fn krun_set_net_mac(ctx_id: u32, c_mac: *const u8) -> i32 { - let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { - Ok(m) => m, - Err(_) => return -libc::EINVAL, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_net_mac(mac); + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "net")] + pub unsafe extern "C" fn krun_disable_tsi(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.disable_tsi = true; + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + KRUN_SUCCESS } - KRUN_SUCCESS -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_port_map(ctx_id: u32, c_port_map: *const *const c_char) -> i32 { - let mut port_map = HashMap::new(); - let port_map_array: &[*const c_char] = slice::from_raw_parts(c_port_map, MAX_ARGS); - for item in port_map_array.iter().take(MAX_ARGS) { - if item.is_null() { - break; - } else { - let s = match CStr::from_ptr(*item).to_str() { - Ok(s) => s, - Err(_) => return -libc::EINVAL, - }; - let port_tuple: Vec<&str> = s.split(':').collect(); - if port_tuple.len() != 2 { + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(all(target_os = "linux", feature = "net"))] + pub unsafe extern "C" fn krun_add_net_tap( + ctx_id: u32, + c_tap_name: *const c_char, + c_mac: *const u8, + features: u32, + flags: u32, + ) -> i32 { + let tap_name = match CStr::from_ptr(c_tap_name).to_str() { + Ok(tap_name) => tap_name.to_string(), + Err(e) => { + debug!("Error parsing tap_name: {e:?}"); return -libc::EINVAL; } - let host_port: u16 = match port_tuple[0].parse() { - Ok(p) => p, - Err(_) => return -libc::EINVAL, - }; - let guest_port: u16 = match port_tuple[1].parse() { - Ok(p) => p, - Err(_) => return -libc::EINVAL, - }; + }; - if port_map.contains_key(&guest_port) { - return -libc::EINVAL; + let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { + Ok(m) => m, + Err(_) => return -libc::EINVAL, + }; + + if (features & !NET_ALL_FEATURES) != 0 { + return -libc::EINVAL; + } + + if features & (NET_FEATURE_GUEST_TSO4 | NET_FEATURE_GUEST_TSO6 | NET_FEATURE_GUEST_UFO) != 0 + && features & NET_FEATURE_GUEST_CSUM == 0 + { + debug!( + "Network tap backend requires GUEST_CSUM to be requested if any of GUEST_TSO4, GUEST_TSO6 and/or GUEST_UFO are required" + ); + return -libc::EINVAL; + } + + if (flags & !NET_FLAG_DHCP_CLIENT) != 0 { + return -libc::EINVAL; + } + let enable_dhcp_client: bool = flags & NET_FLAG_DHCP_CLIENT != 0; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + create_virtio_net(cfg, VirtioNetBackend::Tap(tap_name), mac, features); + if enable_dhcp_client { + cfg.vmr.dhcp_client = true; + } } - for hp in port_map.values() { - if *hp == host_port { + Entry::Vacant(_) => return -libc::ENOENT, + } + KRUN_SUCCESS + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(all(not(target_os = "linux"), feature = "net"))] + pub unsafe extern "C" fn krun_add_net_tap( + _ctx_id: u32, + _c_tap_name: *const c_char, + _c_mac: *const u8, + _features: u32, + _flags: u32, + ) -> i32 { + -libc::EINVAL + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "net")] + pub unsafe extern "C" fn krun_set_passt_fd(ctx_id: u32, fd: c_int) -> i32 { + if fd < 0 { + return -libc::EINVAL; + } + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + // The legacy interface only supports a single network interface. + if cfg.net_index != 0 { return -libc::EINVAL; } + cfg.legacy_net_cfg = Some(LegacyNetworkConfig::VirtioNetPasst(fd)); } - port_map.insert(guest_port, host_port); + Entry::Vacant(_) => return -libc::ENOENT, } + KRUN_SUCCESS } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - if cfg.vsock_config == VsockConfig::Disabled { - return -libc::ENODEV; - } - if cfg.set_port_map(port_map).is_err() { + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "net")] + pub unsafe extern "C" fn krun_set_gvproxy_path(ctx_id: u32, c_path: *const c_char) -> i32 { + let path_str = match CStr::from_ptr(c_path).to_str() { + Ok(path) => path, + Err(e) => { + debug!("Error parsing gvproxy_path: {e:?}"); return -libc::EINVAL; } + }; + + let path = PathBuf::from(path_str); + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + // The legacy interface only supports a single network interface. + if cfg.net_index != 0 { + return -libc::EINVAL; + } + cfg.legacy_net_cfg = Some(LegacyNetworkConfig::VirtioNetGvproxy(path)); + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + KRUN_SUCCESS } - KRUN_SUCCESS -} + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "net")] + pub unsafe extern "C" fn krun_set_net_mac(ctx_id: u32, c_mac: *const u8) -> i32 { + let mac: [u8; 6] = match slice::from_raw_parts(c_mac, 6).try_into() { + Ok(m) => m, + Err(_) => return -libc::EINVAL, + }; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_rlimits(ctx_id: u32, c_rlimits: *const *const c_char) -> i32 { - let rlimits = if c_rlimits.is_null() { - return -libc::EINVAL; - } else { - let mut strvec = Vec::new(); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_net_mac(mac); + } + Entry::Vacant(_) => return -libc::ENOENT, + } + KRUN_SUCCESS + } - let array: &[*const c_char] = slice::from_raw_parts(c_rlimits, MAX_ARGS); - for item in array.iter().take(MAX_ARGS) { + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_port_map( + ctx_id: u32, + c_port_map: *const *const c_char, + ) -> i32 { + let mut port_map = HashMap::new(); + let port_map_array: &[*const c_char] = slice::from_raw_parts(c_port_map, MAX_ARGS); + for item in port_map_array.iter().take(MAX_ARGS) { if item.is_null() { break; } else { @@ -1295,1570 +1251,1641 @@ pub unsafe extern "C" fn krun_set_rlimits(ctx_id: u32, c_rlimits: *const *const Ok(s) => s, Err(_) => return -libc::EINVAL, }; - strvec.push(s); + let port_tuple: Vec<&str> = s.split(':').collect(); + if port_tuple.len() != 2 { + return -libc::EINVAL; + } + let host_port: u16 = match port_tuple[0].parse() { + Ok(p) => p, + Err(_) => return -libc::EINVAL, + }; + let guest_port: u16 = match port_tuple[1].parse() { + Ok(p) => p, + Err(_) => return -libc::EINVAL, + }; + + if port_map.contains_key(&guest_port) { + return -libc::EINVAL; + } + for hp in port_map.values() { + if *hp == host_port { + return -libc::EINVAL; + } + } + port_map.insert(guest_port, host_port); } } - format!("\"{}\"", strvec.join(",")) - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - ctx_cfg.get_mut().set_rlimits(rlimits); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + if cfg.vsock_config == VsockConfig::Disabled { + return -libc::ENODEV; + } + if cfg.set_port_map(port_map).is_err() { + return -libc::EINVAL; + } + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + + KRUN_SUCCESS } - KRUN_SUCCESS -} + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_rlimits(ctx_id: u32, c_rlimits: *const *const c_char) -> i32 { + let rlimits = if c_rlimits.is_null() { + return -libc::EINVAL; + } else { + let mut strvec = Vec::new(); -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_workdir(ctx_id: u32, c_workdir_path: *const c_char) -> i32 { - let workdir_path = match CStr::from_ptr(c_workdir_path).to_str() { - Ok(workdir) => workdir, - Err(_) => return -libc::EINVAL, - }; + let array: &[*const c_char] = slice::from_raw_parts(c_rlimits, MAX_ARGS); + for item in array.iter().take(MAX_ARGS) { + if item.is_null() { + break; + } else { + let s = match CStr::from_ptr(*item).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + strvec.push(s); + } + } + + format!("\"{}\"", strvec.join(",")) + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - ctx_cfg.get_mut().set_workdir(workdir_path.to_string()); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().set_rlimits(rlimits); + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} + KRUN_SUCCESS + } -unsafe fn collapse_str_array(array: &[*const c_char]) -> Result { - let mut strvec = Vec::new(); + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_workdir(ctx_id: u32, c_workdir_path: *const c_char) -> i32 { + let workdir_path = match CStr::from_ptr(c_workdir_path).to_str() { + Ok(workdir) => workdir, + Err(_) => return -libc::EINVAL, + }; - for item in array.iter().take(MAX_ARGS) { - if item.is_null() { - break; - } else { - let s = CStr::from_ptr(*item).to_str()?; - strvec.push(format!("\"{s}\"")); - } - } - - Ok(strvec.join(" ")) -} - -#[allow(clippy::format_collect)] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_exec( - ctx_id: u32, - c_exec_path: *const c_char, - c_argv: *const *const c_char, - c_envp: *const *const c_char, -) -> i32 { - let exec_path = match CStr::from_ptr(c_exec_path).to_str() { - Ok(path) => path, - Err(e) => { - debug!("Error parsing exec_path: {e:?}"); - return -libc::EINVAL; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().set_workdir(workdir_path.to_string()); + } + Entry::Vacant(_) => return -libc::ENOENT, } - }; - let args = if !c_argv.is_null() { - let argv_array: &[*const c_char] = slice::from_raw_parts(c_argv, MAX_ARGS); - match collapse_str_array(argv_array) { - Ok(s) => s, - Err(e) => { - debug!("Error parsing args: {e:?}"); - return -libc::EINVAL; + KRUN_SUCCESS + } + + unsafe fn collapse_str_array(array: &[*const c_char]) -> Result { + let mut strvec = Vec::new(); + + for item in array.iter().take(MAX_ARGS) { + if item.is_null() { + break; + } else { + let s = CStr::from_ptr(*item).to_str()?; + strvec.push(format!("\"{s}\"")); } } - } else { - "".to_string() - }; - let env = if !c_envp.is_null() { - let envp_array: &[*const c_char] = slice::from_raw_parts(c_envp, MAX_ARGS); - match collapse_str_array(envp_array) { - Ok(s) => s, + Ok(strvec.join(" ")) + } + + #[allow(clippy::format_collect)] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_exec( + ctx_id: u32, + c_exec_path: *const c_char, + c_argv: *const *const c_char, + c_envp: *const *const c_char, + ) -> i32 { + let exec_path = match CStr::from_ptr(c_exec_path).to_str() { + Ok(path) => path, Err(e) => { - debug!("Error parsing args: {e:?}"); + debug!("Error parsing exec_path: {e:?}"); return -libc::EINVAL; } - } - } else { - env::vars() - .map(|(key, value)| format!(" {key}=\"{value}\"")) - .collect() - }; + }; + + let args = if !c_argv.is_null() { + let argv_array: &[*const c_char] = slice::from_raw_parts(c_argv, MAX_ARGS); + match collapse_str_array(argv_array) { + Ok(s) => s, + Err(e) => { + debug!("Error parsing args: {e:?}"); + return -libc::EINVAL; + } + } + } else { + "".to_string() + }; + + let env = if !c_envp.is_null() { + let envp_array: &[*const c_char] = slice::from_raw_parts(c_envp, MAX_ARGS); + match collapse_str_array(envp_array) { + Ok(s) => s, + Err(e) => { + debug!("Error parsing args: {e:?}"); + return -libc::EINVAL; + } + } + } else { + env::vars() + .map(|(key, value)| format!(" {key}=\"{value}\"")) + .collect() + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_exec_path(exec_path.to_string()); - cfg.set_env(env); - cfg.set_args(args); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_exec_path(exec_path.to_string()); + cfg.set_env(env); + cfg.set_args(args); + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + + KRUN_SUCCESS } - KRUN_SUCCESS -} + #[allow(clippy::format_collect)] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_env(ctx_id: u32, c_envp: *const *const c_char) -> i32 { + let env = if !c_envp.is_null() { + let envp_array: &[*const c_char] = slice::from_raw_parts(c_envp, MAX_ARGS); + match collapse_str_array(envp_array) { + Ok(s) => s, + Err(e) => { + debug!("Error parsing args: {e:?}"); + return -libc::EINVAL; + } + } + } else { + env::vars() + .map(|(key, value)| format!(" {key}=\"{value}\"")) + .collect() + }; -#[allow(clippy::format_collect)] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_env(ctx_id: u32, c_envp: *const *const c_char) -> i32 { - let env = if !c_envp.is_null() { - let envp_array: &[*const c_char] = slice::from_raw_parts(c_envp, MAX_ARGS); - match collapse_str_array(envp_array) { - Ok(s) => s, - Err(e) => { - debug!("Error parsing args: {e:?}"); - return -libc::EINVAL; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_env(env); } + Entry::Vacant(_) => return -libc::ENOENT, } - } else { - env::vars() - .map(|(key, value)| format!(" {key}=\"{value}\"")) - .collect() - }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_env(env); - } - Entry::Vacant(_) => return -libc::ENOENT, + KRUN_SUCCESS } - KRUN_SUCCESS -} + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + #[cfg(feature = "tee")] + pub unsafe extern "C" fn krun_set_tee_config_file( + ctx_id: u32, + c_filepath: *const c_char, + ) -> i32 { + let filepath = match CStr::from_ptr(c_filepath).to_str() { + Ok(f) => f, + Err(_) => return -libc::EINVAL, + }; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -#[cfg(feature = "tee")] -pub unsafe extern "C" fn krun_set_tee_config_file(ctx_id: u32, c_filepath: *const c_char) -> i32 { - let filepath = match CStr::from_ptr(c_filepath).to_str() { - Ok(f) => f, - Err(_) => return -libc::EINVAL, - }; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_tee_config_file(PathBuf::from(filepath.to_string())); + } + Entry::Vacant(_) => return -libc::ENOENT, + } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_tee_config_file(PathBuf::from(filepath.to_string())); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_vsock_port( - ctx_id: u32, - port: u32, - c_filepath: *const c_char, -) -> i32 { - krun_add_vsock_port2(ctx_id, port, c_filepath, false) -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_vsock_port2( - ctx_id: u32, - port: u32, - c_filepath: *const c_char, - listen: bool, -) -> i32 { - #[cfg(feature = "aws-nitro")] - if listen { - return -libc::EINVAL; + KRUN_SUCCESS } - let filepath = match CStr::from_ptr(c_filepath).to_str() { - Ok(f) => PathBuf::from(f.to_string()), - Err(_) => return -libc::EINVAL, - }; + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_vsock_port( + ctx_id: u32, + port: u32, + c_filepath: *const c_char, + ) -> i32 { + krun_add_vsock_port2(ctx_id, port, c_filepath, false) + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_vsock_port2( + ctx_id: u32, + port: u32, + c_filepath: *const c_char, + listen: bool, + ) -> i32 { + #[cfg(feature = "aws-nitro")] + if listen { + return -libc::EINVAL; + } - if listen { - match filepath.try_exists() { - Ok(true) => return -libc::EEXIST, + let filepath = match CStr::from_ptr(c_filepath).to_str() { + Ok(f) => PathBuf::from(f.to_string()), Err(_) => return -libc::EINVAL, - _ => {} - } - } + }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - if cfg.vsock_config == VsockConfig::Disabled { - return -libc::ENODEV; + if listen { + match filepath.try_exists() { + Ok(true) => return -libc::EEXIST, + Err(_) => return -libc::EINVAL, + _ => {} } - cfg.add_vsock_port(port, filepath, listen); - } - Entry::Vacant(_) => return -libc::ENOENT, - } + } - KRUN_SUCCESS -} + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + if cfg.vsock_config == VsockConfig::Disabled { + return -libc::ENODEV; + } + cfg.add_vsock_port(port, filepath, listen); + } + Entry::Vacant(_) => return -libc::ENOENT, + } -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_gpu_options(ctx_id: u32, virgl_flags: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_gpu_virgl_flags(virgl_flags); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_gpu_options2( - ctx_id: u32, - virgl_flags: u32, - shm_size: u64, -) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_gpu_virgl_flags(virgl_flags); - cfg.set_gpu_shm_size(shm_size.try_into().unwrap()); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[cfg(not(feature = "gpu"))] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub extern "C" fn krun_set_display_backend( - _ctx_id: u32, - _features: u32, - _vtable: *const c_void, - _vtable_size: usize, -) -> i32 { - -libc::ENOTSUP -} - -#[cfg(feature = "gpu")] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub extern "C" fn krun_set_display_backend( - ctx_id: u32, - vtable: *const c_void, - vtable_size: usize, -) -> i32 { - if vtable_size < size_of::() { - return -libc::EINVAL; - } - - // SAFETY: We have checked the vtable size is fine, otherwise we have to trust the user. Just - // to be extra careful, this uses read_unaligned, but we could probably get away with ptr::read. - let display_backend: DisplayBackend = - unsafe { std::ptr::read_unaligned(vtable as *const DisplayBackend) }; - - if !display_backend.verify() { - return -libc::EINVAL; - } - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.display_backend = Some(display_backend); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[cfg(not(feature = "input"))] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub extern "C" fn krun_add_input_device( - _ctx_id: u32, - _config_backend: *const c_void, - _config_backend_size: size_t, - _event_provider_backend: *const c_void, - _event_provider_backend_size: size_t, -) -> i32 { - -libc::ENOTSUP -} - -#[cfg(feature = "input")] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub extern "C" fn krun_add_input_device_fd(ctx_id: u32, input_fd: i32) -> i32 { - use devices::virtio::input::passthrough::PassthroughInputBackend; - use krun_input::{IntoInputConfig, IntoInputEvents}; - - if input_fd < 0 { - return -libc::EINVAL; - } - // TODO: currently we let the fd (and it's Box allocation) live forever, we should eventually fix - // this - let input_fd = unsafe { - // SAFETY: The user provided fd should be valid. Its lifetime is 'static because it will - // exist until libkrun _exits the process - BorrowedFd::borrow_raw(input_fd) - }; - let borrowed_fd: &'static BorrowedFd<'static> = Box::leak(Box::new(input_fd)); + KRUN_SUCCESS + } - let config_backend = PassthroughInputBackend::into_input_config(Some(borrowed_fd)); - let events_backend = PassthroughInputBackend::into_input_events(Some(borrowed_fd)); + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_gpu_options(ctx_id: u32, virgl_flags: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_gpu_virgl_flags(virgl_flags); + } + Entry::Vacant(_) => return -libc::ENOENT, + } - with_cfg(ctx_id, |cfg| { - cfg.vmr - .input_backends - .push((config_backend, events_backend)); KRUN_SUCCESS - }) -} - -#[cfg(feature = "input")] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_input_device( - ctx_id: u32, - config_backend: *const InputConfigBackend<'static>, - config_backend_size: size_t, - event_provider_backend: *const InputEventProviderBackend<'static>, - event_provider_backend_size: size_t, -) -> i32 { - if config_backend.is_null() || event_provider_backend.is_null() { - return -libc::EINVAL; } - if config_backend_size < size_of::() - || event_provider_backend_size < size_of::() - { - return -libc::EINVAL; + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_gpu_options2( + ctx_id: u32, + virgl_flags: u32, + shm_size: u64, + ) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_gpu_virgl_flags(virgl_flags); + cfg.set_gpu_shm_size(shm_size.try_into().unwrap()); + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS } - let config_backend = unsafe { *config_backend }; - let events_backend = unsafe { *event_provider_backend }; + #[cfg(not(feature = "gpu"))] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub extern "C" fn krun_set_display_backend( + _ctx_id: u32, + _features: u32, + _vtable: *const c_void, + _vtable_size: usize, + ) -> i32 { + -libc::ENOTSUP + } + + #[cfg(feature = "gpu")] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub extern "C" fn krun_set_display_backend( + ctx_id: u32, + vtable: *const c_void, + vtable_size: usize, + ) -> i32 { + if vtable_size < size_of::() { + return -libc::EINVAL; + } - if !config_backend.verify() || !events_backend.verify() { - return -libc::EINVAL; - } + // SAFETY: We have checked the vtable size is fine, otherwise we have to trust the user. Just + // to be extra careful, this uses read_unaligned, but we could probably get away with ptr::read. + let display_backend: DisplayBackend = + unsafe { std::ptr::read_unaligned(vtable as *const DisplayBackend) }; - with_cfg(ctx_id, |cfg| { - cfg.vmr - .input_backends - .push((config_backend, events_backend)); - KRUN_SUCCESS - }) -} - -#[cfg(not(feature = "input"))] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_input_device_fd(_ctx_id: u32, _input_fd: i32) -> i32 { - -libc::ENOTSUP -} - -#[cfg(feature = "gpu")] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_display(ctx_id: u32, width: u32, height: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - if cfg.vmr.displays.len() >= MAX_DISPLAYS { - return -libc::ENOMEM; - } - - cfg.vmr.displays.push(DisplayInfo::new(width, height)); - (cfg.vmr.displays.len() - 1) as i32 - } - Entry::Vacant(_) => -libc::ENOENT, - } -} - -#[cfg(not(feature = "gpu"))] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_display(_ctx_id: u32, _width: u32, _height: u32) -> i32 { - -libc::ENOTSUP -} - -#[cfg(feature = "gpu")] -#[no_mangle] -pub extern "C" fn krun_display_set_refresh_rate( - ctx_id: u32, - display_id: u32, - refresh_rate: u32, -) -> i32 { - with_cfg(ctx_id, |cfg| { - let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { + if !display_backend.verify() { return -libc::EINVAL; - }; + } - let DisplayInfoEdid::Generated(ref mut edid_params) = display_info.edid else { - return -libc::EALREADY; - }; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.display_backend = Some(display_backend); + } + Entry::Vacant(_) => return -libc::ENOENT, + } - edid_params.refresh_rate = refresh_rate; KRUN_SUCCESS - }) -} - -#[cfg(not(feature = "gpu"))] -#[no_mangle] -pub extern "C" fn krun_display_set_refresh_rate( - _ctx_id: u32, - _display_id: u32, - _refresh_rate: u32, -) -> i32 { - -libc::ENOTSUP -} - -#[cfg(feature = "gpu")] -#[no_mangle] -#[allow(clippy::missing_safety_doc)] -pub unsafe extern "C" fn krun_display_set_edid( - ctx_id: u32, - display_id: u32, - edid: *const u8, - size: size_t, -) -> i32 { - with_cfg(ctx_id, |cfg| { - let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { + } + + #[cfg(not(feature = "input"))] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub extern "C" fn krun_add_input_device( + _ctx_id: u32, + _config_backend: *const c_void, + _config_backend_size: size_t, + _event_provider_backend: *const c_void, + _event_provider_backend_size: size_t, + ) -> i32 { + -libc::ENOTSUP + } + + #[cfg(feature = "input")] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub extern "C" fn krun_add_input_device_fd(ctx_id: u32, input_fd: i32) -> i32 { + use devices::virtio::input::passthrough::PassthroughInputBackend; + use krun_input::{IntoInputConfig, IntoInputEvents}; + + if input_fd < 0 { return -libc::EINVAL; + } + // TODO: currently we let the fd (and it's Box allocation) live forever, we should eventually fix + // this + let input_fd = unsafe { + // SAFETY: The user provided fd should be valid. Its lifetime is 'static because it will + // exist until libkrun _exits the process + BorrowedFd::borrow_raw(input_fd) }; + let borrowed_fd: &'static BorrowedFd<'static> = Box::leak(Box::new(input_fd)); - if edid.is_null() { + let config_backend = PassthroughInputBackend::into_input_config(Some(borrowed_fd)); + let events_backend = PassthroughInputBackend::into_input_events(Some(borrowed_fd)); + + with_cfg(ctx_id, |cfg| { + cfg.vmr + .input_backends + .push((config_backend, events_backend)); + KRUN_SUCCESS + }) + } + + #[cfg(feature = "input")] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_input_device( + ctx_id: u32, + config_backend: *const InputConfigBackend<'static>, + config_backend_size: size_t, + event_provider_backend: *const InputEventProviderBackend<'static>, + event_provider_backend_size: size_t, + ) -> i32 { + if config_backend.is_null() || event_provider_backend.is_null() { return -libc::EINVAL; } - let blob = unsafe { slice::from_raw_parts(edid, size) }; + if config_backend_size < size_of::() + || event_provider_backend_size < size_of::() + { + return -libc::EINVAL; + } - display_info.edid = DisplayInfoEdid::Provided(Box::from(blob)); - KRUN_SUCCESS - }) -} - -#[cfg(not(feature = "gpu"))] -#[no_mangle] -#[allow(clippy::missing_safety_doc)] -pub unsafe extern "C" fn krun_display_set_edid( - _ctx_id: u32, - _display_id: u32, - _edid: *const u8, - _size: size_t, -) -> i32 { - -libc::ENOTSUP -} - -#[cfg(feature = "gpu")] -#[no_mangle] -pub extern "C" fn krun_display_set_physical_size( - ctx_id: u32, - display_id: u32, - width_mm: u16, - height_mm: u16, -) -> i32 { - with_cfg(ctx_id, |cfg| { - let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { + let config_backend = unsafe { *config_backend }; + let events_backend = unsafe { *event_provider_backend }; + + if !config_backend.verify() || !events_backend.verify() { return -libc::EINVAL; - }; - let DisplayInfoEdid::Generated(ref mut edid_params) = display_info.edid else { - return -libc::EALREADY; - }; - edid_params.physical_size = PhysicalSize::DimensionsMillimeters(width_mm, height_mm); + } + + with_cfg(ctx_id, |cfg| { + cfg.vmr + .input_backends + .push((config_backend, events_backend)); + KRUN_SUCCESS + }) + } + + #[cfg(not(feature = "input"))] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_input_device_fd(_ctx_id: u32, _input_fd: i32) -> i32 { + -libc::ENOTSUP + } + + #[cfg(feature = "gpu")] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_display(ctx_id: u32, width: u32, height: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + if cfg.vmr.displays.len() >= MAX_DISPLAYS { + return -libc::ENOMEM; + } + + cfg.vmr.displays.push(DisplayInfo::new(width, height)); + (cfg.vmr.displays.len() - 1) as i32 + } + Entry::Vacant(_) => -libc::ENOENT, + } + } + + #[cfg(not(feature = "gpu"))] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_display(_ctx_id: u32, _width: u32, _height: u32) -> i32 { + -libc::ENOTSUP + } + + #[cfg(feature = "gpu")] + #[no_mangle] + pub extern "C" fn krun_display_set_refresh_rate( + ctx_id: u32, + display_id: u32, + refresh_rate: u32, + ) -> i32 { + with_cfg(ctx_id, |cfg| { + let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { + return -libc::EINVAL; + }; + + let DisplayInfoEdid::Generated(ref mut edid_params) = display_info.edid else { + return -libc::EALREADY; + }; + + edid_params.refresh_rate = refresh_rate; + KRUN_SUCCESS + }) + } + + #[cfg(not(feature = "gpu"))] + #[no_mangle] + pub extern "C" fn krun_display_set_refresh_rate( + _ctx_id: u32, + _display_id: u32, + _refresh_rate: u32, + ) -> i32 { + -libc::ENOTSUP + } + + #[cfg(feature = "gpu")] + #[no_mangle] + #[allow(clippy::missing_safety_doc)] + pub unsafe extern "C" fn krun_display_set_edid( + ctx_id: u32, + display_id: u32, + edid: *const u8, + size: size_t, + ) -> i32 { + with_cfg(ctx_id, |cfg| { + let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { + return -libc::EINVAL; + }; + + if edid.is_null() { + return -libc::EINVAL; + } + + let blob = unsafe { slice::from_raw_parts(edid, size) }; + + display_info.edid = DisplayInfoEdid::Provided(Box::from(blob)); + KRUN_SUCCESS + }) + } + + #[cfg(not(feature = "gpu"))] + #[no_mangle] + #[allow(clippy::missing_safety_doc)] + pub unsafe extern "C" fn krun_display_set_edid( + _ctx_id: u32, + _display_id: u32, + _edid: *const u8, + _size: size_t, + ) -> i32 { + -libc::ENOTSUP + } + + #[cfg(feature = "gpu")] + #[no_mangle] + pub extern "C" fn krun_display_set_physical_size( + ctx_id: u32, + display_id: u32, + width_mm: u16, + height_mm: u16, + ) -> i32 { + with_cfg(ctx_id, |cfg| { + let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { + return -libc::EINVAL; + }; + let DisplayInfoEdid::Generated(ref mut edid_params) = display_info.edid else { + return -libc::EALREADY; + }; + edid_params.physical_size = PhysicalSize::DimensionsMillimeters(width_mm, height_mm); + KRUN_SUCCESS + }) + } + + #[cfg(not(feature = "gpu"))] + #[no_mangle] + pub extern "C" fn krun_display_set_physical_size( + _ctx_id: u32, + _display_id: u32, + _width_mm: u16, + _height_mm: u16, + ) -> i32 { + -libc::ENOTSUP + } + + #[cfg(feature = "gpu")] + #[no_mangle] + #[allow(clippy::missing_safety_doc)] + pub extern "C" fn krun_display_set_dpi(ctx_id: u32, display_id: u32, dpi: u32) -> i32 { + with_cfg(ctx_id, |cfg| { + let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { + return -libc::EINVAL; + }; + let DisplayInfoEdid::Generated(ref mut edid_params) = display_info.edid else { + return -libc::EINVAL; + }; + edid_params.physical_size = PhysicalSize::Dpi(dpi); + KRUN_SUCCESS + }) + } + + #[cfg(not(feature = "gpu"))] + #[no_mangle] + pub extern "C" fn krun_display_set_dpi(_ctx_id: u32, _display_id: u32, _dpi: u32) -> i32 { + -libc::ENOTSUP + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_snd_device(ctx_id: u32, enable: bool) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.enable_snd = enable; + } + Entry::Vacant(_) => return -libc::ENOENT, + } + KRUN_SUCCESS - }) -} - -#[cfg(not(feature = "gpu"))] -#[no_mangle] -pub extern "C" fn krun_display_set_physical_size( - _ctx_id: u32, - _display_id: u32, - _width_mm: u16, - _height_mm: u16, -) -> i32 { - -libc::ENOTSUP -} - -#[cfg(feature = "gpu")] -#[no_mangle] -#[allow(clippy::missing_safety_doc)] -pub extern "C" fn krun_display_set_dpi(ctx_id: u32, display_id: u32, dpi: u32) -> i32 { - with_cfg(ctx_id, |cfg| { - let Some(display_info) = cfg.vmr.displays.get_mut(display_id as usize) else { - return -libc::EINVAL; + } + + #[allow(unused_assignments)] + #[no_mangle] + pub extern "C" fn krun_get_shutdown_eventfd(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + if let Some(efd) = cfg.shutdown_efd.as_ref() { + #[cfg(target_os = "macos")] + return efd.get_write_fd(); + #[cfg(target_os = "linux")] + return efd.as_raw_fd(); + } else { + -libc::EINVAL + } + } + Entry::Vacant(_) => -libc::ENOENT, + } + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_console_output( + ctx_id: u32, + c_filepath: *const c_char, + ) -> i32 { + let filepath = match CStr::from_ptr(c_filepath).to_str() { + Ok(f) => f, + Err(_) => return -libc::EINVAL, }; - let DisplayInfoEdid::Generated(ref mut edid_params) = display_info.edid else { - return -libc::EINVAL; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + if cfg.console_output.is_some() { + -libc::EINVAL + } else { + cfg.console_output = Some(PathBuf::from(filepath.to_string())); + KRUN_SUCCESS + } + } + Entry::Vacant(_) => -libc::ENOENT, + } + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_nested_virt(ctx_id: u32, enabled: bool) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.nested_enabled = enabled; + KRUN_SUCCESS + } + Entry::Vacant(_) => -libc::ENOENT, + } + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_check_nested_virt() -> i32 { + #[cfg(target_os = "macos")] + match hvf::check_nested_virt() { + Ok(supp) => supp as i32, + Err(_) => -libc::EINVAL, + } + + #[cfg(target_os = "linux")] + { + let paths = [ + "/sys/module/kvm_intel/parameters/nested", + "/sys/module/kvm_amd/parameters/nested", + ]; + if paths.iter().any(|path| { + std::fs::read_to_string(path).is_ok_and(|contents| { + let val = contents.trim(); + val == "1" || val.eq_ignore_ascii_case("Y") + }) + }) { + 1 + } else { + 0 + } + } + + #[cfg(not(any(target_os = "macos", target_os = "linux")))] + -libc::EOPNOTSUPP + } + + const KRUN_FEATURE_NET: u64 = 0; + const KRUN_FEATURE_BLK: u64 = 1; + const KRUN_FEATURE_GPU: u64 = 2; + const KRUN_FEATURE_SND: u64 = 3; + const KRUN_FEATURE_INPUT: u64 = 4; + const KRUN_FEATURE_TEE: u64 = 6; + const KRUN_FEATURE_AMD_SEV: u64 = 7; + const KRUN_FEATURE_INTEL_TDX: u64 = 8; + const KRUN_FEATURE_AWS_NITRO: u64 = 9; + const KRUN_FEATURE_VIRGL_RESOURCE_MAP2: u64 = 10; + + #[no_mangle] + pub extern "C" fn krun_has_feature(feature: u64) -> c_int { + let supported = match feature { + KRUN_FEATURE_NET => cfg!(feature = "net"), + KRUN_FEATURE_BLK => cfg!(feature = "blk"), + KRUN_FEATURE_GPU => cfg!(feature = "gpu"), + KRUN_FEATURE_SND => cfg!(feature = "snd"), + KRUN_FEATURE_INPUT => cfg!(feature = "input"), + KRUN_FEATURE_TEE => cfg!(feature = "tee"), + KRUN_FEATURE_AMD_SEV => cfg!(feature = "amd-sev"), + KRUN_FEATURE_INTEL_TDX => cfg!(feature = "tdx"), + KRUN_FEATURE_AWS_NITRO => cfg!(feature = "aws-nitro"), + KRUN_FEATURE_VIRGL_RESOURCE_MAP2 => cfg!(feature = "virgl_resource_map2"), + _ => return -libc::EINVAL, }; - edid_params.physical_size = PhysicalSize::Dpi(dpi); - KRUN_SUCCESS - }) -} - -#[cfg(not(feature = "gpu"))] -#[no_mangle] -pub extern "C" fn krun_display_set_dpi(_ctx_id: u32, _display_id: u32, _dpi: u32) -> i32 { - -libc::ENOTSUP -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_snd_device(ctx_id: u32, enable: bool) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.enable_snd = enable; - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[allow(unused_assignments)] -#[no_mangle] -pub extern "C" fn krun_get_shutdown_eventfd(ctx_id: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - if let Some(efd) = cfg.shutdown_efd.as_ref() { - #[cfg(target_os = "macos")] - return efd.get_write_fd(); - #[cfg(target_os = "linux")] - return efd.as_raw_fd(); + + supported as c_int + } + + /// Gets the maximum number of vCPUs supported by the hypervisor. + /// + /// Returns the maximum number of vCPUs that can be created by this hypervisor, + /// or a negative error code on failure. + #[cfg(any(target_os = "macos", target_os = "linux"))] + #[no_mangle] + pub extern "C" fn krun_get_max_vcpus() -> i32 { + #[cfg(target_os = "macos")] + { + use hvf::bindings::{hv_vm_get_max_vcpu_count, HV_SUCCESS}; + let mut max_vcpu_count: u32 = 0; + let ret = unsafe { hv_vm_get_max_vcpu_count(&mut max_vcpu_count as *mut u32) }; + if ret == HV_SUCCESS { + max_vcpu_count as i32 } else { + error!("Error retrieving max vcpu count: {ret:?}"); -libc::EINVAL } } - Entry::Vacant(_) => -libc::ENOENT, - } -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_console_output(ctx_id: u32, c_filepath: *const c_char) -> i32 { - let filepath = match CStr::from_ptr(c_filepath).to_str() { - Ok(f) => f, - Err(_) => return -libc::EINVAL, - }; + #[cfg(target_os = "linux")] + { + use kvm_ioctls::Kvm; + match Kvm::new() { + Ok(kvm) => kvm.get_max_vcpus() as i32, + Err(e) => { + error!("Error retrieving max vcpu count: {e:?}"); + -libc::EINVAL + } + } + } + } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - if cfg.console_output.is_some() { - -libc::EINVAL - } else { - cfg.console_output = Some(PathBuf::from(filepath.to_string())); + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub extern "C" fn krun_split_irqchip(ctx_id: u32, enable: bool) -> i32 { + if enable && !cfg!(target_arch = "x86_64") { + return -libc::EINVAL; + } + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.split_irqchip = enable; KRUN_SUCCESS } + Entry::Vacant(_) => -libc::ENOENT, } - Entry::Vacant(_) => -libc::ENOENT, } -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_nested_virt(ctx_id: u32, enabled: bool) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.nested_enabled = enabled; - KRUN_SUCCESS + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_smbios_oem_strings( + ctx_id: u32, + oem_strings: *const *const c_char, + ) -> i32 { + if oem_strings.is_null() { + return -libc::EINVAL; } - Entry::Vacant(_) => -libc::ENOENT, + + let cstr_ptr_slice = slice::from_raw_parts(oem_strings, MAX_ARGS); + + let mut oem_strings = Vec::new(); + + for cstr_ptr in cstr_ptr_slice.iter().take_while(|p| !p.is_null()) { + let Ok(s) = CStr::from_ptr(*cstr_ptr).to_str() else { + return -libc::EINVAL; + }; + oem_strings.push(s.to_string()); + } + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().vmr.smbios_oem_strings = + (!oem_strings.is_empty()).then_some(oem_strings) + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS } -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_check_nested_virt() -> i32 { - #[cfg(target_os = "macos")] - match hvf::check_nested_virt() { - Ok(supp) => supp as i32, - Err(_) => -libc::EINVAL, + #[cfg(feature = "net")] + fn create_virtio_net( + ctx_cfg: &mut ContextConfig, + backend: VirtioNetBackend, + mac: [u8; 6], + features: u32, + ) { + let network_interface_config = NetworkInterfaceConfig { + iface_id: format!("eth{}", ctx_cfg.net_index), + backend, + mac, + features, + }; + ctx_cfg.net_index += 1; + ctx_cfg + .vmr + .add_network_interface(network_interface_config) + .expect("Failed to create network interface"); } - #[cfg(target_os = "linux")] - { - let paths = [ - "/sys/module/kvm_intel/parameters/nested", - "/sys/module/kvm_amd/parameters/nested", - ]; - if paths.iter().any(|path| { - std::fs::read_to_string(path).is_ok_and(|contents| { - let val = contents.trim(); - val == "1" || val.eq_ignore_ascii_case("Y") - }) - }) { - 1 - } else { - 0 - } - } - - #[cfg(not(any(target_os = "macos", target_os = "linux")))] - -libc::EOPNOTSUPP -} - -const KRUN_FEATURE_NET: u64 = 0; -const KRUN_FEATURE_BLK: u64 = 1; -const KRUN_FEATURE_GPU: u64 = 2; -const KRUN_FEATURE_SND: u64 = 3; -const KRUN_FEATURE_INPUT: u64 = 4; -const KRUN_FEATURE_TEE: u64 = 6; -const KRUN_FEATURE_AMD_SEV: u64 = 7; -const KRUN_FEATURE_INTEL_TDX: u64 = 8; -const KRUN_FEATURE_AWS_NITRO: u64 = 9; -const KRUN_FEATURE_VIRGL_RESOURCE_MAP2: u64 = 10; - -#[no_mangle] -pub extern "C" fn krun_has_feature(feature: u64) -> c_int { - let supported = match feature { - KRUN_FEATURE_NET => cfg!(feature = "net"), - KRUN_FEATURE_BLK => cfg!(feature = "blk"), - KRUN_FEATURE_GPU => cfg!(feature = "gpu"), - KRUN_FEATURE_SND => cfg!(feature = "snd"), - KRUN_FEATURE_INPUT => cfg!(feature = "input"), - KRUN_FEATURE_TEE => cfg!(feature = "tee"), - KRUN_FEATURE_AMD_SEV => cfg!(feature = "amd-sev"), - KRUN_FEATURE_INTEL_TDX => cfg!(feature = "tdx"), - KRUN_FEATURE_AWS_NITRO => cfg!(feature = "aws-nitro"), - KRUN_FEATURE_VIRGL_RESOURCE_MAP2 => cfg!(feature = "virgl_resource_map2"), - _ => return -libc::EINVAL, - }; + #[cfg(all(target_arch = "x86_64", not(feature = "tee")))] + fn map_kernel(ctx_id: u32, kernel_path: &PathBuf) -> i32 { + let file = match File::options().read(true).write(false).open(kernel_path) { + Ok(file) => file, + Err(err) => { + error!("Error opening external kernel: {err}"); + return -libc::EINVAL; + } + }; - supported as c_int -} + let kernel_size = file.metadata().unwrap().len(); + + let kernel_host_addr = unsafe { + libc::mmap( + std::ptr::null_mut(), + kernel_size as usize, + libc::PROT_READ, + libc::MAP_SHARED, + file.as_raw_fd(), + 0_i64, + ) + }; + if std::ptr::eq(kernel_host_addr, libc::MAP_FAILED) { + error!("Can't load kernel into process map"); + return -libc::EINVAL; + } -/// Gets the maximum number of vCPUs supported by the hypervisor. -/// -/// Returns the maximum number of vCPUs that can be created by this hypervisor, -/// or a negative error code on failure. -#[cfg(any(target_os = "macos", target_os = "linux"))] -#[no_mangle] -pub extern "C" fn krun_get_max_vcpus() -> i32 { - #[cfg(target_os = "macos")] - { - use hvf::bindings::{HV_SUCCESS, hv_vm_get_max_vcpu_count}; - let mut max_vcpu_count: u32 = 0; - let ret = unsafe { hv_vm_get_max_vcpu_count(&mut max_vcpu_count as *mut u32) }; - if ret == HV_SUCCESS { - max_vcpu_count as i32 + let kernel_bundle = KernelBundle { + host_addr: kernel_host_addr as u64, + guest_addr: 0x8000_0000, + entry_addr: 0x8000_0000, + size: kernel_size as usize, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => ctx_cfg + .get_mut() + .vmr + .set_kernel_bundle(kernel_bundle) + .unwrap(), + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS + } + + #[cfg(feature = "tee")] + #[allow(clippy::format_collect)] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_kernel(_ctx_id: u32, _c_kernel_path: *const c_char) -> i32 { + -libc::EOPNOTSUPP + } + + #[cfg(not(feature = "tee"))] + #[allow(clippy::format_collect)] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_kernel( + ctx_id: u32, + c_kernel_path: *const c_char, + kernel_format: u32, + c_initramfs_path: *const c_char, + c_cmdline: *const c_char, + ) -> i32 { + let path = match CStr::from_ptr(c_kernel_path).to_str() { + Ok(path) => PathBuf::from(path), + Err(e) => { + error!("Error parsing kernel_path: {e:?}"); + return -libc::EINVAL; + } + }; + + let format = match kernel_format { + // For raw kernels in x86_64, we map the kernel into the + // process and treat it as a bundled kernel. + #[cfg(all(target_arch = "x86_64", not(feature = "tee")))] + 0 => return map_kernel(ctx_id, &path), + #[cfg(target_arch = "aarch64")] + 0 => KernelFormat::Raw, + 1 => KernelFormat::Elf, + 2 => KernelFormat::PeGz, + 3 => KernelFormat::ImageBz2, + 4 => KernelFormat::ImageGz, + 5 => KernelFormat::ImageZstd, + _ => { + return -libc::EINVAL; + } + }; + + let (initramfs_path, initramfs_size) = if !c_initramfs_path.is_null() { + match CStr::from_ptr(c_initramfs_path).to_str() { + Ok(path) => { + let path = PathBuf::from(path); + let size = match std::fs::metadata(&path) { + Ok(metadata) => metadata.len(), + Err(e) => { + error!("Can't read initramfs metadata: {e:?}"); + return -libc::EINVAL; + } + }; + (Some(path), size) + } + Err(e) => { + error!("Error parsing initramfs path: {e:?}"); + return -libc::EINVAL; + } + } + } else { + (None, 0) + }; + + let cmdline = if !c_cmdline.is_null() { + match CStr::from_ptr(c_cmdline).to_str() { + Ok(cmdline) => Some(cmdline.to_string()), + Err(e) => { + error!("Error parsing kernel cmdline: {e:?}"); + return -libc::EINVAL; + } + } } else { - error!("Error retrieving max vcpu count: {ret:?}"); - -libc::EINVAL + None + }; + + let external_kernel = ExternalKernel { + path, + format, + initramfs_path, + initramfs_size, + cmdline, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().vmr.set_external_kernel(external_kernel) + } + Entry::Vacant(_) => return -libc::ENOENT, } + + KRUN_SUCCESS } - #[cfg(target_os = "linux")] - { - use kvm_ioctls::Kvm; - match Kvm::new() { - Ok(kvm) => kvm.get_max_vcpus() as i32, + #[cfg(not(feature = "tee"))] + #[allow(clippy::format_collect)] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_firmware(ctx_id: u32, c_firmware_path: *const c_char) -> i32 { + let path = match CStr::from_ptr(c_firmware_path).to_str() { + Ok(path) => PathBuf::from(path), Err(e) => { - error!("Error retrieving max vcpu count: {e:?}"); - -libc::EINVAL + error!("Error parsing firmware_path: {e:?}"); + return -libc::EINVAL; } - } - } -} + }; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub extern "C" fn krun_split_irqchip(ctx_id: u32, enable: bool) -> i32 { - if enable && !cfg!(target_arch = "x86_64") { - return -libc::EINVAL; - } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.split_irqchip = enable; - KRUN_SUCCESS + let firmware_config = FirmwareConfig { path }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().vmr.set_firmware_config(firmware_config) + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => -libc::ENOENT, - } -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_smbios_oem_strings( - ctx_id: u32, - oem_strings: *const *const c_char, -) -> i32 { - if oem_strings.is_null() { - return -libc::EINVAL; + KRUN_SUCCESS } - let cstr_ptr_slice = slice::from_raw_parts(oem_strings, MAX_ARGS); - - let mut oem_strings = Vec::new(); - - for cstr_ptr in cstr_ptr_slice.iter().take_while(|p| !p.is_null()) { - let Ok(s) = CStr::from_ptr(*cstr_ptr).to_str() else { - return -libc::EINVAL; + unsafe fn load_krunfw_payload( + krunfw: &KrunfwBindings, + vmr: &mut VmResources, + ) -> Result<(), libloading::Error> { + let mut kernel_guest_addr: u64 = 0; + let mut kernel_entry_addr: u64 = 0; + let mut kernel_size: usize = 0; + let kernel_host_addr = unsafe { + (krunfw.get_kernel)( + &mut kernel_guest_addr as *mut u64, + &mut kernel_entry_addr as *mut u64, + &mut kernel_size as *mut usize, + ) }; - oem_strings.push(s.to_string()); - } + let kernel_bundle = KernelBundle { + host_addr: kernel_host_addr as u64, + guest_addr: kernel_guest_addr, + entry_addr: kernel_entry_addr, + size: kernel_size, + }; + vmr.set_kernel_bundle(kernel_bundle).unwrap(); + + #[cfg(feature = "tee")] + { + let mut qboot_size: usize = 0; + let qboot_host_addr = unsafe { (krunfw.get_qboot)(&mut qboot_size as *mut usize) }; + let qboot_bundle = QbootBundle { + host_addr: qboot_host_addr as u64, + size: qboot_size, + }; + vmr.set_qboot_bundle(qboot_bundle).unwrap(); - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - ctx_cfg.get_mut().vmr.smbios_oem_strings = - (!oem_strings.is_empty()).then_some(oem_strings) + let mut initrd_size: usize = 0; + let initrd_host_addr = unsafe { (krunfw.get_initrd)(&mut initrd_size as *mut usize) }; + let initrd_bundle = InitrdBundle { + host_addr: initrd_host_addr as u64, + size: initrd_size, + }; + vmr.set_initrd_bundle(initrd_bundle).unwrap(); } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} + Ok(()) + } -#[cfg(feature = "net")] -fn create_virtio_net( - ctx_cfg: &mut ContextConfig, - backend: VirtioNetBackend, - mac: [u8; 6], - features: u32, -) { - let network_interface_config = NetworkInterfaceConfig { - iface_id: format!("eth{}", ctx_cfg.net_index), - backend, - mac, - features, - }; - ctx_cfg.net_index += 1; - ctx_cfg - .vmr - .add_network_interface(network_interface_config) - .expect("Failed to create network interface"); -} - -#[cfg(all(target_arch = "x86_64", not(feature = "tee")))] -fn map_kernel(ctx_id: u32, kernel_path: &PathBuf) -> i32 { - let file = match File::options().read(true).write(false).open(kernel_path) { - Ok(file) => file, - Err(err) => { - error!("Error opening external kernel: {err}"); - return -libc::EINVAL; + #[no_mangle] + pub extern "C" fn krun_setuid(ctx_id: u32, uid: libc::uid_t) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_vmm_uid(uid); + } + Entry::Vacant(_) => return -libc::ENOENT, } - }; - let kernel_size = file.metadata().unwrap().len(); - - let kernel_host_addr = unsafe { - libc::mmap( - std::ptr::null_mut(), - kernel_size as usize, - libc::PROT_READ, - libc::MAP_SHARED, - file.as_raw_fd(), - 0_i64, - ) - }; - if std::ptr::eq(kernel_host_addr, libc::MAP_FAILED) { - error!("Can't load kernel into process map"); - return -libc::EINVAL; + KRUN_SUCCESS } - let kernel_bundle = KernelBundle { - host_addr: kernel_host_addr as u64, - guest_addr: 0x8000_0000, - entry_addr: 0x8000_0000, - size: kernel_size as usize, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => ctx_cfg - .get_mut() - .vmr - .set_kernel_bundle(kernel_bundle) - .unwrap(), - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[cfg(feature = "tee")] -#[allow(clippy::format_collect)] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_kernel(_ctx_id: u32, _c_kernel_path: *const c_char) -> i32 { - -libc::EOPNOTSUPP -} - -#[cfg(not(feature = "tee"))] -#[allow(clippy::format_collect)] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_kernel( - ctx_id: u32, - c_kernel_path: *const c_char, - kernel_format: u32, - c_initramfs_path: *const c_char, - c_cmdline: *const c_char, -) -> i32 { - let path = match CStr::from_ptr(c_kernel_path).to_str() { - Ok(path) => PathBuf::from(path), - Err(e) => { - error!("Error parsing kernel_path: {e:?}"); - return -libc::EINVAL; + #[no_mangle] + pub extern "C" fn krun_setgid(ctx_id: u32, gid: libc::gid_t) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.set_vmm_gid(gid); + } + Entry::Vacant(_) => return -libc::ENOENT, } - }; - let format = match kernel_format { - // For raw kernels in x86_64, we map the kernel into the - // process and treat it as a bundled kernel. - #[cfg(all(target_arch = "x86_64", not(feature = "tee")))] - 0 => return map_kernel(ctx_id, &path), - #[cfg(target_arch = "aarch64")] - 0 => KernelFormat::Raw, - 1 => KernelFormat::Elf, - 2 => KernelFormat::PeGz, - 3 => KernelFormat::ImageBz2, - 4 => KernelFormat::ImageGz, - 5 => KernelFormat::ImageZstd, - _ => { - return -libc::EINVAL; - } - }; + KRUN_SUCCESS + } - let (initramfs_path, initramfs_size) = if !c_initramfs_path.is_null() { - match CStr::from_ptr(c_initramfs_path).to_str() { - Ok(path) => { - let path = PathBuf::from(path); - let size = match std::fs::metadata(&path) { - Ok(metadata) => metadata.len(), - Err(e) => { - error!("Can't read initramfs metadata: {e:?}"); - return -libc::EINVAL; - } - }; - (Some(path), size) - } + #[cfg(all(feature = "blk", not(feature = "tee")))] + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_root_disk_remount( + ctx_id: u32, + c_device: *const c_char, + c_fstype: *const c_char, + c_options: *const c_char, + ) -> i32 { + let device = match CStr::from_ptr(c_device).to_str() { + Ok(device) => device.to_string(), Err(e) => { - error!("Error parsing initramfs path: {e:?}"); + error!("Error parsing device path: {e:?}"); return -libc::EINVAL; } - } - } else { - (None, 0) - }; + }; - let cmdline = if !c_cmdline.is_null() { - match CStr::from_ptr(c_cmdline).to_str() { - Ok(cmdline) => Some(cmdline.to_string()), - Err(e) => { - error!("Error parsing kernel cmdline: {e:?}"); - return -libc::EINVAL; + let fstype = if !c_fstype.is_null() { + match CStr::from_ptr(c_fstype).to_str() { + Ok(fstype) => { + if fstype == "auto" { + None + } else { + Some(fstype.to_string()) + } + } + Err(e) => { + error!("Error parsing fstype: {e:?}"); + return -libc::EINVAL; + } } - } - } else { - None - }; - - let external_kernel = ExternalKernel { - path, - format, - initramfs_path, - initramfs_size, - cmdline, - }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => ctx_cfg.get_mut().vmr.set_external_kernel(external_kernel), - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} + } else { + None + }; -#[cfg(not(feature = "tee"))] -#[allow(clippy::format_collect)] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_firmware(ctx_id: u32, c_firmware_path: *const c_char) -> i32 { - let path = match CStr::from_ptr(c_firmware_path).to_str() { - Ok(path) => PathBuf::from(path), - Err(e) => { - error!("Error parsing firmware_path: {e:?}"); - return -libc::EINVAL; - } - }; + let options = if !c_options.is_null() { + match CStr::from_ptr(c_options).to_str() { + Ok(options) => Some(options.to_string()), + Err(e) => { + error!("Error parsing options: {e:?}"); + return -libc::EINVAL; + } + } + } else { + None + }; - let firmware_config = FirmwareConfig { path }; - - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => ctx_cfg.get_mut().vmr.set_firmware_config(firmware_config), - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -unsafe fn load_krunfw_payload( - krunfw: &KrunfwBindings, - vmr: &mut VmResources, -) -> Result<(), libloading::Error> { - let mut kernel_guest_addr: u64 = 0; - let mut kernel_entry_addr: u64 = 0; - let mut kernel_size: usize = 0; - let kernel_host_addr = unsafe { - (krunfw.get_kernel)( - &mut kernel_guest_addr as *mut u64, - &mut kernel_entry_addr as *mut u64, - &mut kernel_size as *mut usize, - ) - }; - let kernel_bundle = KernelBundle { - host_addr: kernel_host_addr as u64, - guest_addr: kernel_guest_addr, - entry_addr: kernel_entry_addr, - size: kernel_size, - }; - vmr.set_kernel_bundle(kernel_bundle).unwrap(); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let ctx_cfg = ctx_cfg.get_mut(); - #[cfg(feature = "tee")] - { - let mut qboot_size: usize = 0; - let qboot_host_addr = unsafe { (krunfw.get_qboot)(&mut qboot_size as *mut usize) }; - let qboot_bundle = QbootBundle { - host_addr: qboot_host_addr as u64, - size: qboot_size, - }; - vmr.set_qboot_bundle(qboot_bundle).unwrap(); - - let mut initrd_size: usize = 0; - let initrd_host_addr = unsafe { (krunfw.get_initrd)(&mut initrd_size as *mut usize) }; - let initrd_bundle = InitrdBundle { - host_addr: initrd_host_addr as u64, - size: initrd_size, - }; - vmr.set_initrd_bundle(initrd_bundle).unwrap(); - } - - Ok(()) -} - -#[no_mangle] -pub extern "C" fn krun_setuid(ctx_id: u32, uid: libc::uid_t) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_vmm_uid(uid); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[no_mangle] -pub extern "C" fn krun_setgid(ctx_id: u32, gid: libc::gid_t) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.set_vmm_gid(gid); - } - Entry::Vacant(_) => return -libc::ENOENT, - } - - KRUN_SUCCESS -} - -#[cfg(all(feature = "blk", not(feature = "tee")))] -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_root_disk_remount( - ctx_id: u32, - c_device: *const c_char, - c_fstype: *const c_char, - c_options: *const c_char, -) -> i32 { - let device = match CStr::from_ptr(c_device).to_str() { - Ok(device) => device.to_string(), - Err(e) => { - error!("Error parsing device path: {e:?}"); - return -libc::EINVAL; - } - }; + if ctx_cfg.vmr.fs.iter().any(|fs| fs.fs_id == "/dev/root") { + error!("Root filesystem already configured"); + return -libc::EINVAL; + } - let fstype = if !c_fstype.is_null() { - match CStr::from_ptr(c_fstype).to_str() { - Ok(fstype) => { - if fstype == "auto" { - None - } else { - Some(fstype.to_string()) + if ctx_cfg.block_cfgs.is_empty() { + error!("No block devices configured"); + return -libc::EINVAL; } - } - Err(e) => { - error!("Error parsing fstype: {e:?}"); - return -libc::EINVAL; - } - } - } else { - None - }; - let options = if !c_options.is_null() { - match CStr::from_ptr(c_options).to_str() { - Ok(options) => Some(options.to_string()), - Err(e) => { - error!("Error parsing options: {e:?}"); - return -libc::EINVAL; - } - } - } else { - None - }; + // To boot from a filesystem other than virtiofs, + // we need to setup a temporary root from which init.krun can be executed. + // Otherwise, it would have to be copied to the target filesystem beforehand. + // Instead, init.krun will run from virtiofs and then switch to the real root. + let root_dir_suffix = Alphanumeric.sample_string(&mut rand::rng(), 6); + let empty_root = env::temp_dir().join(format!("krun-empty-root-{root_dir_suffix}")); - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let ctx_cfg = ctx_cfg.get_mut(); + if let Err(e) = std::fs::create_dir_all(&empty_root) { + error!("Failed to create empty root directory: {e:?}"); + return -libc::EINVAL; + } - if ctx_cfg.vmr.fs.iter().any(|fs| fs.fs_id == "/dev/root") { - error!("Root filesystem already configured"); - return -libc::EINVAL; - } + ctx_cfg.vmr.add_fs_device(FsDeviceConfig { + fs_id: "/dev/root".into(), + shared_dir: empty_root.to_string_lossy().into(), + // Default to a conservative 512 MB window. + shm_size: Some(1 << 29), + allow_root_dir_delete: true, + read_only: false, + }); - if ctx_cfg.block_cfgs.is_empty() { - error!("No block devices configured"); - return -libc::EINVAL; + ctx_cfg.set_block_root(device, fstype, options); } + Entry::Vacant(_) => return -libc::ENOENT, + }; - // To boot from a filesystem other than virtiofs, - // we need to setup a temporary root from which init.krun can be executed. - // Otherwise, it would have to be copied to the target filesystem beforehand. - // Instead, init.krun will run from virtiofs and then switch to the real root. - let root_dir_suffix = Alphanumeric.sample_string(&mut rand::rng(), 6); - let empty_root = env::temp_dir().join(format!("krun-empty-root-{root_dir_suffix}")); + KRUN_SUCCESS + } - if let Err(e) = std::fs::create_dir_all(&empty_root) { - error!("Failed to create empty root directory: {e:?}"); - return -libc::EINVAL; + #[no_mangle] + pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.disable_implicit_console = true; } - - ctx_cfg.vmr.add_fs_device(FsDeviceConfig { - fs_id: "/dev/root".into(), - shared_dir: empty_root.to_string_lossy().into(), - // Default to a conservative 512 MB window. - shm_size: Some(1 << 29), - allow_root_dir_delete: true, - read_only: false, - }); - - ctx_cfg.set_block_root(device, fstype, options); + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, - }; - KRUN_SUCCESS -} + KRUN_SUCCESS + } -#[no_mangle] -pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.disable_implicit_console = true; + #[no_mangle] + pub extern "C" fn krun_disable_implicit_vsock(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vsock_config = VsockConfig::Disabled; + } + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + + KRUN_SUCCESS } - KRUN_SUCCESS -} + #[no_mangle] + pub extern "C" fn krun_add_vsock(ctx_id: u32, tsi_features: u32) -> i32 { + let tsi_flags = match TsiFlags::from_bits(tsi_features) { + Some(flags) => flags, + None => return -libc::EINVAL, + }; -#[no_mangle] -pub extern "C" fn krun_disable_implicit_vsock(ctx_id: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vsock_config = VsockConfig::Disabled; + if cfg!(target_os = "macos") && tsi_flags.contains(TsiFlags::HIJACK_UNIX) { + error!("TSI hijacking of UNIX sockets is not yet supported on macOS"); + return -libc::EINVAL; } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} - -#[no_mangle] -pub extern "C" fn krun_add_vsock(ctx_id: u32, tsi_features: u32) -> i32 { - let tsi_flags = match TsiFlags::from_bits(tsi_features) { - Some(flags) => flags, - None => return -libc::EINVAL, - }; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + if cfg.vsock_config != VsockConfig::Disabled { + return -libc::EEXIST; + } + cfg.vsock_config = VsockConfig::Explicit { tsi_flags }; + } + Entry::Vacant(_) => return -libc::ENOENT, + } - if cfg!(target_os = "macos") && tsi_flags.contains(TsiFlags::HIJACK_UNIX) { - error!("TSI hijacking of UNIX sockets is not yet supported on macOS"); - return -libc::EINVAL; + KRUN_SUCCESS } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - if cfg.vsock_config != VsockConfig::Disabled { - return -libc::EEXIST; + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_virtio_console_default( + ctx_id: u32, + input_fd: libc::c_int, + output_fd: libc::c_int, + err_fd: libc::c_int, + ) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + + cfg.vmr + .virtio_consoles + .push(VirtioConsoleConfigMode::Autoconfigure( + DefaultVirtioConsoleConfig { + input_fd, + output_fd, + err_fd, + }, + )); } - cfg.vsock_config = VsockConfig::Explicit { tsi_flags }; + Entry::Vacant(_) => return -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, + + KRUN_SUCCESS } - KRUN_SUCCESS -} + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_virtio_console_multiport(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + let console_id = cfg.vmr.virtio_consoles.len() as i32; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_virtio_console_default( - ctx_id: u32, - input_fd: libc::c_int, - output_fd: libc::c_int, - err_fd: libc::c_int, -) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); + cfg.vmr + .virtio_consoles + .push(VirtioConsoleConfigMode::Explicit(Vec::new())); - cfg.vmr - .virtio_consoles - .push(VirtioConsoleConfigMode::Autoconfigure( - DefaultVirtioConsoleConfig { - input_fd, - output_fd, - err_fd, - }, - )); + console_id + } + Entry::Vacant(_) => -libc::ENOENT, } - Entry::Vacant(_) => return -libc::ENOENT, } - KRUN_SUCCESS -} + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_console_port_tty( + ctx_id: u32, + console_id: u32, + name: *const libc::c_char, + tty_fd: libc::c_int, + ) -> i32 { + if tty_fd < 0 { + return -libc::EINVAL; + } + + let name_str = if name.is_null() { + String::new() + } else { + match CStr::from_ptr(name).to_str() { + Ok(s) => s.to_string(), + Err(_) => return -libc::EINVAL, + } + }; -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_virtio_console_multiport(ctx_id: u32) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - let console_id = cfg.vmr.virtio_consoles.len() as i32; + if !BorrowedFd::borrow_raw(tty_fd).is_terminal() { + return -libc::ENOTTY; + } - cfg.vmr - .virtio_consoles - .push(VirtioConsoleConfigMode::Explicit(Vec::new())); + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); - console_id + match cfg.vmr.virtio_consoles.get_mut(console_id as usize) { + Some(VirtioConsoleConfigMode::Explicit(ports)) => { + ports.push(PortConfig::Tty { + name: name_str, + tty_fd, + }); + KRUN_SUCCESS + } + _ => -libc::EINVAL, + } + } + Entry::Vacant(_) => -libc::ENOENT, } - Entry::Vacant(_) => -libc::ENOENT, } -} -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_console_port_tty( - ctx_id: u32, - console_id: u32, - name: *const libc::c_char, - tty_fd: libc::c_int, -) -> i32 { - if tty_fd < 0 { - return -libc::EINVAL; - } + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_console_port_inout( + ctx_id: u32, + console_id: u32, + name: *const c_char, + input_fd: c_int, + output_fd: c_int, + ) -> i32 { + let name_str = if name.is_null() { + String::new() + } else { + match CStr::from_ptr(name).to_str() { + Ok(s) => s.to_string(), + Err(_) => return -libc::EINVAL, + } + }; - let name_str = if name.is_null() { - String::new() - } else { - match CStr::from_ptr(name).to_str() { - Ok(s) => s.to_string(), - Err(_) => return -libc::EINVAL, + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + + match cfg.vmr.virtio_consoles.get_mut(console_id as usize) { + Some(VirtioConsoleConfigMode::Explicit(ports)) => { + ports.push(PortConfig::InOut { + name: name_str, + input_fd, + output_fd, + }); + KRUN_SUCCESS + } + _ => -libc::EINVAL, + } + } + Entry::Vacant(_) => -libc::ENOENT, + } + } + + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_add_serial_console_default( + ctx_id: u32, + input_fd: c_int, + output_fd: c_int, + ) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.serial_consoles.push(SerialConsoleConfig { + input_fd, + output_fd, + }); + } + Entry::Vacant(_) => return -libc::ENOENT, } - }; - if !BorrowedFd::borrow_raw(tty_fd).is_terminal() { - return -libc::ENOTTY; + KRUN_SUCCESS } - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - - match cfg.vmr.virtio_consoles.get_mut(console_id as usize) { - Some(VirtioConsoleConfigMode::Explicit(ports)) => { - ports.push(PortConfig::Tty { - name: name_str, - tty_fd, - }); - KRUN_SUCCESS - } - _ => -libc::EINVAL, - } - } - Entry::Vacant(_) => -libc::ENOENT, - } -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_console_port_inout( - ctx_id: u32, - console_id: u32, - name: *const c_char, - input_fd: c_int, - output_fd: c_int, -) -> i32 { - let name_str = if name.is_null() { - String::new() - } else { - match CStr::from_ptr(name).to_str() { - Ok(s) => s.to_string(), + #[allow(clippy::missing_safety_doc)] + #[no_mangle] + pub unsafe extern "C" fn krun_set_kernel_console( + ctx_id: u32, + console_id: *const c_char, + ) -> i32 { + let console_id = match CStr::from_ptr(console_id).to_str() { + Ok(id) => id.to_string(), Err(_) => return -libc::EINVAL, + }; + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + cfg.vmr.kernel_console = Some(console_id); + } + Entry::Vacant(_) => return -libc::ENOENT, } - }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - - match cfg.vmr.virtio_consoles.get_mut(console_id as usize) { - Some(VirtioConsoleConfigMode::Explicit(ports)) => { - ports.push(PortConfig::InOut { - name: name_str, - input_fd, - output_fd, - }); - KRUN_SUCCESS - } - _ => -libc::EINVAL, - } - } - Entry::Vacant(_) => -libc::ENOENT, - } -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_add_serial_console_default( - ctx_id: u32, - input_fd: c_int, - output_fd: c_int, -) -> i32 { - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.serial_consoles.push(SerialConsoleConfig { - input_fd, - output_fd, - }); - } - Entry::Vacant(_) => return -libc::ENOENT, + KRUN_SUCCESS } - KRUN_SUCCESS -} - -#[allow(clippy::missing_safety_doc)] -#[no_mangle] -pub unsafe extern "C" fn krun_set_kernel_console(ctx_id: u32, console_id: *const c_char) -> i32 { - let console_id = match CStr::from_ptr(console_id).to_str() { - Ok(id) => id.to_string(), - Err(_) => return -libc::EINVAL, - }; - match CTX_MAP.lock().unwrap().entry(ctx_id) { - Entry::Occupied(mut ctx_cfg) => { - let cfg = ctx_cfg.get_mut(); - cfg.vmr.kernel_console = Some(console_id); + #[no_mangle] + #[allow(unreachable_code)] + pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { + #[cfg(target_os = "linux")] + { + let prname = match env::var("HOSTNAME") { + Ok(val) => CString::new(format!("VM:{val}")).unwrap(), + Err(_) => CString::new("libkrun VM").unwrap(), + }; + unsafe { libc::prctl(libc::PR_SET_NAME, prname.as_ptr()) }; } - Entry::Vacant(_) => return -libc::ENOENT, - } - KRUN_SUCCESS -} + #[cfg(feature = "aws-nitro")] + return krun_start_enter_nitro(ctx_id); -#[no_mangle] -#[allow(unreachable_code)] -pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { - #[cfg(target_os = "linux")] - { - let prname = match env::var("HOSTNAME") { - Ok(val) => CString::new(format!("VM:{val}")).unwrap(), - Err(_) => CString::new("libkrun VM").unwrap(), + let mut event_manager = match EventManager::new() { + Ok(em) => em, + Err(e) => { + error!("Unable to create EventManager: {e:?}"); + return -libc::EINVAL; + } }; - unsafe { libc::prctl(libc::PR_SET_NAME, prname.as_ptr()) }; - } - - #[cfg(feature = "aws-nitro")] - return krun_start_enter_nitro(ctx_id); - - let mut event_manager = match EventManager::new() { - Ok(em) => em, - Err(e) => { - error!("Unable to create EventManager: {e:?}"); - return -libc::EINVAL; - } - }; - let mut ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { - Some(ctx_cfg) => ctx_cfg, - None => return -libc::ENOENT, - }; + let mut ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { + Some(ctx_cfg) => ctx_cfg, + None => return -libc::ENOENT, + }; - if ctx_cfg.vmr.external_kernel.is_none() - && ctx_cfg.vmr.kernel_bundle.is_none() - && ctx_cfg.vmr.firmware_config.is_none() - { - if let Some(ref krunfw) = ctx_cfg.krunfw { - if let Err(err) = unsafe { load_krunfw_payload(krunfw, &mut ctx_cfg.vmr) } { - eprintln!("Can't load libkrunfw symbols: {err}"); + if ctx_cfg.vmr.external_kernel.is_none() + && ctx_cfg.vmr.kernel_bundle.is_none() + && ctx_cfg.vmr.firmware_config.is_none() + { + if let Some(ref krunfw) = ctx_cfg.krunfw { + if let Err(err) = unsafe { load_krunfw_payload(krunfw, &mut ctx_cfg.vmr) } { + eprintln!("Can't load libkrunfw symbols: {err}"); + return -libc::ENOENT; + } + } else { + eprintln!("Couldn't find or load {KRUNFW_NAME}"); return -libc::ENOENT; } - } else { - eprintln!("Couldn't find or load {KRUNFW_NAME}"); - return -libc::ENOENT; } - } - #[cfg(feature = "blk")] - for block_cfg in ctx_cfg.get_block_cfg() { - if ctx_cfg.vmr.add_block_device(block_cfg).is_err() { - error!("Error configuring virtio-blk for block"); - return -libc::EINVAL; + #[cfg(feature = "blk")] + for block_cfg in ctx_cfg.get_block_cfg() { + if ctx_cfg.vmr.add_block_device(block_cfg).is_err() { + error!("Error configuring virtio-blk for block"); + return -libc::EINVAL; + } } - } - /* - * Before krun_start_enter() is called in an encrypted context, the TEE - * config must have been set via krun_set_tee_config_file(). If the TEE - * config is not set by this point, print the relevant error message and - * fail. - */ - #[cfg(feature = "tee")] - if let Some(tee_config) = ctx_cfg.get_tee_config_file() { - if let Err(e) = ctx_cfg.vmr.set_tee_config(tee_config) { - error!("Error setting up TEE config: {e:?}"); + /* + * Before krun_start_enter() is called in an encrypted context, the TEE + * config must have been set via krun_set_tee_config_file(). If the TEE + * config is not set by this point, print the relevant error message and + * fail. + */ + #[cfg(feature = "tee")] + if let Some(tee_config) = ctx_cfg.get_tee_config_file() { + if let Err(e) = ctx_cfg.vmr.set_tee_config(tee_config) { + error!("Error setting up TEE config: {e:?}"); + return -libc::EINVAL; + } + } else { + error!("Missing TEE config file"); return -libc::EINVAL; } - } else { - error!("Missing TEE config file"); - return -libc::EINVAL; - } - - let kernel_cmdline = KernelCmdlineConfig { - prolog: Some(format!("{DEFAULT_KERNEL_CMDLINE} init={INIT_PATH}")), - krun_env: Some(format!( - " {} {} {} {} {}", - ctx_cfg.get_exec_path(), - ctx_cfg.get_workdir(), - ctx_cfg.get_block_root(), - ctx_cfg.get_rlimits(), - ctx_cfg.get_env(), - )), - epilog: Some(format!(" -- {}", ctx_cfg.get_args())), - }; - if ctx_cfg.vmr.set_kernel_cmdline(kernel_cmdline).is_err() { - return -libc::EINVAL; - } + let kernel_cmdline = KernelCmdlineConfig { + prolog: Some(format!("{DEFAULT_KERNEL_CMDLINE} init={INIT_PATH}")), + krun_env: Some(format!( + " {} {} {} {} {}", + ctx_cfg.get_exec_path(), + ctx_cfg.get_workdir(), + ctx_cfg.get_block_root(), + ctx_cfg.get_rlimits(), + ctx_cfg.get_env(), + )), + epilog: Some(format!(" -- {}", ctx_cfg.get_args())), + }; - #[cfg(feature = "net")] - { - if let Some(legacy_net_cfg) = ctx_cfg.legacy_net_cfg.clone() { - let backend = match legacy_net_cfg { - LegacyNetworkConfig::VirtioNetGvproxy(path) => { - VirtioNetBackend::UnixgramPath(path, true) - } - LegacyNetworkConfig::VirtioNetPasst(fd) => VirtioNetBackend::UnixstreamFd(fd), - }; - let mac = ctx_cfg - .legacy_mac - .unwrap_or([0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]); - create_virtio_net(&mut ctx_cfg, backend, mac, NET_COMPAT_FEATURES); - } - } - - match &ctx_cfg.vsock_config { - VsockConfig::Disabled => (), - VsockConfig::Explicit { tsi_flags } => { - let vsock_device_config = VsockDeviceConfig { - vsock_id: "vsock0".to_string(), - guest_cid: 3, - host_port_map: ctx_cfg.tsi_port_map, - unix_ipc_port_map: ctx_cfg.unix_ipc_port_map.clone(), - tsi_flags: *tsi_flags, - }; - ctx_cfg.vmr.set_vsock_device(vsock_device_config).unwrap(); + if ctx_cfg.vmr.set_kernel_cmdline(kernel_cmdline).is_err() { + return -libc::EINVAL; } - VsockConfig::Implicit => { - // Implicit vsock configuration - use heuristics - // Check if TSI should be enabled based on network configuration - #[cfg(feature = "net")] - let enable_tsi = ctx_cfg.vmr.net.list.is_empty() && ctx_cfg.legacy_net_cfg.is_none(); - #[cfg(not(feature = "net"))] - let enable_tsi = true; - let has_ipc_map = ctx_cfg.unix_ipc_port_map.is_some(); - - if enable_tsi || has_ipc_map { - let (tsi_flags, host_port_map) = if enable_tsi { - (TsiFlags::HIJACK_INET, ctx_cfg.tsi_port_map) - } else { - (TsiFlags::empty(), None) + #[cfg(feature = "net")] + { + if let Some(legacy_net_cfg) = ctx_cfg.legacy_net_cfg.clone() { + let backend = match legacy_net_cfg { + LegacyNetworkConfig::VirtioNetGvproxy(path) => { + VirtioNetBackend::UnixgramPath(path, true) + } + LegacyNetworkConfig::VirtioNetPasst(fd) => VirtioNetBackend::UnixstreamFd(fd), }; + let mac = ctx_cfg + .legacy_mac + .unwrap_or([0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]); + create_virtio_net(&mut ctx_cfg, backend, mac, NET_COMPAT_FEATURES); + } + } + match &ctx_cfg.vsock_config { + VsockConfig::Disabled => (), + VsockConfig::Explicit { tsi_flags } => { let vsock_device_config = VsockDeviceConfig { vsock_id: "vsock0".to_string(), guest_cid: 3, - host_port_map, + host_port_map: ctx_cfg.tsi_port_map, unix_ipc_port_map: ctx_cfg.unix_ipc_port_map.clone(), - tsi_flags, + tsi_flags: *tsi_flags, }; ctx_cfg.vmr.set_vsock_device(vsock_device_config).unwrap(); } + VsockConfig::Implicit => { + // Implicit vsock configuration - use heuristics + // Check if TSI should be enabled based on network configuration + #[cfg(feature = "net")] + let enable_tsi = + ctx_cfg.vmr.net.list.is_empty() && ctx_cfg.legacy_net_cfg.is_none(); + #[cfg(not(feature = "net"))] + let enable_tsi = true; + + let has_ipc_map = ctx_cfg.unix_ipc_port_map.is_some(); + + if enable_tsi || has_ipc_map { + let (tsi_flags, host_port_map) = if enable_tsi { + (TsiFlags::HIJACK_INET, ctx_cfg.tsi_port_map) + } else { + (TsiFlags::empty(), None) + }; + + let vsock_device_config = VsockDeviceConfig { + vsock_id: "vsock0".to_string(), + guest_cid: 3, + host_port_map, + unix_ipc_port_map: ctx_cfg.unix_ipc_port_map.clone(), + tsi_flags, + }; + ctx_cfg.vmr.set_vsock_device(vsock_device_config).unwrap(); + } + } } - } - if let Some(virgl_flags) = ctx_cfg.gpu_virgl_flags { - ctx_cfg.vmr.set_gpu_virgl_flags(virgl_flags); - } - if let Some(shm_size) = ctx_cfg.gpu_shm_size { - ctx_cfg.vmr.set_gpu_shm_size(shm_size); - } + if let Some(virgl_flags) = ctx_cfg.gpu_virgl_flags { + ctx_cfg.vmr.set_gpu_virgl_flags(virgl_flags); + } + if let Some(shm_size) = ctx_cfg.gpu_shm_size { + ctx_cfg.vmr.set_gpu_shm_size(shm_size); + } - #[cfg(feature = "snd")] - ctx_cfg.vmr.set_snd_device(ctx_cfg.enable_snd); + #[cfg(feature = "snd")] + ctx_cfg.vmr.set_snd_device(ctx_cfg.enable_snd); - if let Some(console_output) = ctx_cfg.console_output { - ctx_cfg.vmr.set_console_output(console_output); - } + if let Some(console_output) = ctx_cfg.console_output { + ctx_cfg.vmr.set_console_output(console_output); + } - if let Some(gid) = ctx_cfg.vmm_gid { - if unsafe { libc::setgid(gid) } != 0 { - error!("Failed to set gid {gid}"); - return -std::io::Error::last_os_error().raw_os_error().unwrap(); + if let Some(gid) = ctx_cfg.vmm_gid { + if unsafe { libc::setgid(gid) } != 0 { + error!("Failed to set gid {gid}"); + return -std::io::Error::last_os_error().raw_os_error().unwrap(); + } } - } - if let Some(uid) = ctx_cfg.vmm_uid { - if unsafe { libc::setuid(uid) } != 0 { - error!("Failed to set uid {uid}"); - return -std::io::Error::last_os_error().raw_os_error().unwrap(); + if let Some(uid) = ctx_cfg.vmm_uid { + if unsafe { libc::setuid(uid) } != 0 { + error!("Failed to set uid {uid}"); + return -std::io::Error::last_os_error().raw_os_error().unwrap(); + } } - } - let (sender, _receiver) = unbounded(); + let (sender, _receiver) = unbounded(); - let _vmm = match vmm::builder::build_microvm( - &ctx_cfg.vmr, - &mut event_manager, - ctx_cfg.shutdown_efd, - sender, - ) { - Ok(vmm) => vmm, - Err(e) => { - error!("Building the microVM failed: {e:?}"); - return -libc::EINVAL; + let _vmm = match vmm::builder::build_microvm( + &ctx_cfg.vmr, + &mut event_manager, + ctx_cfg.shutdown_efd, + sender, + ) { + Ok(vmm) => vmm, + Err(e) => { + error!("Building the microVM failed: {e:?}"); + return -libc::EINVAL; + } + }; + + #[cfg(target_os = "macos")] + if ctx_cfg.gpu_virgl_flags.is_some() { + vmm::worker::start_worker_thread(_vmm.clone(), _receiver).unwrap(); } - }; - #[cfg(target_os = "macos")] - if ctx_cfg.gpu_virgl_flags.is_some() { - vmm::worker::start_worker_thread(_vmm.clone(), _receiver).unwrap(); - } + #[cfg(target_arch = "x86_64")] + if ctx_cfg.vmr.split_irqchip { + vmm::worker::start_worker_thread(_vmm.clone(), _receiver.clone()).unwrap(); + } - #[cfg(target_arch = "x86_64")] - if ctx_cfg.vmr.split_irqchip { + #[cfg(any(feature = "amd-sev", feature = "tdx"))] vmm::worker::start_worker_thread(_vmm.clone(), _receiver.clone()).unwrap(); + + loop { + match event_manager.run() { + Ok(_) => {} + Err(e) => { + error!("Error in EventManager loop: {e:?}"); + return -libc::EINVAL; + } + } + } } - #[cfg(any(feature = "amd-sev", feature = "tdx"))] - vmm::worker::start_worker_thread(_vmm.clone(), _receiver.clone()).unwrap(); + #[cfg(feature = "aws-nitro")] + #[no_mangle] + fn krun_start_enter_nitro(ctx_id: u32) -> i32 { + let ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { + Some(ctx_cfg) => ctx_cfg, + None => return -libc::ENOENT, + }; + + let Ok(enclave) = NitroEnclave::try_from(ctx_cfg) else { + return -libc::EINVAL; + }; - loop { - match event_manager.run() { - Ok(_) => {} + match enclave.run() { + Ok(ret) => ret, Err(e) => { - error!("Error in EventManager loop: {e:?}"); - return -libc::EINVAL; + error!("Error running nitro enclave: {e}"); + + -libc::EINVAL } } } -} -#[cfg(feature = "aws-nitro")] -#[no_mangle] -fn krun_start_enter_nitro(ctx_id: u32) -> i32 { - let ctx_cfg = match CTX_MAP.lock().unwrap().remove(&ctx_id) { - Some(ctx_cfg) => ctx_cfg, - None => return -libc::ENOENT, - }; + // ============================================================================ + // New functions — Unix stubs (full implementations in windows_api.rs) + // ============================================================================ - let Ok(enclave) = NitroEnclave::try_from(ctx_cfg) else { - return -libc::EINVAL; - }; + /// Start VM on a background thread (non-blocking). + /// Not yet implemented on Unix — use krun_start_enter() instead. + #[no_mangle] + pub extern "C" fn krun_start(_ctx_id: u32) -> i32 { + -libc::ENOSYS + } + + /// Block until a running VM exits. Returns exit code. + /// Not yet implemented on Unix. + #[no_mangle] + pub extern "C" fn krun_wait(_ctx_id: u32) -> i32 { + -libc::ENOSYS + } - match enclave.run() { - Ok(ret) => ret, - Err(e) => { - error!("Error running nitro enclave: {e}"); - - -libc::EINVAL - } - } -} - -// ============================================================================ -// New functions — Unix stubs (full implementations in windows_api.rs) -// ============================================================================ - -/// Start VM on a background thread (non-blocking). -/// Not yet implemented on Unix — use krun_start_enter() instead. -#[no_mangle] -pub extern "C" fn krun_start(_ctx_id: u32) -> i32 { - -libc::ENOSYS -} - -/// Block until a running VM exits. Returns exit code. -/// Not yet implemented on Unix. -#[no_mangle] -pub extern "C" fn krun_wait(_ctx_id: u32) -> i32 { - -libc::ENOSYS -} - -/// Request a running VM to stop (non-blocking). -/// Not yet implemented on Unix. -#[no_mangle] -pub extern "C" fn krun_stop(_ctx_id: u32) -> i32 { - -libc::ENOSYS -} - -/// Get captured console output for a VM. -/// Not yet implemented on Unix. -#[no_mangle] -pub unsafe extern "C" fn krun_get_console_output( - _ctx_id: u32, - _buf: *mut u8, - _buf_size: u32, -) -> i32 { - -libc::ENOSYS -} - -/// Add a network device backed by a TCP endpoint. -/// Not yet implemented on Unix — use krun_add_net_unixstream/unixgram instead. -#[no_mangle] -pub unsafe extern "C" fn krun_add_net( - _ctx_id: u32, - _c_endpoint: *const c_char, - _c_mac: *const u8, -) -> i32 { - -libc::ENOSYS -} + /// Request a running VM to stop (non-blocking). + /// Not yet implemented on Unix. + #[no_mangle] + pub extern "C" fn krun_stop(_ctx_id: u32) -> i32 { + -libc::ENOSYS + } + + /// Get captured console output for a VM. + /// Not yet implemented on Unix. + #[no_mangle] + pub unsafe extern "C" fn krun_get_console_output( + _ctx_id: u32, + _buf: *mut u8, + _buf_size: u32, + ) -> i32 { + -libc::ENOSYS + } + /// Add a network device backed by a TCP endpoint. + /// Not yet implemented on Unix — use krun_add_net_unixstream/unixgram instead. + #[no_mangle] + pub unsafe extern "C" fn krun_add_net( + _ctx_id: u32, + _c_endpoint: *const c_char, + _c_mac: *const u8, + ) -> i32 { + -libc::ENOSYS + } } // mod unix_api diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 046da00f5..4ac8ffd3a 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -18,6 +18,7 @@ gpu = ["devices/gpu", "krun_display"] snd = ["devices/snd"] input = ["devices/input", "krun_input"] aws-nitro = [] +efi = [] [dependencies] crossbeam-channel = ">=0.5.15" diff --git a/src/vmm/src/windows/cmdline.rs b/src/vmm/src/windows/cmdline.rs index bf3422625..bac75f617 100644 --- a/src/vmm/src/windows/cmdline.rs +++ b/src/vmm/src/windows/cmdline.rs @@ -464,5 +464,4 @@ mod tests { assert!(mmio_pos < sep_pos); assert!(!cmdline.contains("root=/dev/vda")); } - } diff --git a/src/vmm/src/windows/devices/irq_chip.rs b/src/vmm/src/windows/devices/irq_chip.rs index 0b36ad1b6..b12e9c9e0 100644 --- a/src/vmm/src/windows/devices/irq_chip.rs +++ b/src/vmm/src/windows/devices/irq_chip.rs @@ -236,7 +236,12 @@ impl IrqChip { Some(self.ioapic.read_mmio(offset)) } else if addr >= LAPIC_MMIO_BASE && addr < LAPIC_MMIO_BASE + LAPIC_MMIO_SIZE { let offset = addr - LAPIC_MMIO_BASE; - Some(self.lapics[vcpu_id as usize].lock().unwrap().read_mmio(offset)) + Some( + self.lapics[vcpu_id as usize] + .lock() + .unwrap() + .read_mmio(offset), + ) } else { None } @@ -424,7 +429,10 @@ mod tests { chip.raise_irq(5); // pull_irr: merge shared state into local IRR (lock-free delivery path). - chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); + chip.lapics[0] + .lock() + .unwrap() + .pull_irr(&chip.shared_states[0]); assert!(chip.has_pending(0)); let vector = chip.acknowledge(0); @@ -446,7 +454,10 @@ mod tests { // raise_irq(0) should remap to IOAPIC pin 2 and deliver vector 0x22. chip.raise_irq(0); // pull_irr: merge shared state into local IRR (lock-free delivery path). - chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); + chip.lapics[0] + .lock() + .unwrap() + .pull_irr(&chip.shared_states[0]); assert!(chip.has_pending(0)); let vector = chip.acknowledge(0); @@ -513,7 +524,10 @@ mod tests { // Raise IRQ 3. chip.raise_irq(3); // pull_irr: merge shared state into local IRR (lock-free delivery path). - chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); + chip.lapics[0] + .lock() + .unwrap() + .pull_irr(&chip.shared_states[0]); let vector = chip.acknowledge(0); assert_eq!(vector, Some(0x33)); @@ -524,7 +538,10 @@ mod tests { chip.handle_mmio_write(0, LAPIC_MMIO_BASE + 0x0B0, 4, 0); // After EOI, the pin is still asserted → re-injection via shared state. - chip.lapics[0].lock().unwrap().pull_irr(&chip.shared_states[0]); + chip.lapics[0] + .lock() + .unwrap() + .pull_irr(&chip.shared_states[0]); assert!(chip.has_pending(0)); } @@ -566,7 +583,10 @@ mod tests { // Deliver IPI to vCPU 1. chip.deliver_ipi_interrupt(1, 0x40); // pull_irr: merge shared state into local IRR (lock-free delivery path). - chip.lapics[1].lock().unwrap().pull_irr(&chip.shared_states[1]); + chip.lapics[1] + .lock() + .unwrap() + .pull_irr(&chip.shared_states[1]); assert!(chip.has_pending(1)); assert_eq!(chip.acknowledge(1), Some(0x40)); } @@ -642,7 +662,10 @@ mod tests { chip.raise_irq(1); // Before pull_irr, LAPIC has nothing. - assert_eq!(chip.lapics[0].lock().unwrap().get_highest_injectable(), None); + assert_eq!( + chip.lapics[0].lock().unwrap().get_highest_injectable(), + None + ); // After pull_irr, LAPIC sees vector 49. let shared = chip.get_shared_state(0); @@ -667,7 +690,10 @@ mod tests { chip.deliver_ipi_interrupt(1, 80); // Before pull_irr, LAPIC 1 has nothing. - assert_eq!(chip.lapics[1].lock().unwrap().get_highest_injectable(), None); + assert_eq!( + chip.lapics[1].lock().unwrap().get_highest_injectable(), + None + ); // After pull_irr, LAPIC 1 sees vector 80. let shared = chip.get_shared_state(1); @@ -680,6 +706,9 @@ mod tests { // LAPIC 0 should be unaffected. let shared0 = chip.get_shared_state(0); chip.lapics[0].lock().unwrap().pull_irr(&shared0); - assert_eq!(chip.lapics[0].lock().unwrap().get_highest_injectable(), None); + assert_eq!( + chip.lapics[0].lock().unwrap().get_highest_injectable(), + None + ); } } diff --git a/src/vmm/src/windows/devices/lapic.rs b/src/vmm/src/windows/devices/lapic.rs index 235dd6bcc..339ee52ed 100644 --- a/src/vmm/src/windows/devices/lapic.rs +++ b/src/vmm/src/windows/devices/lapic.rs @@ -407,11 +407,7 @@ impl LocalApic { match dest_shorthand { 0b01 => { // Self: send to own LAPIC (used for self-IPI). - log::debug!( - "LAPIC {} ICR: Self IPI vector={:#X}", - self.id, - vector - ); + log::debug!("LAPIC {} ICR: Self IPI vector={:#X}", self.id, vector); return IpiAction::SendInterrupt { target_apic_id: self.id, vector, @@ -425,7 +421,11 @@ impl LocalApic { "LAPIC {} ICR: Broadcast vector={:#X} (shorthand={})", self.id, vector, - if dest_shorthand == 0b10 { "all-incl" } else { "all-excl" } + if dest_shorthand == 0b10 { + "all-incl" + } else { + "all-excl" + } ); return IpiAction::BroadcastInterrupt { source_apic_id: self.id, @@ -967,7 +967,6 @@ mod tests { assert_eq!(result.eoi_vector, None); } - // ---- SharedApicState tests ---- #[test] @@ -976,7 +975,7 @@ mod tests { // Vector 32 → bank 1, bit 0. assert!(shared.request_interrupt(32)); // first set → true assert!(!shared.request_interrupt(32)); // already set → false - // Vector 33 → bank 1, bit 1. + // Vector 33 → bank 1, bit 1. assert!(shared.request_interrupt(33)); // different bit → true } @@ -1020,7 +1019,7 @@ mod tests { let mut lapic = LocalApic::new(); lapic.pull_irr(&shared); lapic.write_mmio(0x0F0, 0x1FF); // enable - // Highest should be 39. + // Highest should be 39. assert_eq!(lapic.get_highest_injectable(), Some(39)); } diff --git a/src/vmm/src/windows/devices/virtio/vsock/mod.rs b/src/vmm/src/windows/devices/virtio/vsock/mod.rs index ed8403dbf..929453f9f 100644 --- a/src/vmm/src/windows/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/windows/devices/virtio/vsock/mod.rs @@ -837,7 +837,8 @@ mod tests { let mut dev = VirtioVsock::new(3); let (sock_path, _dir) = temp_socket_path("vsock-test.sock"); let vsock_port = 2695u32; - dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); + dev.listen_on(vsock_port, sock_path.to_str().unwrap()) + .unwrap(); let mem = MockMem::new(0x10000); let mut tx_queue = setup_queue(128); @@ -1278,7 +1279,8 @@ mod tests { let mut dev = VirtioVsock::new(3); let (sock_path, _dir) = temp_socket_path("listen-connect.sock"); let vsock_port = 2695u32; - dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); + dev.listen_on(vsock_port, sock_path.to_str().unwrap()) + .unwrap(); // Connect a UDS client before the guest sends REQUEST. let _client = UnixStream::connect(&sock_path).unwrap(); @@ -1321,7 +1323,8 @@ mod tests { let mut dev = VirtioVsock::new(3); let (sock_path, _dir) = temp_socket_path("poll-data.sock"); let vsock_port = 2695u32; - dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); + dev.listen_on(vsock_port, sock_path.to_str().unwrap()) + .unwrap(); // Connect UDS client. let mut client = UnixStream::connect(&sock_path).unwrap(); @@ -1516,7 +1519,8 @@ mod tests { let mut dev = VirtioVsock::new(3); let (sock_path, _dir) = temp_socket_path("poll-accept.sock"); let vsock_port = 2695u32; - dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); + dev.listen_on(vsock_port, sock_path.to_str().unwrap()) + .unwrap(); // Host UDS client connects BEFORE any guest action. let _client = UnixStream::connect(&sock_path).unwrap(); @@ -1554,7 +1558,8 @@ mod tests { let mut dev = VirtioVsock::new(3); let (sock_path, _dir) = temp_socket_path("lifecycle.sock"); let vsock_port = 2695u32; - dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); + dev.listen_on(vsock_port, sock_path.to_str().unwrap()) + .unwrap(); // Step 1: Host client connects. let mut client = UnixStream::connect(&sock_path).unwrap(); @@ -1606,7 +1611,8 @@ mod tests { let mut dev = VirtioVsock::new(3); let (sock_path, _dir) = temp_socket_path("handshake.sock"); let vsock_port = 2695u32; - dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); + dev.listen_on(vsock_port, sock_path.to_str().unwrap()) + .unwrap(); let mut client = UnixStream::connect(&sock_path).unwrap(); std::thread::sleep(std::time::Duration::from_millis(50)); @@ -1672,7 +1678,8 @@ mod tests { let mut dev = VirtioVsock::new(3); let (sock_path, _dir) = temp_socket_path("guest-data.sock"); let vsock_port = 2695u32; - dev.listen_on(vsock_port, sock_path.to_str().unwrap()).unwrap(); + dev.listen_on(vsock_port, sock_path.to_str().unwrap()) + .unwrap(); let mut client = UnixStream::connect(&sock_path).unwrap(); client.set_nonblocking(true).unwrap(); diff --git a/src/vmm/src/windows/runner.rs b/src/vmm/src/windows/runner.rs index 586a9cb72..40d3da972 100644 --- a/src/vmm/src/windows/runner.rs +++ b/src/vmm/src/windows/runner.rs @@ -575,7 +575,10 @@ mod imp { } } Err(ref e) => { - log::warn!("try_inject_interrupt_fast: interrupts_enabled() error: {:?}", e); + log::warn!( + "try_inject_interrupt_fast: interrupts_enabled() error: {:?}", + e + ); } } Ok(()) @@ -891,7 +894,9 @@ mod imp { if stats.consecutive_mmio_reads == 10_000 { diag!( "BSP: tight MMIO read loop: addr={:#X} count={} exit={}", - address, stats.consecutive_mmio_reads, stats.exit_count + address, + stats.consecutive_mmio_reads, + stats.exit_count ); if let Ok(regs) = vcpu.get_registers() { diag!("BSP: RIP={:#X} at tight MMIO loop", regs.rip); @@ -926,7 +931,10 @@ mod imp { LapicWriteFastResult::IpiAction(action) => { // ICR fast path: dispatch IPI inline (lock-free). match action { - IpiAction::SendInterrupt { target_apic_id, vector } => { + IpiAction::SendInterrupt { + target_apic_id, + vector, + } => { let idx = target_apic_id as usize; if idx < all_shared.len() { all_shared[idx].request_interrupt(vector); @@ -935,7 +943,10 @@ mod imp { } } } - IpiAction::BroadcastInterrupt { source_apic_id, vector } => { + IpiAction::BroadcastInterrupt { + source_apic_id, + vector, + } => { // Broadcast to all vCPUs except source (lock-free). for idx in 0..all_shared.len() { if idx as u8 != source_apic_id { @@ -974,7 +985,8 @@ mod imp { stats.start_time.elapsed().as_secs_f64() * 1000.0 ); } - let ipi_action = dm.handle_mmio_write(0, address, size, data, guest_mem); + let ipi_action = + dm.handle_mmio_write(0, address, size, data, guest_mem); if !matches!(ipi_action, IpiAction::None) { dispatch_ipi( ipi_action, @@ -1052,9 +1064,15 @@ mod imp { diag!( "BSP HLT stuck: consecutive={} total_halt={} halt_w_irq={} \ exits={} RIP={:#X} IF={} console={}B mmio={} vcpus={}", - stats.halt_count, stats.total_halt_exits, stats.halt_with_irq, - stats.exit_count, regs.rip, - if_flag, console_len, stats.mmio_count, num_vcpus + stats.halt_count, + stats.total_halt_exits, + stats.halt_with_irq, + stats.exit_count, + regs.rip, + if_flag, + console_len, + stats.mmio_count, + num_vcpus ); } } @@ -1321,8 +1339,12 @@ mod imp { if shutdown.load(Ordering::Relaxed) || !run_config.should_run() { diag!( "AP{}: EXIT (shutdown) exits={} cancelled={} halt={} cpuid={} mmio={}", - ap_id, stats.exit_count, stats.cancelled_count, - stats.total_halt_exits, stats.cpuid_count, stats.mmio_count, + ap_id, + stats.exit_count, + stats.cancelled_count, + stats.total_halt_exits, + stats.cpuid_count, + stats.mmio_count, ); return; } @@ -1409,7 +1431,10 @@ mod imp { if stats.consecutive_mmio_reads == 10_000 { log::warn!( "AP{}: tight MMIO read loop: addr={:#X} count={} exit={}", - ap_id, address, stats.consecutive_mmio_reads, stats.exit_count + ap_id, + address, + stats.consecutive_mmio_reads, + stats.exit_count ); if let Ok(regs) = vcpu.get_registers() { log::warn!("AP{}: RIP={:#X} at tight MMIO loop", ap_id, regs.rip); @@ -1444,7 +1469,10 @@ mod imp { LapicWriteFastResult::IpiAction(action) => { // ICR fast path: dispatch IPI inline (lock-free). match action { - IpiAction::SendInterrupt { target_apic_id, vector } => { + IpiAction::SendInterrupt { + target_apic_id, + vector, + } => { let idx = target_apic_id as usize; if idx < all_shared.len() { all_shared[idx].request_interrupt(vector); @@ -1453,7 +1481,10 @@ mod imp { } } } - IpiAction::BroadcastInterrupt { source_apic_id, vector } => { + IpiAction::BroadcastInterrupt { + source_apic_id, + vector, + } => { // Broadcast to all vCPUs except source (lock-free). for idx in 0..all_shared.len() { if idx as u8 != source_apic_id { @@ -1570,10 +1601,7 @@ mod imp { stats.cancelled_count += 1; // Periodic AP progress logging (every 500 Cancelled exits ≈ every 500ms). if stats.cancelled_count % 500 == 0 { - let rip = vcpu - .get_registers() - .map(|r| r.rip) - .unwrap_or(0xDEAD); + let rip = vcpu.get_registers().map(|r| r.rip).unwrap_or(0xDEAD); diag!( "AP{} @ {:.1}s: exits={} cancelled={} halt={} cpuid={} mmio={} RIP={:#X}", ap_id, @@ -1679,8 +1707,12 @@ mod imp { if stats.exit_count >= MAX_EXITS { diag!( "AP{}: EXIT (max_exits) exits={} cancelled={} halt={} cpuid={} mmio={}", - ap_id, stats.exit_count, stats.cancelled_count, - stats.total_halt_exits, stats.cpuid_count, stats.mmio_count, + ap_id, + stats.exit_count, + stats.cancelled_count, + stats.total_halt_exits, + stats.cpuid_count, + stats.mmio_count, ); return; } @@ -2081,8 +2113,16 @@ mod tests { fn test_cpuid_leaf1_topology_bsp() { // BSP (vcpu 0) with 2 vCPUs. // input_rcx=0 (leaf 1 doesn't use sub-leaves). - let (rax, rbx, rcx, rdx) = - super::handle_cpuid(0, 2, 1, 0, 0x1234, 0x0000_0000_0000_5678, 0x8000_0001, 0xABCD); + let (rax, rbx, rcx, rdx) = super::handle_cpuid( + 0, + 2, + 1, + 0, + 0x1234, + 0x0000_0000_0000_5678, + 0x8000_0001, + 0xABCD, + ); // EBX[23:16] = num_vcpus = 2, EBX[31:24] = vcpu_id = 0 assert_eq!(rbx & 0x00FF_0000, 0x0002_0000, "EBX[23:16] should be 2"); assert_eq!( @@ -2189,7 +2229,11 @@ mod tests { for subleaf in 0..3u64 { let r_b = super::handle_cpuid(0, 4, 0xB, subleaf, 0, 0, 0, 0); let r_1f = super::handle_cpuid(0, 4, 0x1F, subleaf, 0, 0, 0, 0); - assert_eq!(r_b, r_1f, "Leaf 0xB and 0x1F should match for sub-leaf {}", subleaf); + assert_eq!( + r_b, r_1f, + "Leaf 0xB and 0x1F should match for sub-leaf {}", + subleaf + ); } } @@ -2249,5 +2293,4 @@ mod tests { assert_eq!(super::handle_msr_read(0, 0x174), 0); assert_eq!(super::handle_msr_read(1, 0xC000_0080), 0); } - } From 0f052c28ce8c8b821ca485d5c138c217411d46b1 Mon Sep 17 00:00:00 2001 From: lile Date: Tue, 5 May 2026 21:43:03 +0800 Subject: [PATCH 32/33] fix(ci): cargo fmt import ordering + restore test_utils dev-dep - Fix import ordering in cpuid/src/common.rs for rustfmt 1.95 - Add back devices dev-dependency with test_utils feature (gated to cfg(unix)) so DummyIrqChip is available for vmm unit tests Co-Authored-By: Claude Opus 4.6 --- src/cpuid/src/common.rs | 4 ++-- src/vmm/Cargo.toml | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cpuid/src/common.rs b/src/cpuid/src/common.rs index d7474b89c..950a8673d 100644 --- a/src/cpuid/src/common.rs +++ b/src/cpuid/src/common.rs @@ -2,9 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 #[cfg(target_arch = "x86")] -use std::arch::x86::{CpuidResult, __cpuid_count, __get_cpuid_max}; +use std::arch::x86::{__cpuid_count, __get_cpuid_max, CpuidResult}; #[cfg(target_arch = "x86_64")] -use std::arch::x86_64::{CpuidResult, __cpuid_count, __get_cpuid_max}; +use std::arch::x86_64::{__cpuid_count, __get_cpuid_max, CpuidResult}; use crate::cpu_leaf::*; diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 4ac8ffd3a..59ba3ab68 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -81,5 +81,8 @@ uds_windows = "1.2" [target.'cfg(target_os = "windows")'.dev-dependencies] env_logger = "0.11" +[target.'cfg(unix)'.dev-dependencies] +devices = { package = "krun-devices", version = "=0.1.0-1.18.0", path = "../devices", features = ["test_utils"] } + [dev-dependencies] tempfile = "3" From 2ffc9ef090952fee271c2ecac5df60b8ef50b5af Mon Sep 17 00:00:00 2001 From: lile Date: Tue, 5 May 2026 22:22:45 +0800 Subject: [PATCH 33/33] fix(ci): gate boot_kernel example behind cfg(windows) The boot_kernel example uses vmm::windows::* modules and env_logger (Windows dev-dep), but Cargo auto-discovers examples and tries to compile them on all platforms. Add cfg(windows) gates and a stub main() for non-Windows platforms. Co-Authored-By: Claude Opus 4.6 --- src/vmm/examples/boot_kernel.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/vmm/examples/boot_kernel.rs b/src/vmm/examples/boot_kernel.rs index 56dd4f0d3..32179ed30 100644 --- a/src/vmm/examples/boot_kernel.rs +++ b/src/vmm/examples/boot_kernel.rs @@ -26,10 +26,17 @@ //! # Full lifecycle test: disk + init + argv //! boot_kernel.exe vmlinuz --disk rootfs.img --init /init --argv --listen --argv vsock://2695 -use std::path::PathBuf; +#[cfg(not(target_os = "windows"))] +fn main() { + eprintln!("boot_kernel: this example requires Windows (WHPX hypervisor)"); +} +#[cfg(target_os = "windows")] +use std::path::PathBuf; +#[cfg(target_os = "windows")] use vmm::windows::context::{DiskConfig, VsockPort, DISK_FORMAT_RAW}; +#[cfg(target_os = "windows")] fn main() { // Initialize logging (RUST_LOG controls verbosity). env_logger::init();