diff --git a/.github/workflows/ValidatePullRequest.yml b/.github/workflows/ValidatePullRequest.yml index 9fc686bba..9356a38d9 100644 --- a/.github/workflows/ValidatePullRequest.yml +++ b/.github/workflows/ValidatePullRequest.yml @@ -67,7 +67,7 @@ jobs: - docs-pr - build-guests strategy: - fail-fast: true + fail-fast: false matrix: hypervisor: [hyperv, 'hyperv-ws2025', mshv3, kvm] cpu: [amd, intel] diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs index 7fdd83b8f..28076d797 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm.rs @@ -48,7 +48,7 @@ use crate::hypervisor::hyperv_linux::MshvVm; use crate::hypervisor::hyperv_windows::WhpVm; #[cfg(kvm)] use crate::hypervisor::kvm::KvmVm; -use crate::hypervisor::regs::CommonSpecialRegisters; +use crate::hypervisor::regs::{CommonDebugRegs, CommonSpecialRegisters}; #[cfg(target_os = "windows")] use crate::hypervisor::wrappers::HandleWrapper; use crate::hypervisor::{HyperlightExit, InterruptHandle, InterruptHandleImpl, get_max_log_level}; @@ -88,6 +88,9 @@ pub(crate) struct HyperlightVm { next_slot: u32, // Monotonically increasing slot number freed_slots: Vec, // Reusable slots from unmapped regions + // pml4 saved to be able to restore it if needed + #[cfg(feature = "init-paging")] + pml4_addr: u64, #[cfg(gdb)] gdb_conn: Option>, #[cfg(gdb)] @@ -189,6 +192,8 @@ impl HyperlightVm { mmap_regions: Vec::new(), freed_slots: Vec::new(), + #[cfg(feature = "init-paging")] + pml4_addr: _pml4_addr, #[cfg(gdb)] gdb_conn, #[cfg(gdb)] @@ -585,6 +590,36 @@ impl HyperlightVm { Ok(()) } + // Resets the following vCPU state: + // - General purpose registers + // - Debug registers + // - XSAVE (overlaps with FPU) + // - FPU registers (to set default FPU state) + // - Special registers (with saved PML4 if feature enabled) + // TODO: check if we can't avoid calling set_fpu and only use reset_xsave + // TODO: 
check if other state needs to be reset + pub(crate) fn reset_vcpu(&self) -> Result<()> { + self.vm.set_regs(&CommonRegisters { + rflags: 1 << 1, // Reserved bit always set + ..Default::default() + })?; + self.vm.set_debug_regs(&CommonDebugRegs::default())?; + // Note: On KVM this ignores MXCSR so it's being set as part of reset_xsave. + // See https://github.com/torvalds/linux/blob/d358e5254674b70f34c847715ca509e46eb81e6f/arch/x86/kvm/x86.c#L12554-L12599 + self.vm.set_fpu(&CommonFpu::default())?; + self.vm.reset_xsave()?; + #[cfg(feature = "init-paging")] + self.vm + .set_sregs(&CommonSpecialRegisters::standard_64bit_defaults( + self.pml4_addr, + ))?; + #[cfg(not(feature = "init-paging"))] + self.vm + .set_sregs(&CommonSpecialRegisters::standard_real_mode_defaults())?; + + Ok(()) + } + // Handle a debug exit #[cfg(gdb)] fn handle_debug( @@ -1080,3 +1115,1027 @@ mod debug { } } } + +#[cfg(test)] +#[cfg(feature = "init-paging")] +#[allow(clippy::needless_range_loop)] +mod tests { + use std::sync::{Arc, Mutex}; + + use super::*; + #[cfg(kvm)] + use crate::hypervisor::regs::FP_CONTROL_WORD_DEFAULT; + use crate::hypervisor::regs::{CommonSegmentRegister, CommonTableRegister, MXCSR_DEFAULT}; + #[cfg(target_os = "windows")] + use crate::hypervisor::wrappers::HandleWrapper; + #[cfg(feature = "mem_profile")] + use crate::mem::exe::DummyUnwindInfo; + use crate::mem::layout::SandboxMemoryLayout; + use crate::mem::mgr::SandboxMemoryManager; + use crate::mem::ptr::RawPtr; + use crate::mem::ptr_offset::Offset; + use crate::mem::shared_mem::{ExclusiveSharedMemory, SharedMemory}; + use crate::sandbox::SandboxConfiguration; + use crate::sandbox::host_funcs::FunctionRegistry; + #[cfg(feature = "mem_profile")] + use crate::sandbox::trace::MemTraceInfo; + #[cfg(crashdump)] + use crate::sandbox::uninitialized::SandboxRuntimeConfig; + + /// Build dirty general purpose registers for testing reset_vcpu. 
    fn dirty_regs() -> CommonRegisters {
        // Each register gets a distinct, recognizable pattern so a failed
        // reset (or a cross-register copy bug) is easy to spot in a diff.
        CommonRegisters {
            rax: 0x1111111111111111,
            rbx: 0x2222222222222222,
            rcx: 0x3333333333333333,
            rdx: 0x4444444444444444,
            rsi: 0x5555555555555555,
            rdi: 0x6666666666666666,
            rsp: 0x7777777777777777,
            rbp: 0x8888888888888888,
            r8: 0x9999999999999999,
            r9: 0xAAAAAAAAAAAAAAAA,
            r10: 0xBBBBBBBBBBBBBBBB,
            r11: 0xCCCCCCCCCCCCCCCC,
            r12: 0xDDDDDDDDDDDDDDDD,
            r13: 0xEEEEEEEEEEEEEEEE,
            r14: 0xFFFFFFFFFFFFFFFF,
            r15: 0x0123456789ABCDEF,
            rip: 0xFEDCBA9876543210,
            rflags: 0x202, // IF + reserved bit 1
        }
    }

    /// Build dirty FPU state for testing reset_vcpu.
    fn dirty_fpu() -> CommonFpu {
        CommonFpu {
            fpr: [[0xAB; 16]; 8],
            fcw: 0x0F7F, // Different from default 0x037F
            fsw: 0x1234,
            ftwx: 0xAB,
            last_opcode: 0x0123,
            last_ip: 0xDEADBEEF00000000,
            last_dp: 0xCAFEBABE00000000,
            xmm: [[0xCD; 16]; 16],
            mxcsr: 0x3F80, // Different from default 0x1F80
        }
    }

    /// Build dirty special registers for testing reset_vcpu.
    /// Must be consistent for 64-bit long mode (CR0/CR4/EFER).
    fn dirty_sregs(_pml4_addr: u64) -> CommonSpecialRegisters {
        // Generic flat data segment reused for DS/ES/FS/GS/SS/LDT below.
        let segment = CommonSegmentRegister {
            base: 0x1000,
            limit: 0xFFFF,
            selector: 0x10,
            type_: 3, // data segment, read/write, accessed
            present: 1,
            dpl: 0,
            db: 1,
            s: 1,
            l: 0,
            g: 1,
            avl: 1,
            unusable: 0,
            padding: 0,
        };
        // CS segment - 64-bit code segment
        let cs_segment = CommonSegmentRegister {
            base: 0,
            limit: 0xFFFF,
            selector: 0x08,
            type_: 0b1011, // code segment, execute/read, accessed
            present: 1,
            dpl: 0,
            db: 0, // must be 0 in 64-bit mode
            s: 1,
            l: 1, // 64-bit mode
            g: 1,
            avl: 0,
            unusable: 0,
            padding: 0,
        };
        let table = CommonTableRegister {
            base: 0xDEAD0000,
            limit: 0xFFFF,
        };
        CommonSpecialRegisters {
            cs: cs_segment,
            ds: segment,
            es: segment,
            fs: segment,
            gs: segment,
            ss: segment,
            tr: CommonSegmentRegister {
                type_: 0b1011, // busy TSS
                present: 1,
                ..segment
            },
            ldt: segment,
            gdt: table,
            idt: table,
            cr0: 0x80000011, // PE + ET + PG
            cr2: 0xBADC0DE,
            // MSHV validates cr3 and rejects bogus values; use valid _pml4_addr for MSHV
            cr3: match get_available_hypervisor() {
                #[cfg(mshv3)]
                Some(HypervisorType::Mshv) => _pml4_addr,
                _ => 0x12345000,
            },
            cr4: 0x20, // PAE
            cr8: 0x5,
            efer: 0x500, // LME + LMA
            apic_base: 0xFEE00900,
            // An all-ones bitmap would be the more aggressive dirty value,
            // but a non-zero interrupt_bitmap is rejected on MSHV, so keep it clear.
            interrupt_bitmap: [0; 4], // fails if non-zero on MSHV
        }
    }

    /// Build dirty debug registers for testing reset_vcpu.
    ///
    /// DR6 bit layout (Intel SDM / AMD APM):
    /// Bits 0-3 (B0-B3): Breakpoint condition detected - software writable/clearable
    /// Bits 4-10: Reserved, read as 1s on modern processors (read-only)
    /// Bit 11 (BLD): Bus Lock Trap - cleared by processor, read-only on older CPUs
    /// Bit 12: Reserved, always 0
    /// Bit 13 (BD): Debug Register Access Detected - software clearable
    /// Bit 14 (BS): Single-Step - software clearable
    /// Bit 15 (BT): Task Switch breakpoint - software clearable
    /// Bit 16 (RTM): TSX-related, read-only (1 if no TSX)
    /// Bits 17-31: Reserved, read as 1s on modern processors (read-only)
    /// Bits 32-63: Reserved, must be 0
    ///
    /// Writable bits: 0-3, 13, 14, 15 = mask 0xE00F
    /// Reserved 1s: 4-10, 11 (if no BLD), 16 (if no TSX), 17-31 = ~0xE00F on lower 32 bits
    const DR6_WRITABLE_MASK: u64 = 0xE00F; // B0-B3, BD, BS, BT

    /// DR7 bit layout:
    /// Bits 0-7 (L0-L3, G0-G3): Local/global breakpoint enables - writable
    /// Bits 8-9 (LE, GE): Local/Global Exact (386 only, ignored on modern) - writable
    /// Bit 10: Reserved, must be 1 (read-only)
    /// Bits 11-12: Reserved (RTM/TSX on some CPUs), must be 0 (read-only)
    /// Bit 13 (GD): General Detect Enable - writable
    /// Bits 14-15: Reserved, must be 0 (read-only)
    /// Bits 16-31 (R/W0-3, LEN0-3): Breakpoint conditions and lengths - writable
    /// Bits 32-63: Reserved, must be 0 (read-only)
    ///
    /// Writable bits: 0-9, 13, 16-31 = mask 0xFFFF23FF
    const DR7_WRITABLE_MASK: u64 = 0xFFFF_23FF;

    // Fill DR0-DR3 with recognizable addresses and set every software-writable
    // bit of DR6/DR7, so the subsequent reset has something to clear.
    fn dirty_debug_regs() -> CommonDebugRegs {
        CommonDebugRegs {
            dr0: 0xDEADBEEF00001000,
            dr1: 0xDEADBEEF00002000,
            dr2: 0xDEADBEEF00003000,
            dr3: 0xDEADBEEF00004000,
            // Set all writable bits: B0-B3 (0-3), BD (13), BS (14), BT (15)
            dr6: DR6_WRITABLE_MASK,
            // Set writable bits: L0-L3, G0-G3 (0-7), LE/GE (8-9), GD (13), conditions (16-31)
            dr7: DR7_WRITABLE_MASK,
        }
    }

    /// Query CPUID.0DH.n for XSAVE component info.
    /// Returns (size, offset, align_64) for the given component:
    /// - size: CPUID.0DH.n:EAX - size in bytes
    /// - offset: CPUID.0DH.n:EBX - offset from XSAVE base (standard format only)
    /// - align_64: CPUID.0DH.n:ECX bit 1 - true if 64-byte aligned (compacted format)
    fn xsave_component_info(comp_id: u32) -> (usize, usize, bool) {
        // SAFETY: CPUID is unprivileged and always available on x86_64;
        // leaf 0xD only reads processor capability information.
        let result = unsafe { std::arch::x86_64::__cpuid_count(0xD, comp_id) };
        let size = result.eax as usize;
        let offset = result.ebx as usize;
        let align_64 = (result.ecx & 0b10) != 0;
        (size, offset, align_64)
    }

    /// Query CPUID.0DH.00H for the bitmap of supported user state components.
    /// EDX:EAX forms a 64-bit bitmap where bit i indicates support for component i.
    fn xsave_supported_components() -> u64 {
        // SAFETY: CPUID is unprivileged and always available on x86_64.
        let result = unsafe { std::arch::x86_64::__cpuid_count(0xD, 0) };
        (result.edx as u64) << 32 | (result.eax as u64)
    }

    /// Dirty extended state components using compacted XSAVE format (MSHV/WHP).
    /// Components are stored contiguously starting at byte 576, with alignment
    /// requirements from CPUID.0DH.n:ECX[1].
    /// Returns a bitmask of components that were actually dirtied.
    fn dirty_xsave_extended_compacted(
        xsave: &mut [u32],
        xcomp_bv: u64,
        supported_components: u64,
    ) -> u64 {
        let mut dirtied_mask = 0u64;
        // Compacted extended area starts right after the 512-byte legacy
        // region plus the 64-byte XSAVE header.
        let mut offset = 576usize;

        // Components 0 and 1 (x87/SSE) live in the legacy region; start at 2.
        for comp_id in 2..63u32 {
            // Skip if component not supported by CPU or not enabled in XCOMP_BV
            if (supported_components & (1u64 << comp_id)) == 0 {
                continue;
            }
            if (xcomp_bv & (1u64 << comp_id)) == 0 {
                continue;
            }

            let (size, _, align_64) = xsave_component_info(comp_id);

            // ECX[1]=1 means 64-byte aligned; ECX[1]=0 means immediately after previous
            if align_64 {
                offset = offset.next_multiple_of(64);
            }

            // Dirty this component's data area (only if it fits in the buffer)
            let start_idx = offset / 4;
            let end_idx = (offset + size) / 4;
            if end_idx <= xsave.len() {
                for i in start_idx..end_idx {
                    // Per-component pattern so components can be told apart.
                    xsave[i] = 0x12345678 ^ comp_id.wrapping_mul(0x11111111);
                }
                dirtied_mask |= 1u64 << comp_id;
            }

            offset += size;
        }

        dirtied_mask
    }

    /// Dirty extended state components using standard XSAVE format (KVM).
    /// Components are at fixed offsets from CPUID.0DH.n:EBX.
    /// Returns a bitmask of components that were actually dirtied.
    fn dirty_xsave_extended_standard(xsave: &mut [u32], supported_components: u64) -> u64 {
        let mut dirtied_mask = 0u64;

        for comp_id in 2..63u32 {
            // Skip if component not supported by CPU
            if (supported_components & (1u64 << comp_id)) == 0 {
                continue;
            }

            let (size, fixed_offset, _) = xsave_component_info(comp_id);

            let start_idx = fixed_offset / 4;
            let end_idx = (fixed_offset + size) / 4;
            if end_idx <= xsave.len() {
                for i in start_idx..end_idx {
                    // Same per-component pattern as the compacted variant.
                    xsave[i] = 0x12345678 ^ comp_id.wrapping_mul(0x11111111);
                }
                dirtied_mask |= 1u64 << comp_id;
            }
        }

        dirtied_mask
    }

    /// Dirty the legacy XSAVE region (bytes 0-511) for testing reset_vcpu.
    /// This includes FPU/x87 state, SSE state, and reserved areas.
    ///
    /// Layout (from Intel SDM Table 13-1):
    /// Bytes 0-1: FCW, 2-3: FSW, 4: FTW, 5: reserved, 6-7: FOP
    /// Bytes 8-15: FIP, 16-23: FDP
    /// Bytes 24-27: MXCSR, 28-31: MXCSR_MASK (preserve - hardware defined)
    /// Bytes 32-159: ST0-ST7/MM0-MM7 (8 regs × 16 bytes)
    /// Bytes 160-415: XMM0-XMM15 (16 regs × 16 bytes)
    /// Bytes 416-511: Reserved
    fn dirty_xsave_legacy(xsave: &mut [u32], current_xsave: &[u8]) {
        // FCW (bytes 0-1) + FSW (bytes 2-3) - pack into xsave[0]
        // FCW = 0x0F7F (different from default 0x037F), FSW = 0x1234
        xsave[0] = 0x0F7F | (0x1234 << 16);
        // FTW (byte 4) + reserved (byte 5) + FOP (bytes 6-7) - pack into xsave[1]
        // FTW = 0xAB, FOP = 0x0123
        xsave[1] = 0xAB | (0x0123 << 16);
        // FIP (bytes 8-15) - xsave[2] and xsave[3]
        xsave[2] = 0xDEAD0001;
        xsave[3] = 0xBEEF0002;
        // FDP (bytes 16-23) - xsave[4] and xsave[5]
        xsave[4] = 0xCAFE0003;
        xsave[5] = 0xBABE0004;
        // MXCSR (bytes 24-27) - xsave[6], use valid value different from default
        xsave[6] = 0x3F80;
        // xsave[7] is MXCSR_MASK - preserve from current (hardware defined, read-only)
        if current_xsave.len() >= 32 {
            xsave[7] = u32::from_le_bytes(current_xsave[28..32].try_into().unwrap());
        }

        // ST0-ST7/MM0-MM7 (bytes 32-159, indices 8-39)
        for i in 8..40 {
            xsave[i] = 0xCAFEBABE;
        }
        // XMM0-XMM15 (bytes 160-415, indices 40-103)
        for i in 40..104 {
            xsave[i] = 0xDEADBEEF;
        }

        // Reserved area (bytes 416-511, indices 104-127)
        for i in 104..128 {
            xsave[i] = 0xABCDEF12;
        }
    }

    /// Preserve XSAVE header (bytes 512-575) from current state.
    /// This includes XSTATE_BV and XCOMP_BV which hypervisors require.
    fn preserve_xsave_header(xsave: &mut [u32], current_xsave: &[u8]) {
        // Header occupies u32 indices 128..144 (bytes 512..576).
        for i in 128..144 {
            let byte_offset = i * 4;
            xsave[i] = u32::from_le_bytes(
                current_xsave[byte_offset..byte_offset + 4]
                    .try_into()
                    .unwrap(),
            );
        }
    }

    // Build a fully-dirtied XSAVE image based on the guest's current one:
    // dirty the legacy region, preserve the header, dirty supported extended
    // components, then update XSTATE_BV to match what was actually written.
    // NOTE(review): the return type's generic parameter was garbled in transit;
    // reconstructed as Vec<u32> from the `vec![0u32; ..]` body - confirm upstream.
    fn dirty_xsave(current_xsave: &[u8]) -> Vec<u32> {
        let mut xsave = vec![0u32; current_xsave.len() / 4];

        dirty_xsave_legacy(&mut xsave, current_xsave);
        preserve_xsave_header(&mut xsave, current_xsave);

        let xcomp_bv = u64::from_le_bytes(current_xsave[520..528].try_into().unwrap());
        let supported_components = xsave_supported_components();

        // Dirty extended components and get mask of what was actually dirtied
        let extended_mask = if (xcomp_bv & (1u64 << 63)) != 0 {
            // Compacted format (MSHV/WHP)
            dirty_xsave_extended_compacted(&mut xsave, xcomp_bv, supported_components)
        } else {
            // Standard format (KVM)
            dirty_xsave_extended_standard(&mut xsave, supported_components)
        };

        // UPDATE XSTATE_BV to indicate dirtied components have valid data.
        // WHP validates consistency between XSTATE_BV and actual data in the buffer.
        // Bits 0,1 = legacy x87/SSE (always set after dirty_xsave_legacy)
        // Bits 2+ = extended components that we actually dirtied
        let xstate_bv = 0x3 | extended_mask;

        // Write XSTATE_BV to bytes 512-519 (u32 indices 128-129)
        xsave[128] = (xstate_bv & 0xFFFFFFFF) as u32;
        xsave[129] = (xstate_bv >> 32) as u32;

        xsave
    }

    // Test fixture: build, map and initialise a minimal HyperlightVm whose
    // guest memory contains `code` at the guest code offset, then run it to
    // completion of `initialise` so the vCPU is in a realistic post-init state.
    // NOTE(review): Result's success type was garbled in transit; reconstructed
    // as Result<HyperlightVm> from `Ok(vm)` - confirm upstream.
    fn hyperlight_vm(code: &[u8]) -> Result<HyperlightVm> {
        let config: SandboxConfiguration = Default::default();
        #[cfg(crashdump)]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        #[cfg(feature = "mem_profile")]
        let trace_info = MemTraceInfo::new(Arc::new(DummyUnwindInfo {})).unwrap();

        let layout = SandboxMemoryLayout::new(config, code.len(), 4096, 0, 0, None)?;

        let mem_size = layout.get_memory_size()?;
        let eshm = ExclusiveSharedMemory::new(mem_size)?;

        let stack_cookie = [0u8; 16];
        let mem_mgr =
            SandboxMemoryManager::new(layout, eshm, RawPtr::from(0), Offset::from(0), stack_cookie);

        let (mut mem_mgr_hshm, mut mem_mgr_gshm) = mem_mgr.build();

        // Set up shared memory (page tables)
        let rsp = {
            let mut regions = layout.get_memory_regions(&mem_mgr_gshm.shared_mem)?;
            let mem_size = mem_mgr_gshm.shared_mem.mem_size() as u64;
            mem_mgr_gshm.set_up_shared_memory(mem_size, &mut regions)?
        };

        // Write code
        let code_offset = layout.get_guest_code_offset();
        mem_mgr_hshm.shared_mem.copy_from_slice(code, code_offset)?;

        // Get regions for VM
        let regions = layout
            .get_memory_regions(&mem_mgr_gshm.shared_mem)?
            .into_iter()
            .filter(|r| !r.guest_region.is_empty())
            .collect();

        // Calculate pml4_addr
        let pml4_addr = SandboxMemoryLayout::PML4_OFFSET as u64;

        // Entrypoint
        let entrypoint = layout.get_guest_code_address() as u64;

        let mut vm = HyperlightVm::new(
            regions,
            pml4_addr,
            entrypoint,
            rsp,
            &config,
            #[cfg(target_os = "windows")]
            HandleWrapper::from(
                mem_mgr_hshm
                    .shared_mem
                    .with_exclusivity(|s| s.get_mmap_file_handle())?,
            ),
            #[cfg(target_os = "windows")]
            mem_mgr_hshm.shared_mem.raw_mem_size(),
            #[cfg(gdb)]
            None,
            #[cfg(crashdump)]
            rt_cfg,
            #[cfg(feature = "mem_profile")]
            trace_info,
        )?;

        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr_hshm.clone()));

        // Run the VM
        vm.initialise(
            RawPtr::from(0),
            0,
            4096,
            &mut mem_mgr_hshm,
            &host_funcs,
            None,
            #[cfg(gdb)]
            dbg_mem_access_fn.clone(),
        )?;
        Ok(vm)
    }

    // End-to-end check of reset_vcpu: dirty every piece of vCPU state via the
    // set_* APIs, verify it stuck, reset, then verify everything returned to
    // its documented default - modulo per-hypervisor quirks called out inline.
    #[test]
    fn reset_vcpu_simple() {
        const CODE: [u8; 1] = [0xf4]; // hlt
        let hyperlight_vm = hyperlight_vm(&CODE).unwrap();
        let available_hv = *get_available_hypervisor().as_ref().unwrap();

        // Set all vCPU state to dirty values
        let regs = dirty_regs();
        let fpu = dirty_fpu();
        let sregs = dirty_sregs(hyperlight_vm.pml4_addr);
        let current_xsave = hyperlight_vm.vm.xsave().unwrap();
        let xsave = dirty_xsave(&current_xsave);
        let debug_regs = dirty_debug_regs();

        // Set xsave first: set_fpu/set_sregs afterwards overwrite the
        // overlapping legacy fields with the values asserted below.
        hyperlight_vm.vm.set_xsave(&xsave).unwrap();
        hyperlight_vm.vm.set_regs(&regs).unwrap();
        hyperlight_vm.vm.set_fpu(&fpu).unwrap();
        hyperlight_vm.vm.set_sregs(&sregs).unwrap();
        hyperlight_vm.vm.set_debug_regs(&debug_regs).unwrap();

        // Verify state was set
        assert_eq!(hyperlight_vm.vm.regs().unwrap(), regs);
        #[cfg_attr(not(kvm), allow(unused_mut))]
        let mut got_fpu = hyperlight_vm.vm.fpu().unwrap();
        let mut expected_fpu = fpu;
        // KVM doesn't preserve mxcsr via set_fpu/fpu()
        #[cfg(kvm)]
        if available_hv == HypervisorType::Kvm {
            got_fpu.mxcsr = fpu.mxcsr;
        }
        // fpr only uses 80 bits per register. Normalize upper bits for comparison.
        for i in 0..8 {
            expected_fpu.fpr[i][10..16].copy_from_slice(&got_fpu.fpr[i][10..16]);
        }
        assert_eq!(got_fpu, expected_fpu);

        // Verify debug regs
        let got_debug_regs = hyperlight_vm.vm.debug_regs().unwrap();
        let mut expected_debug_regs = debug_regs;
        // DR6: writable bits are B0-B3 (0-3), BD (13), BS (14), BT (15) = 0xE00F
        // Reserved bits (4-12, 16-31) are read-only and set by CPU, copy from actual
        expected_debug_regs.dr6 =
            (debug_regs.dr6 & DR6_WRITABLE_MASK) | (got_debug_regs.dr6 & !DR6_WRITABLE_MASK);
        // DR7: writable bits are 0-9, 13, 16-31 = 0xFFFF23FF
        // Reserved bits (10-12, 14-15) have fixed values, copy from actual
        expected_debug_regs.dr7 =
            (debug_regs.dr7 & DR7_WRITABLE_MASK) | (got_debug_regs.dr7 & !DR7_WRITABLE_MASK);
        assert_eq!(got_debug_regs, expected_debug_regs);

        // Verify sregs were set
        let got_sregs = hyperlight_vm.vm.sregs().unwrap();
        let mut expected_sregs = sregs;
        // ss.db (stack segment default size) may differ by hypervisor; ignored in 64-bit mode
        expected_sregs.ss.db = got_sregs.ss.db;
        // unusable and g are hypervisor implementation details (see comment below for details)
        expected_sregs.cs.unusable = got_sregs.cs.unusable;
        expected_sregs.cs.g = got_sregs.cs.g;
        expected_sregs.ds.unusable = got_sregs.ds.unusable;
        expected_sregs.ds.g = got_sregs.ds.g;
        expected_sregs.es.unusable = got_sregs.es.unusable;
        expected_sregs.es.g = got_sregs.es.g;
        expected_sregs.fs.unusable = got_sregs.fs.unusable;
        expected_sregs.fs.g = got_sregs.fs.g;
        expected_sregs.gs.unusable = got_sregs.gs.unusable;
        expected_sregs.gs.g = got_sregs.gs.g;
        expected_sregs.ss.unusable = got_sregs.ss.unusable;
        expected_sregs.ss.g = got_sregs.ss.g;
        expected_sregs.tr.unusable = got_sregs.tr.unusable;
        expected_sregs.tr.g = got_sregs.tr.g;
        expected_sregs.ldt.unusable = got_sregs.ldt.unusable;
        expected_sregs.ldt.g = got_sregs.ldt.g;
        assert_eq!(got_sregs, expected_sregs);

        // Reset the vCPU
        hyperlight_vm.reset_vcpu().unwrap();

        // Verify the general purpose registers were reset to defaults
        assert_eq!(
            hyperlight_vm.vm.regs().unwrap(),
            CommonRegisters {
                rflags: 1 << 1, // Reserved bit 1 is always set
                ..Default::default()
            }
        );

        // Verify the FPU was reset to defaults
        #[cfg_attr(not(kvm), allow(unused_mut))]
        let mut reset_fpu = hyperlight_vm.vm.fpu().unwrap();
        // KVM ignores mxcsr in its set_fpu/fpu()
        #[cfg(kvm)]
        if available_hv == HypervisorType::Kvm {
            reset_fpu.mxcsr = MXCSR_DEFAULT;
        }
        assert_eq!(reset_fpu, CommonFpu::default());

        // Verify debug registers are reset to defaults
        // Reserved bits in DR6/DR7 are read-only (set by CPU), copy from actual
        // Writable bits should be cleared to 0 after reset
        let reset_debug_regs = hyperlight_vm.vm.debug_regs().unwrap();
        let expected_reset_debug_regs = CommonDebugRegs {
            dr6: reset_debug_regs.dr6 & !DR6_WRITABLE_MASK,
            dr7: reset_debug_regs.dr7 & !DR7_WRITABLE_MASK,
            ..Default::default()
        };
        assert_eq!(reset_debug_regs, expected_reset_debug_regs);

        // Verify xsave is reset - should be zeroed except for hypervisor-specific fields
        let reset_xsave = hyperlight_vm.vm.xsave().unwrap();
        // Build expected xsave: all zeros with fpu specific defaults. Then copy hypervisor-specific fields from actual
        let mut expected_xsave = vec![0u8; reset_xsave.len()];
        #[cfg(mshv3)]
        if available_hv == HypervisorType::Mshv {
            // FCW (offset 0-1): When XSTATE_BV.LegacyX87 = 0 (init state), the hypervisor
            // skips copying the FPU legacy region entirely, leaving zeros in the buffer.
            // The actual guest FCW register is 0x037F (verified via fpu() assertion above),
            // but xsave() doesn't report it because XSTATE_BV=0 means "init state, buffer
            // contents undefined." We copy from actual to handle this.
            expected_xsave[0..2].copy_from_slice(&reset_xsave[0..2]);
        }
        #[cfg(target_os = "windows")]
        if available_hv == HypervisorType::Whp {
            // FCW (offset 0-1): When XSTATE_BV.LegacyX87 = 0 (init state), the hypervisor
            // skips copying the FPU legacy region entirely, leaving zeros in the buffer.
            // The actual guest FCW register is 0x037F (verified via fpu() assertion above),
            // but xsave() doesn't report it because XSTATE_BV=0 means "init state, buffer
            // contents undefined." We copy from actual to handle this.
            expected_xsave[0..2].copy_from_slice(&reset_xsave[0..2]);
        }
        #[cfg(kvm)]
        if available_hv == HypervisorType::Kvm {
            expected_xsave[0..2].copy_from_slice(&FP_CONTROL_WORD_DEFAULT.to_le_bytes());
        }

        // - MXCSR at offset 24-27: default FPU state set by hypervisor
        expected_xsave[24..28].copy_from_slice(&MXCSR_DEFAULT.to_le_bytes());
        // - MXCSR_MASK at offset 28-31: hardware-defined, read-only
        expected_xsave[28..32].copy_from_slice(&reset_xsave[28..32]);
        // - Reserved bytes at offset 464-511: These are in the reserved/padding area of the legacy
        //   FXSAVE region (after XMM registers which end at byte 416). On KVM/Intel, these bytes
        //   may contain hypervisor-specific metadata that isn't cleared during vCPU reset.
        //   Since this is not guest-visible computational state, we copy from actual to expected.
        expected_xsave[464..512].copy_from_slice(&reset_xsave[464..512]);
        // - XSAVE header at offset 512-575: contains XSTATE_BV and XCOMP_BV (hypervisor-managed)
        //   XSTATE_BV (512-519): Bitmap indicating which state components have valid data in the
        //   buffer. When a bit is 0, the hypervisor uses the architectural init value for that
        //   component. After reset, xsave() may still return non-zero XSTATE_BV since the
        //   hypervisor reports which components it manages, not which have been modified.
        //   XCOMP_BV (520-527): Compaction bitmap. Bit 63 indicates compacted format (used by MSHV/WHP).
        //   When set, the XSAVE area uses a compact layout where only enabled components are stored
        //   contiguously. This is a format indicator, not state data, so it's preserved across reset.
        //   Both fields are managed by the hypervisor to describe the XSAVE area format and capabilities,
        //   not guest-visible computational state, so they don't need to be zeroed on reset.
        if reset_xsave.len() >= 576 {
            expected_xsave[512..576].copy_from_slice(&reset_xsave[512..576]);
        }
        assert_eq!(
            reset_xsave, expected_xsave,
            "xsave should be zeroed except for hypervisor-specific fields"
        );

        // Verify sregs are reset to defaults
        let defaults = CommonSpecialRegisters::standard_64bit_defaults(hyperlight_vm.pml4_addr);
        let reset_sregs = hyperlight_vm.vm.sregs().unwrap();
        let mut expected_reset_sregs = defaults;
        // ss.db (stack segment default size) may differ by hypervisor; ignored in 64-bit mode
        expected_reset_sregs.ss.db = reset_sregs.ss.db;
        // unusable, type_, and g (granularity) for segments are hypervisor implementation details.
        // These fields are part of the hidden descriptor cache. While guests can write them
        // indirectly (by loading segments from a crafted GDT), guests cannot read them back
        // (e.g., `mov ax, ds` only returns the selector, not the hidden cache).
        // KVM and MSHV reset to different default values, but both properly reset so there's
        // no information leakage between tenants. g=0 means byte granularity, g=1 means 4KB pages.
        expected_reset_sregs.cs.unusable = reset_sregs.cs.unusable;
        expected_reset_sregs.cs.g = reset_sregs.cs.g;
        expected_reset_sregs.ds.unusable = reset_sregs.ds.unusable;
        expected_reset_sregs.ds.type_ = reset_sregs.ds.type_;
        expected_reset_sregs.ds.g = reset_sregs.ds.g;
        expected_reset_sregs.es.unusable = reset_sregs.es.unusable;
        expected_reset_sregs.es.type_ = reset_sregs.es.type_;
        expected_reset_sregs.es.g = reset_sregs.es.g;
        expected_reset_sregs.fs.unusable = reset_sregs.fs.unusable;
        expected_reset_sregs.fs.type_ = reset_sregs.fs.type_;
        expected_reset_sregs.fs.g = reset_sregs.fs.g;
        expected_reset_sregs.gs.unusable = reset_sregs.gs.unusable;
        expected_reset_sregs.gs.type_ = reset_sregs.gs.type_;
        expected_reset_sregs.gs.g = reset_sregs.gs.g;
        expected_reset_sregs.ss.unusable = reset_sregs.ss.unusable;
        expected_reset_sregs.ss.type_ = reset_sregs.ss.type_;
        expected_reset_sregs.ss.g = reset_sregs.ss.g;
        expected_reset_sregs.tr.unusable = reset_sregs.tr.unusable;
        expected_reset_sregs.tr.g = reset_sregs.tr.g;
        expected_reset_sregs.ldt.unusable = reset_sregs.ldt.unusable;
        expected_reset_sregs.ldt.g = reset_sregs.ldt.g;
        assert_eq!(reset_sregs, expected_reset_sregs);
    }

    /// Tests that actually run code, as opposed to just setting vCPU state.
+ mod run_tests { + use super::*; + + #[test] + fn reset_vcpu_regs() { + #[rustfmt::skip] + const CODE: [u8; 151] = [ + 0x48, 0xb8, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, // mov rax, 0x1111111111111111 + 0x48, 0xbb, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, // mov rbx, 0x2222222222222222 + 0x48, 0xb9, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, // mov rcx, 0x3333333333333333 + 0x48, 0xba, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, // mov rdx, 0x4444444444444444 + 0x48, 0xbe, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // mov rsi, 0x5555555555555555 + 0x48, 0xbf, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, // mov rdi, 0x6666666666666666 + 0x48, 0xbd, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, // mov rbp, 0x7777777777777777 + 0x49, 0xb8, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, // mov r8, 0x8888888888888888 + 0x49, 0xb9, 0x99, 0x99, 0x99, 0x99, 0x99, 0x99, 0x99, 0x99, // mov r9, 0x9999999999999999 + 0x49, 0xba, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, // mov r10, 0xAAAAAAAAAAAAAAAA + 0x49, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, 0xbb, // mov r11, 0xBBBBBBBBBBBBBBBB + 0x49, 0xbc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // mov r12, 0xCCCCCCCCCCCCCCCC + 0x49, 0xbd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, 0xdd, // mov r13, 0xDDDDDDDDDDDDDDDD + 0x49, 0xbe, 0xee, 0xee, 0xee, 0xee, 0xee, 0xee, 0xee, 0xee, // mov r14, 0xEEEEEEEEEEEEEEEE + 0x49, 0xbf, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // mov r15, 0xFFFFFFFFFFFFFFFF + 0xf4, // hlt + ]; + + let hyperlight_vm = hyperlight_vm(&CODE).unwrap(); + + // After run, check registers match expected dirty state + let regs = hyperlight_vm.vm.regs().unwrap(); + let mut expected_dirty = CommonRegisters { + rax: 0x1111111111111111, + rbx: 0x2222222222222222, + rcx: 0x3333333333333333, + rdx: 0x4444444444444444, + rsi: 0x5555555555555555, + rdi: 0x6666666666666666, + rsp: 0, + rbp: 0x7777777777777777, + r8: 0x8888888888888888, + r9: 0x9999999999999999, + r10: 
0xAAAAAAAAAAAAAAAA, + r11: 0xBBBBBBBBBBBBBBBB, + r12: 0xCCCCCCCCCCCCCCCC, + r13: 0xDDDDDDDDDDDDDDDD, + r14: 0xEEEEEEEEEEEEEEEE, + r15: 0xFFFFFFFFFFFFFFFF, + rip: 0, + rflags: 0, + }; + // rip, rsp, and rflags are set by the CPU, we don't expect those to match our expected values + expected_dirty.rip = regs.rip; + expected_dirty.rsp = regs.rsp; + expected_dirty.rflags = regs.rflags; + assert_eq!(regs, expected_dirty); + + // Reset vcpu + hyperlight_vm.reset_vcpu().unwrap(); + + // Check registers are reset to defaults + let regs = hyperlight_vm.vm.regs().unwrap(); + let expected_reset = CommonRegisters { + rax: 0, + rbx: 0, + rcx: 0, + rdx: 0, + rsi: 0, + rdi: 0, + rsp: 0, + rbp: 0, + r8: 0, + r9: 0, + r10: 0, + r11: 0, + r12: 0, + r13: 0, + r14: 0, + r15: 0, + rip: 0, + rflags: 1 << 1, // Reserved bit 1 is always set + }; + assert_eq!(regs, expected_reset); + } + + #[test] + fn reset_vcpu_fpu() { + #[cfg(kvm)] + use crate::hypervisor::regs::MXCSR_DEFAULT; + + #[cfg(kvm)] + let available_hv = *get_available_hypervisor().as_ref().unwrap(); + + #[rustfmt::skip] + const CODE: [u8; 289] = [ + // xmm0-xmm7: use movd + pshufd to fill with pattern + 0xb8, 0x11, 0x11, 0x11, 0x11, // mov eax, 0x11111111 + 0x66, 0x0f, 0x6e, 0xc0, // movd xmm0, eax + 0x66, 0x0f, 0x70, 0xc0, 0x00, // pshufd xmm0, xmm0, 0 + 0xb8, 0x22, 0x22, 0x22, 0x22, // mov eax, 0x22222222 + 0x66, 0x0f, 0x6e, 0xc8, // movd xmm1, eax + 0x66, 0x0f, 0x70, 0xc9, 0x00, // pshufd xmm1, xmm1, 0 + 0xb8, 0x33, 0x33, 0x33, 0x33, // mov eax, 0x33333333 + 0x66, 0x0f, 0x6e, 0xd0, // movd xmm2, eax + 0x66, 0x0f, 0x70, 0xd2, 0x00, // pshufd xmm2, xmm2, 0 + 0xb8, 0x44, 0x44, 0x44, 0x44, // mov eax, 0x44444444 + 0x66, 0x0f, 0x6e, 0xd8, // movd xmm3, eax + 0x66, 0x0f, 0x70, 0xdb, 0x00, // pshufd xmm3, xmm3, 0 + 0xb8, 0x55, 0x55, 0x55, 0x55, // mov eax, 0x55555555 + 0x66, 0x0f, 0x6e, 0xe0, // movd xmm4, eax + 0x66, 0x0f, 0x70, 0xe4, 0x00, // pshufd xmm4, xmm4, 0 + 0xb8, 0x66, 0x66, 0x66, 0x66, // mov eax, 0x66666666 + 0x66, 
0x0f, 0x6e, 0xe8, // movd xmm5, eax + 0x66, 0x0f, 0x70, 0xed, 0x00, // pshufd xmm5, xmm5, 0 + 0xb8, 0x77, 0x77, 0x77, 0x77, // mov eax, 0x77777777 + 0x66, 0x0f, 0x6e, 0xf0, // movd xmm6, eax + 0x66, 0x0f, 0x70, 0xf6, 0x00, // pshufd xmm6, xmm6, 0 + 0xb8, 0x88, 0x88, 0x88, 0x88, // mov eax, 0x88888888 + 0x66, 0x0f, 0x6e, 0xf8, // movd xmm7, eax + 0x66, 0x0f, 0x70, 0xff, 0x00, // pshufd xmm7, xmm7, 0 + // xmm8-xmm15: REX prefix versions + 0xb8, 0x99, 0x99, 0x99, 0x99, // mov eax, 0x99999999 + 0x66, 0x44, 0x0f, 0x6e, 0xc0, // movd xmm8, eax + 0x66, 0x45, 0x0f, 0x70, 0xc0, 0x00, // pshufd xmm8, xmm8, 0 + 0xb8, 0xaa, 0xaa, 0xaa, 0xaa, // mov eax, 0xAAAAAAAA + 0x66, 0x44, 0x0f, 0x6e, 0xc8, // movd xmm9, eax + 0x66, 0x45, 0x0f, 0x70, 0xc9, 0x00, // pshufd xmm9, xmm9, 0 + 0xb8, 0xbb, 0xbb, 0xbb, 0xbb, // mov eax, 0xBBBBBBBB + 0x66, 0x44, 0x0f, 0x6e, 0xd0, // movd xmm10, eax + 0x66, 0x45, 0x0f, 0x70, 0xd2, 0x00, // pshufd xmm10, xmm10, 0 + 0xb8, 0xcc, 0xcc, 0xcc, 0xcc, // mov eax, 0xCCCCCCCC + 0x66, 0x44, 0x0f, 0x6e, 0xd8, // movd xmm11, eax + 0x66, 0x45, 0x0f, 0x70, 0xdb, 0x00, // pshufd xmm11, xmm11, 0 + 0xb8, 0xdd, 0xdd, 0xdd, 0xdd, // mov eax, 0xDDDDDDDD + 0x66, 0x44, 0x0f, 0x6e, 0xe0, // movd xmm12, eax + 0x66, 0x45, 0x0f, 0x70, 0xe4, 0x00, // pshufd xmm12, xmm12, 0 + 0xb8, 0xee, 0xee, 0xee, 0xee, // mov eax, 0xEEEEEEEE + 0x66, 0x44, 0x0f, 0x6e, 0xe8, // movd xmm13, eax + 0x66, 0x45, 0x0f, 0x70, 0xed, 0x00, // pshufd xmm13, xmm13, 0 + 0xb8, 0xff, 0xff, 0xff, 0xff, // mov eax, 0xFFFFFFFF + 0x66, 0x44, 0x0f, 0x6e, 0xf0, // movd xmm14, eax + 0x66, 0x45, 0x0f, 0x70, 0xf6, 0x00, // pshufd xmm14, xmm14, 0 + 0xb8, 0x78, 0x56, 0x34, 0x12, // mov eax, 0x12345678 + 0x66, 0x44, 0x0f, 0x6e, 0xf8, // movd xmm15, eax + 0x66, 0x45, 0x0f, 0x70, 0xff, 0x00, // pshufd xmm15, xmm15, 0 + + // Use 7 FLDs so TOP=1 after execution, different from default TOP=0. + // This ensures reset properly clears TOP, not just register contents. 
+ 0xd9, 0xee, // fldz (0.0) + 0xd9, 0xea, // fldl2e (log2(e)) + 0xd9, 0xe9, // fldl2t (log2(10)) + 0xd9, 0xec, // fldlg2 (log10(2)) + 0xd9, 0xed, // fldln2 (ln(2)) + 0xd9, 0xeb, // fldpi (pi) + // Push a memory value to also dirty last_dp + 0x48, 0xb8, 0xef, 0xbe, 0xad, 0xde, 0x00, 0x00, 0x00, 0x00, // mov rax, 0xDEADBEEF + 0x50, // push rax + 0xdd, 0x04, 0x24, // fld qword [rsp] - dirties last_dp + 0x58, // pop rax + + // Dirty FCW (0x0F7F, different from default 0x037F) + 0xb8, 0x7f, 0x0f, 0x00, 0x00, // mov eax, 0x0F7F + 0x50, // push rax + 0xd9, 0x2c, 0x24, // fldcw [rsp] + 0x58, // pop rax + + // Dirty MXCSR (0x3F80, different from default 0x1F80) + 0xb8, 0x80, 0x3f, 0x00, 0x00, // mov eax, 0x3F80 + 0x50, // push rax + 0x0f, 0xae, 0x14, 0x24, // ldmxcsr [rsp] + 0x58, // pop rax + + 0xf4, // hlt + ]; + + let hyperlight_vm = hyperlight_vm(&CODE).unwrap(); + + // After run, check FPU state matches expected dirty values + let fpu = hyperlight_vm.vm.fpu().unwrap(); + + #[cfg_attr(not(kvm), allow(unused_mut))] + let mut expected_dirty = CommonFpu { + fcw: 0x0F7F, + ftwx: 0xFE, // 7 registers valid (bit 0 empty after 7 pushes with TOP=1) + xmm: [ + 0x11111111111111111111111111111111_u128.to_le_bytes(), + 0x22222222222222222222222222222222_u128.to_le_bytes(), + 0x33333333333333333333333333333333_u128.to_le_bytes(), + 0x44444444444444444444444444444444_u128.to_le_bytes(), + 0x55555555555555555555555555555555_u128.to_le_bytes(), + 0x66666666666666666666666666666666_u128.to_le_bytes(), + 0x77777777777777777777777777777777_u128.to_le_bytes(), + 0x88888888888888888888888888888888_u128.to_le_bytes(), + 0x99999999999999999999999999999999_u128.to_le_bytes(), + 0xAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA_u128.to_le_bytes(), + 0xBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB_u128.to_le_bytes(), + 0xCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC_u128.to_le_bytes(), + 0xDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD_u128.to_le_bytes(), + 0xEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE_u128.to_le_bytes(), + 
0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF_u128.to_le_bytes(), + 0x12345678123456781234567812345678_u128.to_le_bytes(), + ], + mxcsr: 0x3F80, + fsw: 0x0802, // TOP=1 after 7 pushes (bits 11-13), DE flag from denormal load + // fpr: 80-bit values with 6 bytes padding; may vary between CPU vendors + fpr: fpu.fpr, + // last_opcode: FPU Opcode update varies by CPU (may only update on unmasked exceptions) + last_opcode: fpu.last_opcode, + // last_ip: code is loaded at runtime-determined address + last_ip: fpu.last_ip, + // last_dp: points to stack (rsp) which is runtime-determined + last_dp: fpu.last_dp, + }; + // KVM doesn't preserve mxcsr via fpu() + #[cfg(kvm)] + if available_hv == HypervisorType::Kvm { + expected_dirty.mxcsr = fpu.mxcsr; + } + assert_eq!(fpu, expected_dirty); + + // Verify MXCSR via xsave on KVM (since fpu() doesn't return it) + #[cfg(kvm)] + if available_hv == HypervisorType::Kvm { + let xsave = hyperlight_vm.vm.xsave().unwrap(); + let mxcsr = u32::from_le_bytes(xsave[24..28].try_into().unwrap()); + assert_eq!(mxcsr, 0x3F80, "MXCSR in XSAVE should be dirty"); + } + + // Reset vcpu + hyperlight_vm.reset_vcpu().unwrap(); + + // Check FPU is reset to defaults + #[cfg_attr(not(kvm), allow(unused_mut))] + let mut fpu = hyperlight_vm.vm.fpu().unwrap(); + // KVM doesn't preserve mxcsr via fpu(), set to expected default + #[cfg(kvm)] + if available_hv == HypervisorType::Kvm { + fpu.mxcsr = MXCSR_DEFAULT; + } + assert_eq!(fpu, CommonFpu::default()); + + // Verify MXCSR via xsave on KVM + #[cfg(kvm)] + if available_hv == HypervisorType::Kvm { + let xsave = hyperlight_vm.vm.xsave().unwrap(); + let mxcsr = u32::from_le_bytes(xsave[24..28].try_into().unwrap()); + assert_eq!(mxcsr, MXCSR_DEFAULT, "MXCSR in XSAVE should be reset"); + } + } + + #[test] + fn reset_vcpu_debug_regs() { + // Code that sets debug registers and halts + // In real mode (ring 0), we can access debug registers directly + #[rustfmt::skip] + let code: &[u8] = &[ + 0x48, 0xb8, 0x00, 0x00, 0x00, 0x00, 
0xef, 0xbe, 0xad, 0xde, // mov rax, 0xDEADBEEF00000000 + 0x0f, 0x23, 0xc0, // mov dr0, rax + 0x48, 0xb8, 0x01, 0x00, 0x00, 0x00, 0xef, 0xbe, 0xad, 0xde, // mov rax, 0xDEADBEEF00000001 + 0x0f, 0x23, 0xc8, // mov dr1, rax + 0x48, 0xb8, 0x02, 0x00, 0x00, 0x00, 0xef, 0xbe, 0xad, 0xde, // mov rax, 0xDEADBEEF00000002 + 0x0f, 0x23, 0xd0, // mov dr2, rax + 0x48, 0xb8, 0x03, 0x00, 0x00, 0x00, 0xef, 0xbe, 0xad, 0xde, // mov rax, 0xDEADBEEF00000003 + 0x0f, 0x23, 0xd8, // mov dr3, rax + 0x48, 0xc7, 0xc0, 0x01, 0x00, 0x00, 0x00, // mov rax, 1 + 0x0f, 0x23, 0xf0, // mov dr6, rax + 0x48, 0xc7, 0xc0, 0xff, 0x00, 0x00, 0x00, // mov rax, 0xFF + 0x0f, 0x23, 0xf8, // mov dr7, rax + 0xf4, // hlt + ]; + + let hyperlight_vm = hyperlight_vm(code).unwrap(); + + // Verify debug registers are dirty + let debug_regs = hyperlight_vm.vm.debug_regs().unwrap(); + let expected_dirty = CommonDebugRegs { + dr0: 0xDEAD_BEEF_0000_0000, + dr1: 0xDEAD_BEEF_0000_0001, + dr2: 0xDEAD_BEEF_0000_0002, + dr3: 0xDEAD_BEEF_0000_0003, + // dr6: guest set B0 (bit 0) = 1, reserved bits vary by CPU + dr6: (debug_regs.dr6 & !DR6_WRITABLE_MASK) | 0x1, + // dr7: guest set lower byte = 0xFF, reserved bits vary by CPU + dr7: (debug_regs.dr7 & !DR7_WRITABLE_MASK) | 0xFF, + }; + assert_eq!(debug_regs, expected_dirty); + + // Reset vcpu + hyperlight_vm.reset_vcpu().unwrap(); + + // Check debug registers are reset to default values + let debug_regs = hyperlight_vm.vm.debug_regs().unwrap(); + let expected_reset = CommonDebugRegs { + dr0: 0, + dr1: 0, + dr2: 0, + dr3: 0, + // dr6: reserved bits preserved, writable bits (B0-B3, BD, BS, BT) cleared + dr6: debug_regs.dr6 & !DR6_WRITABLE_MASK, + // dr7: reserved bits preserved, writable bits cleared + dr7: debug_regs.dr7 & !DR7_WRITABLE_MASK, + }; + assert_eq!(debug_regs, expected_reset); + } + + #[test] + fn reset_vcpu_sregs() { + // Code that modifies special registers and halts + // We can modify CR0.WP, CR2, CR4.TSD, and CR8 from guest code in ring 0 + #[rustfmt::skip] + let 
code: &[u8] = &[ + // Set CR0.WP (Write Protect, bit 16) + 0x0f, 0x20, 0xc0, // mov rax, cr0 + 0x48, 0x0d, 0x00, 0x00, 0x01, 0x00, // or rax, 0x10000 + 0x0f, 0x22, 0xc0, // mov cr0, rax + // Set CR2 + 0x48, 0xb8, 0xef, 0xbe, 0xad, 0xde, 0x00, 0x00, 0x00, 0x00, // mov rax, 0xDEADBEEF + 0x0f, 0x22, 0xd0, // mov cr2, rax + // Set CR4.TSD (Time Stamp Disable, bit 2) + 0x0f, 0x20, 0xe0, // mov rax, cr4 + 0x48, 0x83, 0xc8, 0x04, // or rax, 0x4 + 0x0f, 0x22, 0xe0, // mov cr4, rax + // Set CR8 + 0x48, 0xc7, 0xc0, 0x05, 0x00, 0x00, 0x00, // mov rax, 5 + 0x44, 0x0f, 0x22, 0xc0, // mov cr8, rax + 0xf4, // hlt + ]; + + let hyperlight_vm = hyperlight_vm(code).unwrap(); + + // Get the expected defaults + let defaults = CommonSpecialRegisters::standard_64bit_defaults(hyperlight_vm.pml4_addr); + + // Verify registers are dirty (CR0.WP, CR2, CR4.TSD and CR8 modified by our code) + let sregs = hyperlight_vm.vm.sregs().unwrap(); + let mut expected_dirty = CommonSpecialRegisters { + cr0: defaults.cr0 | 0x10000, // WP bit set + cr2: 0xDEADBEEF, + cr4: defaults.cr4 | 0x4, // TSD bit set + cr8: 0x5, + ..defaults + }; + // ss.db (stack segment default size) may differ by hypervisor; ignored in 64-bit mode + expected_dirty.ss.db = sregs.ss.db; + // unusable and type_ for non-present segments are hypervisor implementation details + // KVM returns type_=1, WHP returns type_=0 for non-present segments + expected_dirty.cs.unusable = sregs.cs.unusable; + expected_dirty.ds.unusable = sregs.ds.unusable; + expected_dirty.ds.type_ = sregs.ds.type_; + expected_dirty.es.unusable = sregs.es.unusable; + expected_dirty.es.type_ = sregs.es.type_; + expected_dirty.fs.unusable = sregs.fs.unusable; + expected_dirty.fs.type_ = sregs.fs.type_; + expected_dirty.gs.unusable = sregs.gs.unusable; + expected_dirty.gs.type_ = sregs.gs.type_; + expected_dirty.ss.unusable = sregs.ss.unusable; + expected_dirty.ss.type_ = sregs.ss.type_; + expected_dirty.tr.unusable = sregs.tr.unusable; + expected_dirty.ldt.unusable = 
sregs.ldt.unusable; + assert_eq!(sregs, expected_dirty); + + // Reset vcpu + hyperlight_vm.reset_vcpu().unwrap(); + + // Check registers are reset to defaults + let sregs = hyperlight_vm.vm.sregs().unwrap(); + let mut expected_reset = defaults; + // ss.db (stack segment default size) may differ by hypervisor; ignored in 64-bit mode + expected_reset.ss.db = sregs.ss.db; + // unusable and type_ for non-present segments are hypervisor implementation details + // KVM returns type_=1, WHP returns type_=0 for non-present segments + expected_reset.cs.unusable = sregs.cs.unusable; + expected_reset.ds.unusable = sregs.ds.unusable; + expected_reset.ds.type_ = sregs.ds.type_; + expected_reset.es.unusable = sregs.es.unusable; + expected_reset.es.type_ = sregs.es.type_; + expected_reset.fs.unusable = sregs.fs.unusable; + expected_reset.fs.type_ = sregs.fs.type_; + expected_reset.gs.unusable = sregs.gs.unusable; + expected_reset.gs.type_ = sregs.gs.type_; + expected_reset.ss.unusable = sregs.ss.unusable; + expected_reset.ss.type_ = sregs.ss.type_; + expected_reset.tr.unusable = sregs.tr.unusable; + expected_reset.ldt.unusable = sregs.ldt.unusable; + assert_eq!(sregs, expected_reset); + } + } +} diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 8c6feb351..a66960766 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -21,7 +21,7 @@ use std::sync::LazyLock; #[cfg(gdb)] use mshv_bindings::{DebugRegisters, hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT}; use mshv_bindings::{ - FloatingPointUnit, SpecialRegisters, StandardRegisters, hv_message_type, + FloatingPointUnit, SpecialRegisters, StandardRegisters, XSave, hv_message_type, hv_message_type_HVMSG_GPA_INTERCEPT, hv_message_type_HVMSG_UNMAPPED_GPA, hv_message_type_HVMSG_X64_HALT, hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT, 
hv_partition_property_code_HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES, @@ -33,6 +33,7 @@ use tracing::{Span, instrument}; #[cfg(gdb)] use crate::hypervisor::gdb::DebuggableVm; +use crate::hypervisor::regs::{CommonDebugRegs, FP_CONTROL_WORD_DEFAULT, MXCSR_DEFAULT}; use crate::hypervisor::{HyperlightExit, Hypervisor}; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags}; use crate::{Result, new_error}; @@ -208,11 +209,72 @@ impl Hypervisor for MshvVm { Ok(()) } - #[cfg(crashdump)] + fn debug_regs(&self) -> Result { + let debug_regs = self.vcpu_fd.get_debug_regs()?; + Ok(debug_regs.into()) + } + + fn set_debug_regs(&self, drs: &CommonDebugRegs) -> Result<()> { + let mshv_debug_regs = drs.into(); + self.vcpu_fd.set_debug_regs(&mshv_debug_regs)?; + Ok(()) + } + fn xsave(&self) -> Result> { let xsave = self.vcpu_fd.get_xsave()?; Ok(xsave.buffer.to_vec()) } + + fn reset_xsave(&self) -> Result<()> { + let current_xsave = self.vcpu_fd.get_xsave()?; + if current_xsave.buffer.len() < 576 { + // Minimum: 512 legacy + 64 header + return Err(new_error!( + "Unexpected xsave length {}", + current_xsave.buffer.len() + )); + } + + let mut buf = XSave::default(); // default is zeroed 4KB buffer + + // Copy XCOMP_BV (offset 520-527) - preserves feature mask + compacted bit + buf.buffer[520..528].copy_from_slice(¤t_xsave.buffer[520..528]); + + // XSAVE area layout from Intel SDM Vol. 
1 Section 13.4.1: + // - Bytes 0-1: FCW (x87 FPU Control Word) + // - Bytes 24-27: MXCSR + buf.buffer[0..2].copy_from_slice(&FP_CONTROL_WORD_DEFAULT.to_le_bytes()); + buf.buffer[24..28].copy_from_slice(&MXCSR_DEFAULT.to_le_bytes()); + + self.vcpu_fd.set_xsave(&buf)?; + Ok(()) + } + + #[cfg(test)] + #[cfg(feature = "init-paging")] + fn set_xsave(&self, xsave: &[u32]) -> Result<()> { + const MSHV_XSAVE_SIZE: usize = 4096; + if std::mem::size_of_val(xsave) != MSHV_XSAVE_SIZE { + return Err(new_error!( + "Provided xsave size {} does not match MSHV supported size {}", + std::mem::size_of_val(xsave), + MSHV_XSAVE_SIZE + )); + } + + // Safety: all valid u32 values are 4 valid u8 values + let (prefix, bytes, suffix) = unsafe { xsave.align_to() }; + if !prefix.is_empty() || !suffix.is_empty() { + return Err(new_error!("Invalid xsave buffer alignment")); + } + let buf = XSave { + buffer: bytes + .try_into() + .map_err(|_| new_error!("mshv xsave must be 4096 u8s"))?, + }; + self.vcpu_fd.set_xsave(&buf)?; + Ok(()) + } } #[cfg(gdb)] diff --git a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs index cc6876a5b..e8e880849 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs @@ -24,8 +24,9 @@ use windows::core::s; use windows_result::HRESULT; use super::regs::{ - Align16, WHP_FPU_NAMES, WHP_FPU_NAMES_LEN, WHP_REGS_NAMES, WHP_REGS_NAMES_LEN, WHP_SREGS_NAMES, - WHP_SREGS_NAMES_LEN, + Align16, FP_CONTROL_WORD_DEFAULT, MXCSR_DEFAULT, WHP_DEBUG_REGS_NAMES, + WHP_DEBUG_REGS_NAMES_LEN, WHP_FPU_NAMES, WHP_FPU_NAMES_LEN, WHP_REGS_NAMES, WHP_REGS_NAMES_LEN, + WHP_SREGS_NAMES, WHP_SREGS_NAMES_LEN, }; use super::surrogate_process::SurrogateProcess; use super::surrogate_process_manager::get_surrogate_process_manager; @@ -33,9 +34,9 @@ use super::wrappers::HandleWrapper; #[cfg(gdb)] use crate::hypervisor::gdb::DebuggableVm; use crate::hypervisor::regs::{CommonFpu, 
CommonRegisters, CommonSpecialRegisters}; -use crate::hypervisor::{HyperlightExit, Hypervisor}; +use crate::hypervisor::{CommonDebugRegs, HyperlightExit, Hypervisor}; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags}; -use crate::{Result, log_then_return, new_error}; +use crate::{HyperlightError, Result, log_then_return, new_error}; #[allow(dead_code)] // Will be used for runtime hypervisor detection pub(crate) fn is_hypervisor_present() -> bool { @@ -387,10 +388,42 @@ impl Hypervisor for WhpVm { Ok(()) } - #[cfg(crashdump)] - fn xsave(&self) -> Result> { - use crate::HyperlightError; + fn debug_regs(&self) -> Result { + let mut whp_debug_regs_values: [Align16; WHP_DEBUG_REGS_NAMES_LEN] = + unsafe { std::mem::zeroed() }; + + unsafe { + WHvGetVirtualProcessorRegisters( + self.partition, + 0, + WHP_DEBUG_REGS_NAMES.as_ptr(), + whp_debug_regs_values.len() as u32, + whp_debug_regs_values.as_mut_ptr() as *mut WHV_REGISTER_VALUE, + )?; + } + + WHP_DEBUG_REGS_NAMES + .into_iter() + .zip(whp_debug_regs_values) + .collect::)>>() + .as_slice() + .try_into() + .map_err(|e| { + new_error!( + "Failed to convert WHP registers to CommonDebugRegs: {:?}", + e + ) + }) + } + fn set_debug_regs(&self, drs: &CommonDebugRegs) -> Result<()> { + let whp_regs: [(WHV_REGISTER_NAME, Align16); WHP_DEBUG_REGS_NAMES_LEN] = + drs.into(); + self.set_registers(&whp_regs)?; + Ok(()) + } + + fn xsave(&self) -> Result> { // Get the required buffer size by calling with NULL buffer. // If the buffer is not large enough (0 won't be), WHvGetVirtualProcessorXsaveState returns // WHV_E_INSUFFICIENT_BUFFER and sets buffer_size_needed to the required size. @@ -440,6 +473,136 @@ impl Hypervisor for WhpVm { Ok(xsave_buffer) } + fn reset_xsave(&self) -> Result<()> { + // WHP uses compacted XSAVE format (bit 63 of XCOMP_BV set). + // We cannot just zero out the xsave area - we need to preserve the XCOMP_BV. + + // Get the required buffer size by calling with NULL buffer. 
+ let mut buffer_size_needed: u32 = 0; + + let result = unsafe { + WHvGetVirtualProcessorXsaveState( + self.partition, + 0, + std::ptr::null_mut(), + 0, + &mut buffer_size_needed, + ) + }; + + // Expect insufficient buffer error; any other error is unexpected + if let Err(e) = result + && e.code() != windows::Win32::Foundation::WHV_E_INSUFFICIENT_BUFFER + { + return Err(HyperlightError::WindowsAPIError(e)); + } + + // Minimum: 512 legacy + 64 header = 576 bytes + if buffer_size_needed < 576 { + return Err(new_error!("Unexpected xsave length {}", buffer_size_needed)); + } + + // Create a buffer to hold the current state (to get the correct XCOMP_BV) + let mut current_state = vec![0u8; buffer_size_needed as usize]; + let mut written_bytes = 0; + unsafe { + WHvGetVirtualProcessorXsaveState( + self.partition, + 0, + current_state.as_mut_ptr() as *mut std::ffi::c_void, + buffer_size_needed, + &mut written_bytes, + ) + }?; + + // Setting XSTATE_BV=0 resets all XSAVE-managed registers to their architectural + // init values (FCW=0x037F, MXCSR=0x1F80, XMM/YMM/ZMM=0, etc.). From the guest's + // perspective, these registers will contain the init values when execution + // resumes. The hypervisor updates its internal vCPU state immediately, so a + // subsequent xsave() call also returns the init values. + // + // When XSTATE_BV=0, the hypervisor ignores the legacy and extended region bytes + // entirely and it uses the architectural init values regardless of buffer contents. + // + // We zero these regions anyway as a defensive measure, in case of hypervisor + // bugs or implementation differences. This is probably unnecessary. + // + // - Legacy region (0-512): x87 FPU + SSE state + // - XSTATE_BV (512-520): Feature bitmap (0 = use init values) + // - XCOMP_BV (520-528): Compaction bitmap + format bit (KEEP) + // - Reserved (528-576): Header padding + // - Extended (576+): AVX, AVX-512, MPX, PKRU, AMX, etc. 
+ current_state[0..520].fill(0); + current_state[528..].fill(0); + + // XSAVE area layout from Intel SDM Vol. 1 Section 13.4.1: + // - Bytes 0-1: FCW (x87 FPU Control Word) + // - Bytes 24-27: MXCSR + current_state[0..2].copy_from_slice(&FP_CONTROL_WORD_DEFAULT.to_le_bytes()); + current_state[24..28].copy_from_slice(&MXCSR_DEFAULT.to_le_bytes()); + + unsafe { + WHvSetVirtualProcessorXsaveState( + self.partition, + 0, + current_state.as_ptr() as *const std::ffi::c_void, + buffer_size_needed, + ) + .map_err(|e| new_error!("Failed to reset Xsave state: {:?}", e))?; + } + + Ok(()) + } + + #[cfg(test)] + #[cfg(feature = "init-paging")] + fn set_xsave(&self, xsave: &[u32]) -> Result<()> { + use crate::HyperlightError; + + // Get the required buffer size by calling with NULL buffer. + // If the buffer is not large enough (0 won't be), WHvGetVirtualProcessorXsaveState returns + // WHV_E_INSUFFICIENT_BUFFER and sets buffer_size_needed to the required size. + let mut buffer_size_needed: u32 = 0; + + let result = unsafe { + WHvGetVirtualProcessorXsaveState( + self.partition, + 0, + std::ptr::null_mut(), + 0, + &mut buffer_size_needed, + ) + }; + + // Expect insufficient buffer error; any other error is unexpected + if let Err(e) = result + && e.code() != windows::Win32::Foundation::WHV_E_INSUFFICIENT_BUFFER + { + return Err(HyperlightError::WindowsAPIError(e)); + } + + let provided_size = std::mem::size_of_val(xsave) as u32; + if buffer_size_needed > provided_size { + return Err(new_error!( + "Xsave buffer too small: needed {} bytes, provided {} bytes", + buffer_size_needed, + provided_size + )); + } + + unsafe { + WHvSetVirtualProcessorXsaveState( + self.partition, + 0, + xsave.as_ptr() as *const std::ffi::c_void, + buffer_size_needed, + ) + .map_err(|e| new_error!("Failed to set Xsave state: {:?}", e))?; + } + + Ok(()) + } + /// Mark that initial memory setup is complete. After this, map_memory will fail. 
fn complete_initial_memory_setup(&mut self) { self.initial_memory_setup_done = true; @@ -529,137 +692,46 @@ impl DebuggableVm for WhpVm { use crate::hypervisor::gdb::arch::MAX_NO_OF_HW_BP; // Get current debug registers - const LEN: usize = 6; - let names: [WHV_REGISTER_NAME; LEN] = [ - WHvX64RegisterDr0, - WHvX64RegisterDr1, - WHvX64RegisterDr2, - WHvX64RegisterDr3, - WHvX64RegisterDr6, - WHvX64RegisterDr7, - ]; - - let mut out: [Align16; LEN] = unsafe { std::mem::zeroed() }; - unsafe { - WHvGetVirtualProcessorRegisters( - self.partition, - 0, - names.as_ptr(), - LEN as u32, - out.as_mut_ptr() as *mut WHV_REGISTER_VALUE, - )?; - } - - let mut dr0 = unsafe { out[0].0.Reg64 }; - let mut dr1 = unsafe { out[1].0.Reg64 }; - let mut dr2 = unsafe { out[2].0.Reg64 }; - let mut dr3 = unsafe { out[3].0.Reg64 }; - let mut dr7 = unsafe { out[5].0.Reg64 }; + let mut regs = self.debug_regs()?; // Check if breakpoint already exists - if [dr0, dr1, dr2, dr3].contains(&addr) { + if [regs.dr0, regs.dr1, regs.dr2, regs.dr3].contains(&addr) { return Ok(()); } // Find the first available LOCAL (L0–L3) slot let i = (0..MAX_NO_OF_HW_BP) - .position(|i| dr7 & (1 << (i * 2)) == 0) + .position(|i| regs.dr7 & (1 << (i * 2)) == 0) .ok_or_else(|| new_error!("Tried to add more than 4 hardware breakpoints"))?; // Assign to corresponding debug register - *[&mut dr0, &mut dr1, &mut dr2, &mut dr3][i] = addr; + *[&mut regs.dr0, &mut regs.dr1, &mut regs.dr2, &mut regs.dr3][i] = addr; // Enable LOCAL bit - dr7 |= 1 << (i * 2); + regs.dr7 |= 1 << (i * 2); - // Set the debug registers - let registers = vec![ - ( - WHvX64RegisterDr0, - Align16(WHV_REGISTER_VALUE { Reg64: dr0 }), - ), - ( - WHvX64RegisterDr1, - Align16(WHV_REGISTER_VALUE { Reg64: dr1 }), - ), - ( - WHvX64RegisterDr2, - Align16(WHV_REGISTER_VALUE { Reg64: dr2 }), - ), - ( - WHvX64RegisterDr3, - Align16(WHV_REGISTER_VALUE { Reg64: dr3 }), - ), - ( - WHvX64RegisterDr7, - Align16(WHV_REGISTER_VALUE { Reg64: dr7 }), - ), - ]; - 
self.set_registers(®isters)?; + self.set_debug_regs(®s)?; Ok(()) } fn remove_hw_breakpoint(&mut self, addr: u64) -> Result<()> { // Get current debug registers - const LEN: usize = 6; - let names: [WHV_REGISTER_NAME; LEN] = [ - WHvX64RegisterDr0, - WHvX64RegisterDr1, - WHvX64RegisterDr2, - WHvX64RegisterDr3, - WHvX64RegisterDr6, - WHvX64RegisterDr7, - ]; + let mut debug_regs = self.debug_regs()?; - let mut out: [Align16; LEN] = unsafe { std::mem::zeroed() }; - unsafe { - WHvGetVirtualProcessorRegisters( - self.partition, - 0, - names.as_ptr(), - LEN as u32, - out.as_mut_ptr() as *mut WHV_REGISTER_VALUE, - )?; - } - - let mut dr0 = unsafe { out[0].0.Reg64 }; - let mut dr1 = unsafe { out[1].0.Reg64 }; - let mut dr2 = unsafe { out[2].0.Reg64 }; - let mut dr3 = unsafe { out[3].0.Reg64 }; - let mut dr7 = unsafe { out[5].0.Reg64 }; - - let regs = [&mut dr0, &mut dr1, &mut dr2, &mut dr3]; + let regs = [ + &mut debug_regs.dr0, + &mut debug_regs.dr1, + &mut debug_regs.dr2, + &mut debug_regs.dr3, + ]; if let Some(i) = regs.iter().position(|&&mut reg| reg == addr) { // Clear the address *regs[i] = 0; // Disable LOCAL bit - dr7 &= !(1 << (i * 2)); - - // Set the debug registers - let registers = vec![ - ( - WHvX64RegisterDr0, - Align16(WHV_REGISTER_VALUE { Reg64: dr0 }), - ), - ( - WHvX64RegisterDr1, - Align16(WHV_REGISTER_VALUE { Reg64: dr1 }), - ), - ( - WHvX64RegisterDr2, - Align16(WHV_REGISTER_VALUE { Reg64: dr2 }), - ), - ( - WHvX64RegisterDr3, - Align16(WHV_REGISTER_VALUE { Reg64: dr3 }), - ), - ( - WHvX64RegisterDr7, - Align16(WHV_REGISTER_VALUE { Reg64: dr7 }), - ), - ]; - self.set_registers(®isters)?; + debug_regs.dr7 &= !(1 << (i * 2)); + + self.set_debug_regs(&debug_regs)?; Ok(()) } else { Err(new_error!("Tried to remove non-existing hw-breakpoint")) diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 037b60fc1..7771448e5 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ 
b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -18,13 +18,16 @@ use std::sync::LazyLock; #[cfg(gdb)] use kvm_bindings::kvm_guest_debug; -use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs, kvm_userspace_memory_region}; +use kvm_bindings::{ + kvm_debugregs, kvm_fpu, kvm_regs, kvm_sregs, kvm_userspace_memory_region, kvm_xsave, +}; use kvm_ioctls::Cap::UserMemory; use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd}; use tracing::{Span, instrument}; #[cfg(gdb)] use crate::hypervisor::gdb::DebuggableVm; +use crate::hypervisor::regs::{CommonDebugRegs, FP_CONTROL_WORD_DEFAULT, MXCSR_DEFAULT}; use crate::hypervisor::{HyperlightExit, Hypervisor}; use crate::mem::memory_region::MemoryRegion; use crate::{Result, new_error}; @@ -142,12 +145,16 @@ impl Hypervisor for KvmVm { } fn fpu(&self) -> Result { + // Note: On KVM this ignores MXCSR. + // See https://github.com/torvalds/linux/blob/d358e5254674b70f34c847715ca509e46eb81e6f/arch/x86/kvm/x86.c#L12554-L12599 let kvm_fpu = self.vcpu_fd.get_fpu()?; Ok((&kvm_fpu).into()) } fn set_fpu(&self, fpu: &super::regs::CommonFpu) -> Result<()> { let kvm_fpu: kvm_fpu = fpu.into(); + // Note: On KVM this ignores MXCSR. 
+ // See https://github.com/torvalds/linux/blob/d358e5254674b70f34c847715ca509e46eb81e6f/arch/x86/kvm/x86.c#L12554-L12599 self.vcpu_fd.set_fpu(&kvm_fpu)?; Ok(()) } @@ -163,7 +170,17 @@ impl Hypervisor for KvmVm { Ok(()) } - #[cfg(crashdump)] + fn debug_regs(&self) -> Result { + let kvm_debug_regs = self.vcpu_fd.get_debug_regs()?; + Ok(kvm_debug_regs.into()) + } + + fn set_debug_regs(&self, drs: &CommonDebugRegs) -> Result<()> { + let kvm_debug_regs: kvm_debugregs = drs.into(); + self.vcpu_fd.set_debug_regs(&kvm_debug_regs)?; + Ok(()) + } + fn xsave(&self) -> Result> { let xsave = self.vcpu_fd.get_xsave()?; Ok(xsave @@ -172,6 +189,46 @@ impl Hypervisor for KvmVm { .flat_map(u32::to_le_bytes) .collect()) } + + fn reset_xsave(&self) -> Result<()> { + let mut xsave = kvm_xsave::default(); // default is zeroed 4KB buffer with no FAM + + // XSAVE area layout from Intel SDM Vol. 1 Section 13.4.1: + // - Bytes 0-1: FCW (x87 FPU Control Word) + // - Bytes 24-27: MXCSR + xsave.region[0] = FP_CONTROL_WORD_DEFAULT as u32; + xsave.region[6] = MXCSR_DEFAULT; + + // SAFETY: No dynamic features enabled, 4KB is sufficient + unsafe { self.vcpu_fd.set_xsave(&xsave)? }; + + Ok(()) + } + + #[cfg(test)] + #[cfg(feature = "init-paging")] + fn set_xsave(&self, xsave: &[u32]) -> Result<()> { + const KVM_XSAVE_SIZE: usize = 4096; + + if std::mem::size_of_val(xsave) != KVM_XSAVE_SIZE { + return Err(new_error!( + "Provided xsave size {} does not match KVM supported size {}", + std::mem::size_of_val(xsave), + KVM_XSAVE_SIZE + )); + } + let xsave = kvm_xsave { + region: xsave + .try_into() + .map_err(|_| new_error!("kvm xsave must be 1024 u32s"))?, + ..Default::default() + }; + // Safety: Safe because we only copy 4096 bytes + // and have not enabled any dynamic xsave features + unsafe { self.vcpu_fd.set_xsave(&xsave)? 
}; + + Ok(()) + } } #[cfg(gdb)] diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index 5cdd74b48..ae9bade9b 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -17,7 +17,9 @@ limitations under the License. use log::LevelFilter; use crate::Result; -use crate::hypervisor::regs::{CommonFpu, CommonRegisters, CommonSpecialRegisters}; +use crate::hypervisor::regs::{ + CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, +}; use crate::mem::memory_region::MemoryRegion; /// HyperV-on-linux functionality @@ -124,9 +126,22 @@ pub(crate) trait Hypervisor: Debug + Send { /// Set special regs fn set_sregs(&self, sregs: &CommonSpecialRegisters) -> Result<()>; - /// xsave - #[cfg(crashdump)] + /// Get xsave + #[allow(dead_code)] fn xsave(&self) -> Result>; + /// Reset xsave to default state + fn reset_xsave(&self) -> Result<()>; + /// Set xsave - only used for tests. + /// Note: On MSHV/WHP, XCOMP_BV must be preserved. + #[cfg(test)] + #[cfg(feature = "init-paging")] + fn set_xsave(&self, xsave: &[u32]) -> Result<()>; + + /// Get the debug registers of the vCPU + #[allow(dead_code)] + fn debug_regs(&self) -> Result; + /// Set the debug registers of the vCPU + fn set_debug_regs(&self, drs: &CommonDebugRegs) -> Result<()>; /// Get partition handle #[cfg(target_os = "windows")] diff --git a/src/hyperlight_host/src/hypervisor/regs.rs b/src/hyperlight_host/src/hypervisor/regs.rs index d29edf4bf..5d940ba69 100644 --- a/src/hyperlight_host/src/hypervisor/regs.rs +++ b/src/hyperlight_host/src/hypervisor/regs.rs @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ +mod debug_regs; mod fpu; mod special_regs; mod standard_regs; @@ -21,6 +22,7 @@ mod standard_regs; #[cfg(target_os = "windows")] use std::collections::HashSet; +pub(crate) use debug_regs::*; pub(crate) use fpu::*; pub(crate) use special_regs::*; pub(crate) use standard_regs::*; diff --git a/src/hyperlight_host/src/hypervisor/regs/debug_regs.rs b/src/hyperlight_host/src/hypervisor/regs/debug_regs.rs new file mode 100644 index 000000000..5b2127883 --- /dev/null +++ b/src/hyperlight_host/src/hypervisor/regs/debug_regs.rs @@ -0,0 +1,269 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+#[cfg(kvm)]
+use kvm_bindings::kvm_debugregs;
+#[cfg(mshv3)]
+use mshv_bindings::DebugRegisters;
+
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub(crate) struct CommonDebugRegs {
+    pub dr0: u64,
+    pub dr1: u64,
+    pub dr2: u64,
+    pub dr3: u64,
+    pub dr6: u64,
+    pub dr7: u64,
+}
+
+#[cfg(kvm)]
+impl From<kvm_debugregs> for CommonDebugRegs {
+    fn from(kvm_regs: kvm_debugregs) -> Self {
+        Self {
+            dr0: kvm_regs.db[0],
+            dr1: kvm_regs.db[1],
+            dr2: kvm_regs.db[2],
+            dr3: kvm_regs.db[3],
+            dr6: kvm_regs.dr6,
+            dr7: kvm_regs.dr7,
+        }
+    }
+}
+#[cfg(kvm)]
+impl From<&CommonDebugRegs> for kvm_debugregs {
+    fn from(common_regs: &CommonDebugRegs) -> Self {
+        kvm_debugregs {
+            db: [
+                common_regs.dr0,
+                common_regs.dr1,
+                common_regs.dr2,
+                common_regs.dr3,
+            ],
+            dr6: common_regs.dr6,
+            dr7: common_regs.dr7,
+            ..Default::default()
+        }
+    }
+}
+#[cfg(mshv3)]
+impl From<DebugRegisters> for CommonDebugRegs {
+    fn from(mshv_regs: DebugRegisters) -> Self {
+        Self {
+            dr0: mshv_regs.dr0,
+            dr1: mshv_regs.dr1,
+            dr2: mshv_regs.dr2,
+            dr3: mshv_regs.dr3,
+            dr6: mshv_regs.dr6,
+            dr7: mshv_regs.dr7,
+        }
+    }
+}
+#[cfg(mshv3)]
+impl From<&CommonDebugRegs> for DebugRegisters {
+    fn from(common_regs: &CommonDebugRegs) -> Self {
+        DebugRegisters {
+            dr0: common_regs.dr0,
+            dr1: common_regs.dr1,
+            dr2: common_regs.dr2,
+            dr3: common_regs.dr3,
+            dr6: common_regs.dr6,
+            dr7: common_regs.dr7,
+        }
+    }
+}
+
+#[cfg(target_os = "windows")]
+use windows::Win32::System::Hypervisor::*;
+
+#[cfg(target_os = "windows")]
+impl From<&CommonDebugRegs>
+    for [(WHV_REGISTER_NAME, Align16<WHV_REGISTER_VALUE>); WHP_DEBUG_REGS_NAMES_LEN]
+{
+    fn from(regs: &CommonDebugRegs) -> Self {
+        [
+            (
+                WHvX64RegisterDr0,
+                Align16(WHV_REGISTER_VALUE { Reg64: regs.dr0 }),
+            ),
+            (
+                WHvX64RegisterDr1,
+                Align16(WHV_REGISTER_VALUE { Reg64: regs.dr1 }),
+            ),
+            (
+                WHvX64RegisterDr2,
+                Align16(WHV_REGISTER_VALUE { Reg64: regs.dr2 }),
+            ),
+            (
+                WHvX64RegisterDr3,
+                Align16(WHV_REGISTER_VALUE { Reg64: regs.dr3 }),
+            ),
+            (
+                WHvX64RegisterDr6,
+                Align16(WHV_REGISTER_VALUE { Reg64: regs.dr6 }),
+            ),
+            (
+                WHvX64RegisterDr7,
+                Align16(WHV_REGISTER_VALUE { Reg64: regs.dr7 }),
+            ),
+        ]
+    }
+}
+
+#[cfg(target_os = "windows")]
+use std::collections::HashSet;
+
+#[cfg(target_os = "windows")]
+use super::{Align16, FromWhpRegisterError};
+
+#[cfg(target_os = "windows")]
+pub(crate) const WHP_DEBUG_REGS_NAMES_LEN: usize = 6;
+#[cfg(target_os = "windows")]
+pub(crate) const WHP_DEBUG_REGS_NAMES: [WHV_REGISTER_NAME; WHP_DEBUG_REGS_NAMES_LEN] = [
+    WHvX64RegisterDr0,
+    WHvX64RegisterDr1,
+    WHvX64RegisterDr2,
+    WHvX64RegisterDr3,
+    WHvX64RegisterDr6,
+    WHvX64RegisterDr7,
+];
+
+#[cfg(target_os = "windows")]
+impl TryFrom<&[(WHV_REGISTER_NAME, Align16<WHV_REGISTER_VALUE>)]> for CommonDebugRegs {
+    type Error = FromWhpRegisterError;
+
+    #[expect(
+        non_upper_case_globals,
+        reason = "Windows API has lowercase register names"
+    )]
+    fn try_from(
+        regs: &[(WHV_REGISTER_NAME, Align16<WHV_REGISTER_VALUE>)],
+    ) -> Result<Self, Self::Error> {
+        if regs.len() != WHP_DEBUG_REGS_NAMES_LEN {
+            return Err(FromWhpRegisterError::InvalidLength(regs.len()));
+        }
+        let mut registers = CommonDebugRegs::default();
+        let mut seen_registers = HashSet::new();
+
+        for &(name, value) in regs {
+            let name_id = name.0;
+
+            // Check for duplicates
+            if !seen_registers.insert(name_id) {
+                return Err(FromWhpRegisterError::DuplicateRegister(name_id));
+            }
+
+            unsafe {
+                match name {
+                    WHvX64RegisterDr0 => registers.dr0 = value.0.Reg64,
+                    WHvX64RegisterDr1 => registers.dr1 = value.0.Reg64,
+                    WHvX64RegisterDr2 => registers.dr2 = value.0.Reg64,
+                    WHvX64RegisterDr3 => registers.dr3 = value.0.Reg64,
+                    WHvX64RegisterDr6 => registers.dr6 = value.0.Reg64,
+                    WHvX64RegisterDr7 => registers.dr7 = value.0.Reg64,
+                    _ => {
+                        // Given unexpected register
+                        return Err(FromWhpRegisterError::InvalidRegister(name_id));
+                    }
+                }
+            }
+        }
+
+        // Set of all expected register names
+        let expected_registers: HashSet<i32> = WHP_DEBUG_REGS_NAMES
+            .map(|name| name.0)
+            .into_iter()
+            .collect();
+
+        // Technically it should not be possible to have any missing registers at this point
+        // since we are guaranteed to have WHP_DEBUG_REGS_NAMES_LEN (6) non-duplicate registers that have passed the match-arm above, but leaving this here for safety anyway
+        let missing: HashSet<_> = expected_registers
+            .difference(&seen_registers)
+            .cloned()
+            .collect();
+
+        if !missing.is_empty() {
+            return Err(FromWhpRegisterError::MissingRegister(missing));
+        }
+
+        Ok(registers)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn common_debug_regs() -> CommonDebugRegs {
+        CommonDebugRegs {
+            dr0: 1,
+            dr1: 2,
+            dr2: 3,
+            dr3: 4,
+            dr6: 5,
+            dr7: 6,
+        }
+    }
+
+    #[cfg(kvm)]
+    #[test]
+    fn round_trip_kvm_debug_regs() {
+        let original = common_debug_regs();
+        let kvm_regs: kvm_debugregs = (&original).into();
+        let converted: CommonDebugRegs = kvm_regs.into();
+        assert_eq!(original, converted);
+    }
+
+    #[cfg(mshv3)]
+    #[test]
+    fn round_trip_mshv_debug_regs() {
+        let original = common_debug_regs();
+        let mshv_regs: DebugRegisters = (&original).into();
+        let converted: CommonDebugRegs = mshv_regs.into();
+        assert_eq!(original, converted);
+    }
+
+    #[cfg(target_os = "windows")]
+    #[test]
+    fn round_trip_whp_debug_regs() {
+        let original = common_debug_regs();
+        let whp_regs: [(WHV_REGISTER_NAME, Align16<WHV_REGISTER_VALUE>); WHP_DEBUG_REGS_NAMES_LEN] =
+            (&original).into();
+        let converted: CommonDebugRegs = whp_regs.as_ref().try_into().unwrap();
+        assert_eq!(original, converted);
+
+        // test for duplicate register error handling
+        let original = common_debug_regs();
+        let mut whp_regs: [(WHV_REGISTER_NAME, Align16<WHV_REGISTER_VALUE>);
+            WHP_DEBUG_REGS_NAMES_LEN] = (&original).into();
+        whp_regs[0].0 = WHvX64RegisterDr1;
+        let err = CommonDebugRegs::try_from(whp_regs.as_ref()).unwrap_err();
+        assert_eq!(
+            err,
+            FromWhpRegisterError::DuplicateRegister(WHvX64RegisterDr1.0)
+        );
+
+        // test for passing non-standard register (e.g. CR8)
+        let original = common_debug_regs();
+        let mut whp_regs: [(WHV_REGISTER_NAME, Align16<WHV_REGISTER_VALUE>);
+            WHP_DEBUG_REGS_NAMES_LEN] = (&original).into();
+        whp_regs[0].0 = WHvX64RegisterCr8;
+        let err = CommonDebugRegs::try_from(whp_regs.as_ref()).unwrap_err();
+        assert_eq!(
+            err,
+            FromWhpRegisterError::InvalidRegister(WHvX64RegisterCr8.0)
+        );
+    }
+}
diff --git a/src/hyperlight_host/src/hypervisor/regs/fpu.rs b/src/hyperlight_host/src/hypervisor/regs/fpu.rs
index 47ce8a853..93907c6a4 100644
--- a/src/hyperlight_host/src/hypervisor/regs/fpu.rs
+++ b/src/hyperlight_host/src/hypervisor/regs/fpu.rs
@@ -36,13 +36,11 @@ pub(crate) struct CommonFpu {
     pub fcw: u16,
     pub fsw: u16,
     pub ftwx: u8,
-    pub pad1: u8,
     pub last_opcode: u16,
     pub last_ip: u64,
     pub last_dp: u64,
     pub xmm: [[u8; 16]; 16],
     pub mxcsr: u32,
-    pub pad2: u32,
 }
 
 impl Default for CommonFpu {
@@ -52,13 +50,11 @@ impl Default for CommonFpu {
             fcw: FP_CONTROL_WORD_DEFAULT,
             fsw: 0,
             ftwx: 0,
-            pad1: 0,
             last_opcode: 0,
             last_ip: 0,
             last_dp: 0,
             xmm: [[0u8; 16]; 16],
             mxcsr: MXCSR_DEFAULT,
-            pad2: 0,
         }
     }
 }
@@ -71,13 +67,13 @@ impl From<&CommonFpu> for kvm_fpu {
             fcw: common_fpu.fcw,
             fsw: common_fpu.fsw,
             ftwx: common_fpu.ftwx,
-            pad1: common_fpu.pad1,
+            pad1: 0,
             last_opcode: common_fpu.last_opcode,
             last_ip: common_fpu.last_ip,
             last_dp: common_fpu.last_dp,
             xmm: common_fpu.xmm,
             mxcsr: common_fpu.mxcsr,
-            pad2: common_fpu.pad2,
+            pad2: 0,
         }
     }
 }
@@ -90,13 +86,13 @@ impl From<&CommonFpu> for FloatingPointUnit {
             fcw: common_fpu.fcw,
             fsw: common_fpu.fsw,
             ftwx: common_fpu.ftwx,
-            pad1: common_fpu.pad1,
+            pad1: 0,
             last_opcode: common_fpu.last_opcode,
             last_ip: common_fpu.last_ip,
             last_dp: common_fpu.last_dp,
             xmm: common_fpu.xmm,
             mxcsr: common_fpu.mxcsr,
-            pad2: common_fpu.pad2,
+            pad2: 0,
         }
     }
 }
@@ -109,13 +105,11 @@ impl From<&kvm_fpu> for CommonFpu {
             fcw: kvm_fpu.fcw,
             fsw: kvm_fpu.fsw,
             ftwx: kvm_fpu.ftwx,
-            pad1: kvm_fpu.pad1,
             last_opcode: kvm_fpu.last_opcode,
             last_ip: kvm_fpu.last_ip,
             last_dp: kvm_fpu.last_dp,
             xmm: kvm_fpu.xmm,
             mxcsr:
kvm_fpu.mxcsr, - pad2: kvm_fpu.pad2, } } } @@ -128,13 +122,11 @@ impl From<&FloatingPointUnit> for CommonFpu { fcw: mshv_fpu.fcw, fsw: mshv_fpu.fsw, ftwx: mshv_fpu.ftwx, - pad1: mshv_fpu.pad1, last_opcode: mshv_fpu.last_opcode, last_ip: mshv_fpu.last_ip, last_dp: mshv_fpu.last_dp, xmm: mshv_fpu.xmm, mxcsr: mshv_fpu.mxcsr, - pad2: mshv_fpu.pad2, } } } @@ -174,7 +166,7 @@ impl From<&CommonFpu> for [(WHV_REGISTER_NAME, Align16); WHP FpControl: fpu.fcw, FpStatus: fpu.fsw, FpTag: fpu.ftwx, - Reserved: fpu.pad1, + Reserved: 0, LastFpOp: fpu.last_opcode, Anonymous: WHV_X64_FP_CONTROL_STATUS_REGISTER_0_0 { LastFpRip: fpu.last_ip, @@ -293,7 +285,6 @@ impl TryFrom<&[(WHV_REGISTER_NAME, Align16)]> for CommonFpu fpu.fcw = control.FpControl; fpu.fsw = control.FpStatus; fpu.ftwx = control.FpTag; - fpu.pad1 = control.Reserved; fpu.last_opcode = control.LastFpOp; fpu.last_ip = unsafe { control.Anonymous.LastFpRip }; } @@ -355,7 +346,6 @@ mod tests { fcw: 0x1234, fsw: 0x5678, ftwx: 0x9a, - pad1: 0xbc, last_opcode: 0xdef0, last_ip: 0xdeadbeefcafebabe, last_dp: 0xabad1deaf00dbabe, @@ -365,7 +355,6 @@ mod tests { [22u8; 16], [23u8; 16], ], mxcsr: 0x1f80, - pad2: 0, } } diff --git a/src/hyperlight_host/src/mem/layout.rs b/src/hyperlight_host/src/mem/layout.rs index 32d7b5478..b93d1f262 100644 --- a/src/hyperlight_host/src/mem/layout.rs +++ b/src/hyperlight_host/src/mem/layout.rs @@ -252,7 +252,7 @@ impl SandboxMemoryLayout { /// Create a new `SandboxMemoryLayout` with the given /// `SandboxConfiguration`, code size and stack/heap size. 
     #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")]
-    pub(super) fn new(
+    pub(crate) fn new(
         cfg: SandboxConfiguration,
         code_size: usize,
         stack_size: usize,
@@ -461,7 +461,7 @@ impl SandboxMemoryLayout {
     /// get the code offset
     /// This is the offset in the sandbox memory where the code starts
     #[instrument(skip_all, parent = Span::current(), level= "Trace")]
-    pub(super) fn get_guest_code_offset(&self) -> usize {
+    pub(crate) fn get_guest_code_offset(&self) -> usize {
         self.guest_code_offset
     }
 
@@ -526,7 +526,7 @@ impl SandboxMemoryLayout {
     /// Get the total size of guest memory in `self`'s memory
     /// layout aligned to page size boundaries.
    #[instrument(skip_all, parent = Span::current(), level= "Trace")]
-    pub(super) fn get_memory_size(&self) -> Result<usize> {
+    pub(crate) fn get_memory_size(&self) -> Result<usize> {
         let total_memory = self.get_unaligned_memory_size();
 
         // Size should be a multiple of page size.
diff --git a/src/hyperlight_host/src/sandbox/hypervisor.rs b/src/hyperlight_host/src/sandbox/hypervisor.rs
index c72b7efbf..a93bd2ce2 100644
--- a/src/hyperlight_host/src/sandbox/hypervisor.rs
+++ b/src/hyperlight_host/src/sandbox/hypervisor.rs
@@ -66,7 +66,7 @@ pub fn get_available_hypervisor() -> &'static Option<HypervisorType> {
 }
 
 /// The hypervisor types available for the current platform
-#[derive(PartialEq, Eq, Debug)]
+#[derive(PartialEq, Eq, Debug, Copy, Clone)]
 pub(crate) enum HypervisorType {
     #[cfg(kvm)]
     Kvm,
diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs
index dd7bc8dff..6bc34bff0 100644
--- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs
+++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs
@@ -286,6 +286,8 @@ impl MultiUseSandbox {
             unsafe { self.vm.map_region(region)?
}; } + self.vm.reset_vcpu()?; + // The restored snapshot is now our most current snapshot self.snapshot = Some(snapshot.clone()); @@ -1298,6 +1300,41 @@ mod tests { assert_ne!(sandbox3.id, sandbox_id); } + /// Test that snapshot restore properly resets vCPU debug registers. This test verifies + /// that restore() calls reset_vcpu. + #[test] + fn snapshot_restore_resets_debug_registers() { + let mut sandbox: MultiUseSandbox = { + let path = simple_guest_as_string().unwrap(); + let u_sbox = UninitializedSandbox::new(GuestBinary::FilePath(path), None).unwrap(); + u_sbox.evolve().unwrap() + }; + + let snapshot = sandbox.snapshot().unwrap(); + + // Verify DR0 is initially 0 (clean state) + let dr0_initial: u64 = sandbox.call("GetDr0", ()).unwrap(); + assert_eq!(dr0_initial, 0, "DR0 should initially be 0"); + + // Dirty DR0 by setting it to a known non-zero value + const DIRTY_VALUE: u64 = 0xDEAD_BEEF_CAFE_BABE; + sandbox.call::<()>("SetDr0", DIRTY_VALUE).unwrap(); + let dr0_dirty: u64 = sandbox.call("GetDr0", ()).unwrap(); + assert_eq!( + dr0_dirty, DIRTY_VALUE, + "DR0 should be dirty after SetDr0 call" + ); + + // Restore to the snapshot - this should reset vCPU state including debug registers + sandbox.restore(snapshot).unwrap(); + + let dr0_after_restore: u64 = sandbox.call("GetDr0", ()).unwrap(); + assert_eq!( + dr0_after_restore, 0, + "DR0 should be 0 after restore (reset_vcpu should have been called)" + ); + } + /// Test that sandboxes can be created and evolved with different heap sizes #[test] fn test_sandbox_creation_various_sizes() { diff --git a/src/tests/rust_guests/simpleguest/src/main.rs b/src/tests/rust_guests/simpleguest/src/main.rs index 1cef4a858..fea642d57 100644 --- a/src/tests/rust_guests/simpleguest/src/main.rs +++ b/src/tests/rust_guests/simpleguest/src/main.rs @@ -544,6 +544,18 @@ fn use_sse2_registers() { unsafe { core::arch::asm!("movss xmm1, DWORD PTR [{0}]", in(reg) &val) }; } +#[guest_function("SetDr0")] +fn set_dr0(value: u64) { + unsafe { 
core::arch::asm!("mov dr0, {}", in(reg) value) }; +} + +#[guest_function("GetDr0")] +fn get_dr0() -> u64 { + let value: u64; + unsafe { core::arch::asm!("mov {}, dr0", out(reg) value) }; + value +} + #[guest_function("Add")] fn add(a: i32, b: i32) -> Result { #[host_function("HostAdd")]