From 2557909463771e2623821aafa3c28358f9f361cb Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 18 May 2026 13:26:43 +0100 Subject: [PATCH 001/178] vmm: Revert "vmm: create memfd for private mappings" This reverts commit ced3762a67b69d66cda5b929500e5bcf266c4217. This change led to a serious memory regression when not using hugepages or shared=on. `MAP_PRIVATE` creates an anonymous memory allocation for every page written when the backing store is a file. This CoW behaviour is useful but leads to double allocations when the backing store is an empty file created by `memfd_create()`. When the page is written to, the CoW semantics require a real page to be created in the memory for the memfd (previously, before the pages were touched, they would all point to the zero page). This real page is filled with zeroes because in theory this page would be accessible via read/write syscalls on the FD even though in our implementation it is only ever `mmap()`ed. The intention of the commit was to enable `fallocate()` to be used to punch holes but that would only affect the inaccessible backing page and the page in the CoW anonymous memory would be unaffected, so the commit was unlikely to have the desired effect. Fixes: #8211 Signed-off-by: Rob Bradford --- vmm/src/memory_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index dfe0f3e490..61e21aad6b 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -1932,8 +1932,8 @@ impl MemoryManager { mmap_flags |= libc::MAP_SHARED; Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) } else { - mmap_flags |= libc::MAP_PRIVATE; - Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) 
+ mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; + None }; let region = MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) From 3759424bfcfb3dd908fa49eca451b1255d40dc91 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 15 May 2026 19:01:06 +0200 Subject: [PATCH 002/178] vm-migration: be explicit about commands Reordering commands or adding commands in between breaks the migration protocol. By using explicit numbers, we can increase the attention required when touching this code. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/protocol.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 997ac8d815..5fffbada87 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -111,19 +111,19 @@ use crate::bitpos_iterator::BitposIteratorExt; #[derive(Debug, Copy, Clone, Default, PartialEq, Eq)] pub enum Command { #[default] - Invalid, - Start, - Config, - State, - Memory, + Invalid = 0, + Start = 1, + Config = 2, + State = 3, + Memory = 4, /// Finalizes the migration and resumes the VM on the destination. /// Sent when the source VM was running at migration time. - Complete, - Abandon, - MemoryFd, + Complete = 5, + Abandon = 6, + MemoryFd = 7, /// Finalizes the migration without resuming the VM on the destination. /// Sent when the source VM was paused at migration time. - CompletePaused, + CompletePaused = 8, } #[repr(C)] From 086059736a76e7cc5800a3ec29b38f3d8c1e2327 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 15 May 2026 19:02:30 +0200 Subject: [PATCH 003/178] vmm: migration: better observe and log invalid states This increases debuggability. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 6c9d476ab2..4355836735 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -685,6 +685,18 @@ enum ReceiveMigrationState { } impl ReceiveMigrationState { + fn variant_name(&self) -> &'static str { + match self { + ReceiveMigrationState::Established => "Established", + ReceiveMigrationState::Started => "Started", + ReceiveMigrationState::MemoryFdsReceived(_) => "MemoryFdsReceived", + ReceiveMigrationState::Configured(_) => "Configured", + ReceiveMigrationState::StateReceived { .. } => "StateReceived", + ReceiveMigrationState::Completed => "Completed", + ReceiveMigrationState::Aborted => "Aborted", + } + } + fn finished(&self) -> bool { matches!( self, @@ -887,9 +899,9 @@ impl Vmm { ) -> std::result::Result { use ReceiveMigrationState::*; - let invalid_command = || { + let invalid_command = |state: &str, cmd: Command| { Err(MigratableError::MigrateReceive(anyhow!( - "Can't handle command in current state" + "Can't handle command {cmd:?} in current receive state {state}" ))) }; @@ -927,22 +939,23 @@ impl Vmm { return Ok(Aborted); } + let state_name = state.variant_name(); match state { Established => match req.command() { Command::Start => Ok(Started), - _ => invalid_command(), + c => invalid_command(state_name, c), }, Started => match req.command() { Command::MemoryFd => recv_memory_fd(socket, Vec::new()).map(MemoryFdsReceived), Command::Config => configure_vm(socket, Default::default()).map(Configured), - _ => invalid_command(), + c => invalid_command(state_name, c), }, MemoryFdsReceived(memory_files) => match req.command() { Command::MemoryFd => recv_memory_fd(socket, memory_files).map(MemoryFdsReceived), Command::Config => { configure_vm(socket, HashMap::from_iter(memory_files)).map(Configured) } - _ => invalid_command(), + c => 
invalid_command(state_name, c), }, Configured(mut config_data) => match req.command() { // Memory commands use the main connection only in the single-connection case. @@ -981,7 +994,7 @@ impl Vmm { state_receive_begin, }) } - _ => invalid_command(), + c => invalid_command(state_name, c), }, StateReceived { state_receive_begin, @@ -1011,7 +1024,7 @@ impl Vmm { ); Ok(Completed) } - _ => invalid_command(), + c => invalid_command(state_name, c), }, Completed | Aborted => { unreachable!("Performed a step on the finished state machine") From ebc1e7e54201206b27c4877f6b76ba6f6cfca82e Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 19 May 2026 14:15:00 +0100 Subject: [PATCH 004/178] vmm: Don't store the snapshot on the DeviceManager Storing the snapshot causes issues when a subsequent hotplug is needed. Instead, just pass it through to all the methods that need it, making the lifecycle cleaner. Assisted-by: Claude:Opus-4.6 Signed-off-by: Rob Bradford --- vmm/src/device_manager.rs | 224 ++++++++++++++++++++++---------------- vmm/src/vm.rs | 23 +++- 2 files changed, 152 insertions(+), 95 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index e6902a4cbf..c5506ba0b8 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1138,8 +1138,6 @@ pub struct DeviceManager { // Addresses for ACPI platform devices e.g. 
ACPI PM timer, sleep/reset registers acpi_platform_addresses: AcpiPlatformAddresses, - snapshot: Option, - rate_limit_groups: HashMap>, mmio_regions: Arc>>, @@ -1433,7 +1431,6 @@ impl DeviceManager { timestamp, pending_activations: Arc::new(Mutex::new(Vec::default())), acpi_platform_addresses: AcpiPlatformAddresses::default(), - snapshot: snapshot.cloned(), rate_limit_groups, mmio_regions: Arc::new(Mutex::new(Vec::new())), #[cfg(feature = "fw_cfg")] @@ -1463,8 +1460,9 @@ impl DeviceManager { pub fn create_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { - self.add_interrupt_controller() + self.add_interrupt_controller(snapshot) } #[allow(clippy::needless_pass_by_value)] @@ -1474,6 +1472,7 @@ impl DeviceManager { console_resize_pipe: Option>, original_termios_opt: Arc>>, interrupt_controller: Arc>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<()> { trace_scoped!("create_devices"); @@ -1511,7 +1510,7 @@ impl DeviceManager { )?; #[cfg(target_arch = "aarch64")] - self.add_legacy_devices(legacy_interrupt_manager.as_ref())?; + self.add_legacy_devices(legacy_interrupt_manager.as_ref(), snapshot)?; { self.ged_notification_device = self.add_acpi_devices( @@ -1531,6 +1530,7 @@ impl DeviceManager { legacy_interrupt_manager.as_ref(), console_info, console_resize_pipe, + snapshot, )?; #[cfg(not(target_arch = "riscv64"))] @@ -1541,8 +1541,8 @@ impl DeviceManager { } self.legacy_interrupt_manager = Some(legacy_interrupt_manager); - self.make_virtio_devices()?; - self.add_pci_devices()?; + self.make_virtio_devices(snapshot)?; + self.add_pci_devices(snapshot)?; // Add pvmemcontrol if required #[cfg(feature = "pvmemcontrol")] @@ -1556,12 +1556,12 @@ impl DeviceManager { } if self.config.clone().lock().unwrap().pvpanic { - self.pvpanic_device = self.add_pvpanic_device()?; + self.pvpanic_device = self.add_pvpanic_device(snapshot)?; } #[cfg(feature = "ivshmem")] if let Some(ivshmem) = 
self.config.clone().lock().unwrap().ivshmem.as_ref() { - self.ivshmem_device = self.add_ivshmem_device(ivshmem)?; + self.ivshmem_device = self.add_ivshmem_device(ivshmem, snapshot)?; } Ok(()) @@ -1651,7 +1651,7 @@ impl DeviceManager { } #[allow(unused_variables)] - fn add_pci_devices(&mut self) -> DeviceManagerResult<()> { + fn add_pci_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let iommu_id = String::from(IOMMU_DEVICE_NAME); let iommu_address_width_bits = @@ -1670,7 +1670,7 @@ impl DeviceManager { .map_err(DeviceManagerError::EventFd)?, self.get_msi_iova_space(), iommu_address_width_bits, - state_from_id(self.snapshot.as_ref(), iommu_id.as_str()) + state_from_id(snapshot, iommu_id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioIommu)?; @@ -1713,6 +1713,7 @@ impl DeviceManager { false, handle.dma_handler, handle.pci_common.pci_device_id, + snapshot, )?; // Track device BDF for Generic Initiator support @@ -1723,10 +1724,10 @@ impl DeviceManager { } } - let mut vfio_iommu_device_ids = self.add_vfio_devices()?; + let mut vfio_iommu_device_ids = self.add_vfio_devices(snapshot)?; iommu_attached_devices.append(&mut vfio_iommu_device_ids); - let mut vfio_user_iommu_device_ids = self.add_user_devices()?; + let mut vfio_user_iommu_device_ids = self.add_user_devices(snapshot)?; iommu_attached_devices.append(&mut vfio_user_iommu_device_ids); // Add all devices from forced iommu segments @@ -1752,6 +1753,7 @@ impl DeviceManager { false, None, None, + snapshot, )?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } @@ -1774,6 +1776,7 @@ impl DeviceManager { #[cfg(target_arch = "aarch64")] fn add_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let interrupt_controller: Arc> = Arc::new(Mutex::new( gic::Gic::new( @@ -1788,7 +1791,7 @@ impl DeviceManager { // Restore the vGic if this is in the process of restoration let id = 
String::from(gic::GIC_SNAPSHOT_ID); - if let Some(vgic_snapshot) = snapshot_from_id(self.snapshot.as_ref(), &id) { + if let Some(vgic_snapshot) = snapshot_from_id(snapshot, &id) { // PMU support is optional. Nothing should be impacted if the PMU initialization failed. if self .cpu_manager @@ -1827,6 +1830,7 @@ impl DeviceManager { #[cfg(target_arch = "riscv64")] fn add_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let interrupt_controller: Arc> = Arc::new(Mutex::new( aia::Aia::new( @@ -1841,7 +1845,7 @@ impl DeviceManager { // Restore the vAia if this is in the process of restoration let id = String::from(aia::_AIA_SNAPSHOT_ID); - if let Some(_vaia_snapshot) = snapshot_from_id(self.snapshot.as_ref(), &id) { + if let Some(_vaia_snapshot) = snapshot_from_id(snapshot, &id) { // TODO: vAia snapshotting and restoration is scheduled to next stage of riscv64 support. // TODO: PMU support is scheduled to next stage of riscv64 support. // PMU support is optional. Nothing should be impacted if the PMU initialization failed. 
@@ -1864,11 +1868,12 @@ impl DeviceManager { #[cfg(target_arch = "x86_64")] fn add_interrupt_controller( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let id = String::from(IOAPIC_DEVICE_NAME); - let state = state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?; + let state = + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?; // Create IOAPIC let interrupt_controller = Arc::new(Mutex::new( ioapic::Ioapic::new( @@ -2090,6 +2095,7 @@ impl DeviceManager { fn add_legacy_devices( &mut self, interrupt_manager: &dyn InterruptManager, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<()> { // Add a RTC device let rtc_irq = self @@ -2140,8 +2146,7 @@ impl DeviceManager { let gpio_device = Arc::new(Mutex::new(devices::legacy::Gpio::new( id.clone(), interrupt_group, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2224,6 +2229,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, serial_writer: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { // Serial is tied to IRQ #4 let serial_irq = 4; @@ -2240,8 +2246,7 @@ impl DeviceManager { id.clone(), interrupt_group, serial_writer, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2275,6 +2280,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, serial_writer: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let id = String::from(SERIAL_DEVICE_NAME); @@ -2297,8 +2303,7 @@ impl DeviceManager { interrupt_group, serial_writer, self.timestamp, - state_from_id(self.snapshot.as_ref(), id.as_str()) - 
.map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2339,6 +2344,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, serial_writer: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let id = String::from(SERIAL_DEVICE_NAME); @@ -2360,8 +2366,7 @@ impl DeviceManager { id.clone(), interrupt_group, serial_writer, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ))); self.bus_devices @@ -2401,6 +2406,7 @@ impl DeviceManager { &mut self, transport: ConsoleTransport, resize_pipe: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>> { let mut console_config = self.config.lock().unwrap().console.clone(); let endpoint = match transport { @@ -2457,8 +2463,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, + state_from_id(snapshot, id.as_str()).map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioConsole)?; let virtio_console_device = Arc::new(Mutex::new(virtio_console_device)); @@ -2497,6 +2502,7 @@ impl DeviceManager { interrupt_manager: &dyn InterruptManager, console_info: Option, console_resize_pipe: Option>, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult> { let serial_config = self.config.lock().unwrap().serial.clone(); if console_info.is_none() { @@ -2517,7 +2523,7 @@ impl DeviceManager { }; if !matches!(console_info.serial, ConsoleTransport::Off) { - let serial = self.add_serial_device(interrupt_manager, serial_writer)?; + let serial = self.add_serial_device(interrupt_manager, serial_writer, snapshot)?; self.serial_manager = match console_info.serial { ConsoleTransport::Pty(_) | 
ConsoleTransport::Tty(_) @@ -2560,7 +2566,7 @@ impl DeviceManager { } let console_resizer = - self.add_virtio_console_device(console_info.console, console_resize_pipe)?; + self.add_virtio_console_device(console_info.console, console_resize_pipe, snapshot)?; Ok(Arc::new(Console { console_resizer })) } @@ -2618,34 +2624,34 @@ impl DeviceManager { Ok(()) } - fn make_virtio_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Create "standard" virtio devices (net/block/rng) - self.make_virtio_block_devices()?; - self.make_virtio_net_devices()?; - self.make_virtio_rng_devices()?; + self.make_virtio_block_devices(snapshot)?; + self.make_virtio_net_devices(snapshot)?; + self.make_virtio_rng_devices(snapshot)?; // Add generic vhost-user if required - self.make_generic_vhost_user_devices()?; + self.make_generic_vhost_user_devices(snapshot)?; // Add virtio-fs if required - self.make_virtio_fs_devices()?; + self.make_virtio_fs_devices(snapshot)?; // Add virtio-pmem if required - self.make_virtio_pmem_devices()?; + self.make_virtio_pmem_devices(snapshot)?; // Add virtio-vsock if required - self.make_virtio_vsock_devices()?; + self.make_virtio_vsock_devices(snapshot)?; - self.make_virtio_mem_devices()?; + self.make_virtio_mem_devices(snapshot)?; // Add virtio-balloon if required - self.make_virtio_balloon_devices()?; + self.make_virtio_balloon_devices(snapshot)?; // Add virtio-watchdog device - self.make_virtio_watchdog_devices()?; + self.make_virtio_watchdog_devices(snapshot)?; // Add vDPA devices if required - self.make_vdpa_devices()?; + self.make_vdpa_devices(snapshot)?; Ok(()) } @@ -2663,6 +2669,7 @@ impl DeviceManager { &mut self, disk_cfg: &mut DiskConfig, is_hotplug: bool, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match disk_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -2694,7 +2701,7 @@ impl DeviceManager { .try_clone() 
.map_err(DeviceManagerError::EventFd)?, self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) { Ok(vub_device) => vub_device, @@ -2829,7 +2836,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, queue_affinity, disk_cfg.sparse, @@ -2873,11 +2880,14 @@ impl DeviceManager { }) } - fn make_virtio_block_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_block_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { let mut block_devices = self.config.lock().unwrap().disks.take(); if let Some(disk_list_cfg) = &mut block_devices { for disk_cfg in disk_list_cfg.iter_mut() { - let device = self.make_virtio_block_device(disk_cfg, false)?; + let device = self.make_virtio_block_device(disk_cfg, false, snapshot)?; self.virtio_devices.push(device); } } @@ -2889,6 +2899,7 @@ impl DeviceManager { fn make_virtio_net_device( &mut self, net_cfg: &mut NetConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match net_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -2923,7 +2934,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, net_cfg.offload_tso, net_cfg.offload_ufo, @@ -2941,7 +2952,7 @@ impl DeviceManager { vhost_user_net as Arc>, ) } else { - let state = state_from_id(self.snapshot.as_ref(), id.as_str()) + let state = state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?; let virtio_net = if let Some(ref tap_if_name) = net_cfg.tap { Arc::new(Mutex::new( @@ -3043,11 +3054,11 @@ impl DeviceManager 
{ } /// Add virto-net and vhost-user-net devices - fn make_virtio_net_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_net_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let mut net_devices = self.config.lock().unwrap().net.take(); if let Some(net_list_cfg) = &mut net_devices { for net_cfg in net_list_cfg.iter_mut() { - let device = self.make_virtio_net_device(net_cfg)?; + let device = self.make_virtio_net_device(net_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3056,7 +3067,7 @@ impl DeviceManager { Ok(()) } - fn make_virtio_rng_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_rng_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Add virtio-rng if required let mut rng_config = self.config.lock().unwrap().rng.clone(); if let Some(rng_path) = rng_config.src.to_str() { @@ -3080,7 +3091,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioRng)?, @@ -3107,6 +3118,7 @@ impl DeviceManager { fn make_generic_vhost_user_device( &mut self, generic_vhost_user_cfg: &mut GenericVhostUserConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match generic_vhost_user_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3134,7 +3146,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateGenericVhostUser)?, @@ -3156,11 +3168,15 @@ impl DeviceManager { } } - fn make_generic_vhost_user_devices(&mut self) -> DeviceManagerResult<()> { + fn make_generic_vhost_user_devices( + &mut self, + snapshot: Option<&Snapshot>, 
+ ) -> DeviceManagerResult<()> { let mut generic_vhost_user_devices = self.config.lock().unwrap().generic_vhost_user.clone(); if let Some(generic_vhost_user_list_cfg) = &mut generic_vhost_user_devices { for generic_vhost_user_cfg in generic_vhost_user_list_cfg.iter_mut() { - let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + let device = + self.make_generic_vhost_user_device(generic_vhost_user_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3172,6 +3188,7 @@ impl DeviceManager { fn make_virtio_fs_device( &mut self, fs_cfg: &mut FsConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match fs_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3200,7 +3217,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, self.force_access_platform, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioFs)?, @@ -3221,11 +3238,11 @@ impl DeviceManager { } } - fn make_virtio_fs_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_fs_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let mut fs_devices = self.config.lock().unwrap().fs.take(); if let Some(fs_list_cfg) = &mut fs_devices { for fs_cfg in fs_list_cfg.iter_mut() { - let device = self.make_virtio_fs_device(fs_cfg)?; + let device = self.make_virtio_fs_device(fs_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3237,6 +3254,7 @@ impl DeviceManager { fn make_virtio_pmem_device( &mut self, pmem_cfg: &mut PmemConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match pmem_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3383,7 +3401,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) 
.map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioPmem)?, @@ -3406,12 +3424,12 @@ impl DeviceManager { }) } - fn make_virtio_pmem_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_pmem_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Add virtio-pmem if required let mut pmem_devices = self.config.lock().unwrap().pmem.take(); if let Some(pmem_list_cfg) = &mut pmem_devices { for pmem_cfg in pmem_list_cfg.iter_mut() { - let device = self.make_virtio_pmem_device(pmem_cfg)?; + let device = self.make_virtio_pmem_device(pmem_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3423,6 +3441,7 @@ impl DeviceManager { fn make_virtio_vsock_device( &mut self, vsock_cfg: &mut VsockConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match vsock_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3454,7 +3473,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioVsock)?, @@ -3476,10 +3495,13 @@ impl DeviceManager { }) } - fn make_virtio_vsock_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_vsock_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { let mut vsock = self.config.lock().unwrap().vsock.take(); if let Some(vsock_cfg) = &mut vsock { - let device = self.make_virtio_vsock_device(vsock_cfg)?; + let device = self.make_virtio_vsock_device(vsock_cfg, snapshot)?; self.virtio_devices.push(device); } self.config.lock().unwrap().vsock = vsock; @@ -3487,7 +3509,7 @@ impl DeviceManager { Ok(()) } - fn make_virtio_mem_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_mem_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { let mm = self.memory_manager.clone(); 
let mut mm = mm.lock().unwrap(); for (memory_zone_id, memory_zone) in mm.memory_zones_mut().iter_mut() { @@ -3509,7 +3531,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::EventFd)?, virtio_mem_zone.blocks_state().clone(), - state_from_id(self.snapshot.as_ref(), memory_zone_id.as_str()) + state_from_id(snapshot, memory_zone_id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioMem)?, @@ -3587,7 +3609,10 @@ impl DeviceManager { Ok((pvmemcontrol_bus_device, pvmemcontrol_pci_device)) } - fn make_virtio_balloon_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_balloon_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { if let Some(balloon_config) = &self.config.lock().unwrap().balloon { let id = String::from(BALLOON_DEVICE_NAME); info!("Creating virtio-balloon device: id = {id}"); @@ -3603,7 +3628,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioBalloon)?, @@ -3630,7 +3655,10 @@ impl DeviceManager { Ok(()) } - fn make_virtio_watchdog_devices(&mut self) -> DeviceManagerResult<()> { + fn make_virtio_watchdog_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult<()> { if !self.config.lock().unwrap().watchdog { return Ok(()); } @@ -3646,7 +3674,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVirtioWatchdog)?, @@ -3672,6 +3700,7 @@ impl DeviceManager { fn make_vdpa_device( &mut self, vdpa_cfg: &mut VdpaConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = match 
vdpa_cfg.pci_common.id.as_ref() { Some(id) => id.clone(), @@ -3695,7 +3724,7 @@ impl DeviceManager { device_path, self.memory_manager.lock().unwrap().guest_memory(), vdpa_cfg.num_queues as u16, - state_from_id(self.snapshot.as_ref(), id.as_str()) + state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) .map_err(DeviceManagerError::CreateVdpa)?, @@ -3719,12 +3748,12 @@ impl DeviceManager { }) } - fn make_vdpa_devices(&mut self) -> DeviceManagerResult<()> { + fn make_vdpa_devices(&mut self, snapshot: Option<&Snapshot>) -> DeviceManagerResult<()> { // Add vdpa if required let mut vdpa_devices = self.config.lock().unwrap().vdpa.take(); if let Some(vdpa_list_cfg) = &mut vdpa_devices { for vdpa_cfg in vdpa_list_cfg.iter_mut() { - let device = self.make_vdpa_device(vdpa_cfg)?; + let device = self.make_vdpa_device(vdpa_cfg, snapshot)?; self.virtio_devices.push(device); } } @@ -3759,6 +3788,7 @@ impl DeviceManager { fn add_passthrough_device( &mut self, device_cfg: &mut DeviceConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<(PciBdf, String)> { // If the passthrough device has not been created yet, it is created // here and stored in the DeviceManager structure for future needs. 
@@ -3771,7 +3801,7 @@ impl DeviceManager { ); } - self.add_vfio_device(device_cfg) + self.add_vfio_device(device_cfg, snapshot) } fn create_vfio_ops(&self) -> DeviceManagerResult> { @@ -3814,6 +3844,7 @@ impl DeviceManager { fn add_vfio_device( &mut self, device_cfg: &mut DeviceConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<(PciBdf, String)> { let vfio_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() @@ -3953,7 +3984,7 @@ impl DeviceManager { vfio_p2p_dma, pci_device_bdf, memory_manager.lock().unwrap().memory_slot_allocator(), - vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_name.as_str()), + vm_migration::snapshot_from_id(snapshot, vfio_name.as_str()), device_cfg.x_nv_gpudirect_clique, device_cfg .x_exclude_mmap_bars @@ -4062,13 +4093,16 @@ impl DeviceManager { Ok(new_resources) } - fn add_vfio_devices(&mut self) -> DeviceManagerResult> { + fn add_vfio_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult> { let mut iommu_attached_device_ids = Vec::new(); let mut devices = self.config.lock().unwrap().devices.take(); if let Some(device_list_cfg) = &mut devices { for device_cfg in device_list_cfg.iter_mut() { - let (device_id, _) = self.add_passthrough_device(device_cfg)?; + let (device_id, _) = self.add_passthrough_device(device_cfg, snapshot)?; if device_cfg.pci_common.iommu && self.iommu_device.is_some() { iommu_attached_device_ids.push(device_id); } @@ -4084,6 +4118,7 @@ impl DeviceManager { fn add_vfio_user_device( &mut self, device_cfg: &mut UserDeviceConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult<(PciBdf, String)> { let vfio_user_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() @@ -4129,7 +4164,7 @@ impl DeviceManager { legacy_interrupt_group, pci_device_bdf, memory_manager.lock().unwrap().memory_slot_allocator(), - vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_user_name.as_str()), + vm_migration::snapshot_from_id(snapshot, 
vfio_user_name.as_str()), ) .map_err(DeviceManagerError::VfioUserCreate)?; @@ -4191,12 +4226,15 @@ impl DeviceManager { Ok((pci_device_bdf, vfio_user_name)) } - fn add_user_devices(&mut self) -> DeviceManagerResult> { + fn add_user_devices( + &mut self, + snapshot: Option<&Snapshot>, + ) -> DeviceManagerResult> { let mut user_devices = self.config.lock().unwrap().user_devices.take(); if let Some(device_list_cfg) = &mut user_devices { for device_cfg in device_list_cfg.iter_mut() { - let (_device_id, _id) = self.add_vfio_user_device(device_cfg)?; + let (_device_id, _id) = self.add_vfio_user_device(device_cfg, snapshot)?; } } @@ -4216,6 +4254,7 @@ impl DeviceManager { is_hotplug: bool, dma_handler: Option>, pci_device_id: Option, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -4313,7 +4352,7 @@ impl DeviceManager { use_64bit_bar_for_virtio_device(device_type, pci_segment_id, is_hotplug), dma_handler, self.pending_activations.clone(), - vm_migration::snapshot_from_id(self.snapshot.as_ref(), id.as_str()), + vm_migration::snapshot_from_id(snapshot, id.as_str()), ) .map_err(DeviceManagerError::VirtioDevice)?, )); @@ -4347,6 +4386,7 @@ impl DeviceManager { fn add_pvpanic_device( &mut self, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>>> { let id = String::from(PVPANIC_DEVICE_NAME); let pci_segment_id = 0x0_u16; @@ -4356,7 +4396,7 @@ impl DeviceManager { let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources(&id, pci_segment_id, None)?; - let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); + let snapshot = snapshot_from_id(snapshot, id.as_str()); let pvpanic_device = devices::PvPanicDevice::new(id.clone(), snapshot) .map_err(DeviceManagerError::PvPanicCreate)?; @@ -4386,6 +4426,7 @@ impl DeviceManager { fn add_ivshmem_device( &mut self, ivshmem_cfg: &IvshmemConfig, + snapshot: Option<&Snapshot>, ) -> DeviceManagerResult>>> { let id = 
String::from(IVSHMEM_DEVICE_NAME); let pci_segment_id = 0x0_u16; @@ -4393,7 +4434,7 @@ impl DeviceManager { let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources(&id, pci_segment_id, None)?; - let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); + let snapshot = snapshot_from_id(snapshot, id.as_str()); let ivshmem_ops = Arc::new(Mutex::new(IvshmemHandler { memory_manager: self.memory_manager.clone(), @@ -4634,7 +4675,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let (bdf, device_name) = self.add_passthrough_device(device_cfg)?; + let (bdf, device_name) = self.add_passthrough_device(device_cfg, None)?; // Update the PCIU bitmap self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= @@ -4663,7 +4704,7 @@ impl DeviceManager { )); } - let (bdf, device_name) = self.add_vfio_user_device(device_cfg)?; + let (bdf, device_name) = self.add_vfio_user_device(device_cfg, None)?; // Update the PCIU bitmap self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= @@ -5033,6 +5074,7 @@ impl DeviceManager { true, handle.dma_handler, handle.pci_common.pci_device_id, + None, )?; // Update the PCIU bitmap @@ -5065,14 +5107,14 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_block_device(disk_cfg, true)?; + let device = self.make_virtio_block_device(disk_cfg, true, None)?; self.hotplug_virtio_pci_device(device) } pub fn add_fs(&mut self, fs_cfg: &mut FsConfig) -> DeviceManagerResult { self.validate_identifier(&fs_cfg.pci_common.id)?; - let device = self.make_virtio_fs_device(fs_cfg)?; + let device = self.make_virtio_fs_device(fs_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5082,7 +5124,7 @@ impl DeviceManager { ) -> DeviceManagerResult { self.validate_identifier(&generic_vhost_user_cfg.pci_common.id)?; - let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + let device = 
self.make_generic_vhost_user_device(generic_vhost_user_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5093,7 +5135,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_pmem_device(pmem_cfg)?; + let device = self.make_virtio_pmem_device(pmem_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5104,7 +5146,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_net_device(net_cfg)?; + let device = self.make_virtio_net_device(net_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5115,7 +5157,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_vdpa_device(vdpa_cfg)?; + let device = self.make_vdpa_device(vdpa_cfg, None)?; self.hotplug_virtio_pci_device(device) } @@ -5126,7 +5168,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_vsock_device(vsock_cfg)?; + let device = self.make_virtio_vsock_device(vsock_cfg, None)?; self.hotplug_virtio_pci_device(device) } diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 1c3baf8652..8660cfb2a6 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -940,6 +940,7 @@ impl Vm { console_info, console_resize_pipe, original_termios, + snapshot, )?; } @@ -980,6 +981,7 @@ impl Vm { console_info.cloned(), console_resize_pipe.cloned(), original_termios.clone(), + snapshot, )?; } @@ -1039,10 +1041,11 @@ impl Vm { }; // Create interrupt controller and devices for MSHV + let dm_snapshot = snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID); let ic = device_manager .lock() .unwrap() - .create_interrupt_controller() + .create_interrupt_controller(dm_snapshot) .map_err(Error::DeviceManager)?; #[cfg(target_arch = "aarch64")] @@ -1056,6 +1059,7 @@ impl Vm { console_resize_pipe.cloned(), original_termios.clone(), ic, + dm_snapshot, ) .map_err(Error::DeviceManager)?; @@ -1073,11 +1077,13 @@ impl Vm { 
console_info: Option<&ConsoleInfo>, console_resize_pipe: Option<&Arc>, original_termios: &Arc>>, + snapshot: Option<&Snapshot>, ) -> Result<()> { + let dm_snapshot = snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID); let ic = device_manager .lock() .unwrap() - .create_interrupt_controller() + .create_interrupt_controller(dm_snapshot) .map_err(Error::DeviceManager)?; #[cfg(target_arch = "aarch64")] @@ -1091,6 +1097,7 @@ impl Vm { console_resize_pipe.cloned(), original_termios.clone(), ic, + dm_snapshot, ) .map_err(Error::DeviceManager)?; @@ -1105,13 +1112,15 @@ impl Vm { console_info: Option, console_resize_pipe: Option>, original_termios: Arc>>, + snapshot: Option<&Snapshot>, ) -> Result<()> { // For KVM, create interrupt controller after boot vcpus // because GIC state is restored from snapshot during vcpu creation + let dm_snapshot = snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID); let ic = device_manager .lock() .unwrap() - .create_interrupt_controller() + .create_interrupt_controller(dm_snapshot) .map_err(Error::DeviceManager)?; vm.init().map_err(Error::InitializeVm)?; @@ -1119,7 +1128,13 @@ impl Vm { device_manager .lock() .unwrap() - .create_devices(console_info, console_resize_pipe, original_termios, ic) + .create_devices( + console_info, + console_resize_pipe, + original_termios, + ic, + dm_snapshot, + ) .map_err(Error::DeviceManager)?; Ok(()) From 2ab7ee6690e1e0b5a4323d3f4dfa839275537285 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 14 Apr 2026 12:59:00 +0200 Subject: [PATCH 005/178] vmm: remove redundant SocketStream overrides ReadVolatile already provides a default read_exact_volatile() implementation, and WriteVolatile a default write_all_volatile() implementation. Overriding these functions adds no behavioral value, but duplicates logic and needs to be updated whenever SocketStream gains or changes a variant.
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 6440dc8fd6..0a92f3a0b4 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -162,16 +162,6 @@ impl ReadVolatile for SocketStream { SocketStream::Tcp(s) => s.read_volatile(buf), } } - - fn read_exact_volatile( - &mut self, - buf: &mut VolatileSlice, - ) -> Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.read_exact_volatile(buf), - SocketStream::Tcp(s) => s.read_exact_volatile(buf), - } - } } impl WriteVolatile for SocketStream { @@ -184,16 +174,6 @@ impl WriteVolatile for SocketStream { SocketStream::Tcp(s) => s.write_volatile(buf), } } - - fn write_all_volatile( - &mut self, - buf: &VolatileSlice, - ) -> Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.write_all_volatile(buf), - SocketStream::Tcp(s) => s.write_all_volatile(buf), - } - } } // Wait for `fd` to become readable. In this case, we return true. In case From cbfe18352673f1b9b8860176d1fad9235e4055bf Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 14 Apr 2026 10:43:19 +0200 Subject: [PATCH 006/178] vmm: remove AsRawFd trait for SocketStream The trait is not used and thus can be removed. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 0a92f3a0b4..45e0f753db 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -7,7 +7,7 @@ use std::io::{self, ErrorKind, Read, Write}; use std::net::{TcpListener, TcpStream}; use std::num::NonZeroU32; use std::os::fd::{AsFd, BorrowedFd}; -use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::PathBuf; use std::result::Result; @@ -134,15 +134,6 @@ impl Write for SocketStream { } } -impl AsRawFd for SocketStream { - fn as_raw_fd(&self) -> RawFd { - match self { - SocketStream::Unix(s) => s.as_raw_fd(), - SocketStream::Tcp(s) => s.as_raw_fd(), - } - } -} - impl AsFd for SocketStream { fn as_fd(&self) -> BorrowedFd<'_> { match self { From 162f74eac28dbfd892d78ca24cc826b4516c7a2c Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 14 Apr 2026 10:18:56 +0200 Subject: [PATCH 007/178] vm-migration: client-side of a TLS connection TLS connections have a TLS server (listens for incoming connections) and a TLS client (initiates the connection). 
This commit adds the code for the client side, which is the sender of a migration On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- Cargo.lock | 208 +++++++++++++++++++++++++++++++++++++--- Cargo.toml | 5 + vm-migration/Cargo.toml | 1 + vm-migration/src/lib.rs | 4 + vm-migration/src/tls.rs | 154 +++++++++++++++++++++++++++++ 5 files changed, 360 insertions(+), 12 deletions(-) create mode 100644 vm-migration/src/tls.rs diff --git a/Cargo.lock b/Cargo.lock index d0a21c5f0d..d669be9b44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,7 +71,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -82,7 +82,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -180,7 +180,7 @@ dependencies = [ "polling", "rustix", "slab", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -238,7 +238,7 @@ dependencies = [ "rustix", "signal-hook-registry", "slab", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -270,6 +270,28 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "backtrace" version 
= "0.3.76" @@ -468,6 +490,15 @@ dependencies = [ "zbus", ] +[[package]] +name = "cmake" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -654,9 +685,15 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys", + "windows-sys 0.61.2", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -736,7 +773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -820,6 +857,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.32" @@ -1765,7 +1808,7 @@ dependencies = [ "hermit-abi", "pin-project-lite", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -1926,6 +1969,20 @@ dependencies = [ "syn", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" version = "0.1.27" @@ -1948,7 +2005,42 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - 
"windows-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "aws-lc-rs", + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", ] [[package]] @@ -2146,6 +2238,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.117" @@ -2167,7 +2265,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -2177,7 +2275,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -2317,7 +2415,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" dependencies = [ "memoffset", "tempfile", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -2332,6 +2430,12 @@ version = "0.2.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "utf8parse" version = "0.2.2" @@ -2568,6 +2672,7 @@ version = "0.1.0" dependencies = [ "anyhow", "itertools", + "rustls", "serde", "serde_json", "thiserror", @@ -2795,6 +2900,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -2804,6 +2918,70 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "winnow" version = "1.0.0" @@ -2935,7 +3113,7 @@ dependencies = [ "tracing", "uds_windows", "uuid", - "windows-sys", + "windows-sys 0.61.2", "winnow", "zbus_macros", "zbus_names", @@ -2988,6 +3166,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index 3ef574c2f4..aa3cfa6cd1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,11 @@ itertools = "0.14.0" jiff = { version = "0.2", default-features = false, features = ["std"] } libc = "0.2.186" log = "0.4.29" +rustls = { version = "0.23.38", default-features = false, features = 
[ + "aws-lc-rs", + "std", + "tls12", +] } sha2 = "0.11.0" signal-hook = "0.4.4" thiserror = "2.0.18" diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index 66b4e4f6a9..a65773eb93 100644 --- a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -8,6 +8,7 @@ version = "0.1.0" [dependencies] anyhow = { workspace = true } itertools = { workspace = true } +rustls = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } thiserror = { workspace = true } diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 125d762bff..0613818c6a 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -16,6 +16,7 @@ use crate::protocol::MemoryRangeTable; mod bitpos_iterator; mod context; pub mod protocol; +pub mod tls; #[derive(Error, Debug)] pub enum UffdError { @@ -95,6 +96,9 @@ pub enum MigratableError { #[error("Lifecycle operation skipped for disconnected component {0}")] DeviceDisconnected(String), + + #[error("Error setting up a TLS-encrypted connection")] + Tls(#[source] tls::TlsError), } /// A Pausable component can be paused and resumed. diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs new file mode 100644 index 0000000000..afeed57e64 --- /dev/null +++ b/vm-migration/src/tls.rs @@ -0,0 +1,154 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! TLS support for migration streams over TCP. +//! +//! This module wraps `rustls` to provide a blocking [`TlsStream`] for migration +//! traffic. [`TlsStream::new_client`] authenticates the server against +//! `ca-cert.pem` and the expected hostname, and presents `client-cert.pem` and +//! `client-key.pem` for mutual TLS (mTLS) authentication. [`TlsServerConfig`] loads +//! `server-cert.pem` and `server-key.pem`, trusts client certificates issued by +//! the CA in `ca-cert.pem`, and [`TlsStream::new_server`] uses that +//! 
configuration to establish the server side of the connection. +//! +//! [`TlsStream`] implements [`Read`], [`Write`], [`ReadVolatile`], +//! [`WriteVolatile`], and [`AsFd`] so it can be used by the transport layer like +//! other migration streams. All data must pass through rustls; direct I/O on the +//! underlying socket would bypass TLS processing and break the connection. + +use std::net::TcpStream; +use std::path::Path; +use std::result; +use std::sync::Arc; + +use rustls::pki_types::pem::PemObject; +use rustls::pki_types::{CertificateDer, InvalidDnsNameError, PrivateKeyDer, ServerName}; +use rustls::server::VerifierBuilderError; +use rustls::{ClientConfig, ClientConnection, RootCertStore, StreamOwned}; +use thiserror::Error; + +use crate::MigratableError; + +/// Errors that can occur when establishing a TLS-encrypted migration channel. +#[derive(Error, Debug)] +pub enum TlsError { + #[error("The provided hostname could not be parsed")] + InvalidDnsName(#[source] InvalidDnsNameError), + + #[error("Rustls protocol error")] + RustlsError(#[from] rustls::Error), + + #[error("Rustls verifier configuration error")] + RustlsVerifierBuilderError(#[source] VerifierBuilderError), + + #[error("Rustls protocol IO error")] + RustlsIoError(#[from] std::io::Error), + + #[error("TLS handshake stalled: no read/write progress while handshake is still in progress")] + HandshakeError, + + #[error("Error handling PEM file")] + RustlsPemError(#[from] rustls::pki_types::pem::Error), +} + +/// Wraps the concrete rustls stream for either side (server or client) of the +/// TLS connection. +/// +/// [`TlsStream`] uses this enum to store a [`StreamOwned`] with either a +/// [`ClientConnection`] or [`ServerConnection`] while exposing a single +/// transport-agnostic API. +#[derive(Debug)] +enum TlsStreamParticipant { + Client(StreamOwned), +} + +/// Server/Client-agnostic TLS stream. 
+pub struct TlsStream { + stream: TlsStreamParticipant, +} + +impl TlsStream { + /// Creates a client [`TlsStream`]. + /// + /// The client verifies the server certificate against `ca-cert.pem` and the + /// provided `hostname`, and presents the certificate chain in + /// `client-cert.pem` together with the private key in `client-key.pem` for + /// mutual TLS authentication. + pub fn new_client( + socket: TcpStream, + cert_dir: &Path, + hostname: &str, + ) -> result::Result { + let root_store = load_root_store(&cert_dir.join("ca-cert.pem"))?; + let client_certs = load_cert_chain(&cert_dir.join("client-cert.pem"))?; + let client_key = load_private_key(&cert_dir.join("client-key.pem"))?; + + let config = ClientConfig::builder() + .with_root_certificates(root_store) + .with_client_auth_cert(client_certs, client_key) + .map_err(TlsError::RustlsError) + .map_err(MigratableError::Tls)?; + let config = Arc::new(config); + + let server_name = ServerName::try_from(hostname.to_string()) + .map_err(TlsError::InvalidDnsName) + .map_err(MigratableError::Tls)?; + let conn = ClientConnection::new(config.clone(), server_name.clone()) + .map_err(TlsError::RustlsError) + .map_err(MigratableError::Tls)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError) + .map_err(MigratableError::Tls)?; + // No handshake progress on a connection that should be handshaking, we treat + // that as a failure. + if rd == 0 && wr == 0 { + Err(MigratableError::Tls(TlsError::HandshakeError))?; + } + } + + Ok(Self { + stream: TlsStreamParticipant::Client(tls), + }) + } +} + +/// Loads trusted CA certificates into a root store, i.e. the set of trust anchors +/// used to verify the peer's certificate chain. 
+fn load_root_store(cert_path: &Path) -> result::Result { + let mut root_store = RootCertStore::empty(); + root_store.add_parsable_certificates( + CertificateDer::pem_file_iter(cert_path) + .map_err(TlsError::RustlsPemError) + .map_err(MigratableError::Tls)? + .map(|cert| cert.map_err(TlsError::RustlsPemError)) + .collect::>, TlsError>>() + .map_err(MigratableError::Tls)?, + ); + Ok(root_store) +} + +/// Loads a certificate chain to present during the TLS handshake. +fn load_cert_chain( + cert_path: &Path, +) -> result::Result>, MigratableError> { + CertificateDer::pem_file_iter(cert_path) + .map_err(TlsError::RustlsPemError) + .map_err(MigratableError::Tls)? + .map(|cert| cert.map_err(TlsError::RustlsPemError)) + .collect::>, TlsError>>() + .map_err(MigratableError::Tls) +} + +/// Loads the private key that proves ownership of the presented certificate chain. +fn load_private_key(key_path: &Path) -> result::Result, MigratableError> { + PrivateKeyDer::from_pem_file(key_path) + .map_err(TlsError::RustlsPemError) + .map_err(MigratableError::Tls) +} From 1eb31b1c0f5703a130e6147bc01cdba330148f7f Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 14 Apr 2026 10:42:10 +0200 Subject: [PATCH 008/178] vm-migration: server-side of a TLS connection Code for the TLS server, i.e. the receiver of a live migration. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/tls.rs | 73 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs index afeed57e64..f83da17cf6 100644 --- a/vm-migration/src/tls.rs +++ b/vm-migration/src/tls.rs @@ -25,8 +25,10 @@ use std::sync::Arc; use rustls::pki_types::pem::PemObject; use rustls::pki_types::{CertificateDer, InvalidDnsNameError, PrivateKeyDer, ServerName}; -use rustls::server::VerifierBuilderError; -use rustls::{ClientConfig, ClientConnection, RootCertStore, StreamOwned}; +use rustls::server::{VerifierBuilderError, WebPkiClientVerifier}; +use rustls::{ + ClientConfig, ClientConnection, RootCertStore, ServerConfig, ServerConnection, StreamOwned, +}; use thiserror::Error; use crate::MigratableError; @@ -62,6 +64,7 @@ pub enum TlsError { #[derive(Debug)] enum TlsStreamParticipant { Client(StreamOwned), + Server(StreamOwned), } /// Server/Client-agnostic TLS stream. @@ -117,6 +120,72 @@ impl TlsStream { stream: TlsStreamParticipant::Client(tls), }) } + + /// Creates a server [`TlsStream`]. Encrypts and decrypts data sent through + /// this stream using the certificates and key from the provided + /// [`TlsServerConfig`]. + pub fn new_server( + socket: TcpStream, + config: &TlsServerConfig, + ) -> result::Result { + let conn = ServerConnection::new(config.config.clone()) + .map_err(TlsError::RustlsError) + .map_err(MigratableError::Tls)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError) + .map_err(MigratableError::Tls)?; + // No handshake progress on a connection that should be handshaking, we treat + // that as a failure. 
+ if rd == 0 && wr == 0 { + Err(MigratableError::Tls(TlsError::HandshakeError))?; + } + } + + Ok(Self { + stream: TlsStreamParticipant::Server(tls), + }) + } +} + +/// Carries a server-TLS-config. Intended to be turned into a [`TlsStream`] +/// when paired with a [`TcpStream`]. +#[derive(Debug)] +pub struct TlsServerConfig { + /// This config is shared between all server connections. + config: Arc, +} + +impl TlsServerConfig { + /// Creates a [`TlsServerConfig`] from the certificate chain in + /// `server-cert.pem`, the private key in `server-key.pem`, and the client + /// trust anchors in `ca-cert.pem`. + /// + /// Client certificates presented during the TLS handshake must chain to a CA in + /// `ca-cert.pem`. + pub fn new(cert_dir: &Path) -> result::Result { + let server_certs = load_cert_chain(&cert_dir.join("server-cert.pem"))?; + let server_key = load_private_key(&cert_dir.join("server-key.pem"))?; + // Trust anchors used to verify client certificates for mTLS. + let client_roots = Arc::new(load_root_store(&cert_dir.join("ca-cert.pem"))?); + + let client_verifier = WebPkiClientVerifier::builder(client_roots) + .build() + .map_err(TlsError::RustlsVerifierBuilderError) + .map_err(MigratableError::Tls)?; + + let config = ServerConfig::builder() + .with_client_cert_verifier(client_verifier) + .with_single_cert(server_certs, server_key) + .map_err(TlsError::RustlsError) + .map_err(MigratableError::Tls)?; + let config = Arc::new(config); + Ok(Self { config }) + } } /// Loads trusted CA certificates into a root store, i.e. the set of trust anchors From 725f0f8f75b0a2ad86e4dadd1d24f617f0efeb16 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 14 Apr 2026 13:18:54 +0200 Subject: [PATCH 009/178] vmm: add TLS streams to migration transport Teach the migration transport to handle TLS-backed streams alongside plain TCP and UNIX sockets. Introduce a Tls variant in SocketStream and implement the necessary traits. 
Also updates the local-migration error path to reject any non-UNIX transport, which now includes TLS-wrapped TCP connections. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/tls.rs | 123 +++++++++++++++++++++++++++++++++ vmm/src/lib.rs | 4 +- vmm/src/migration_transport.rs | 8 +++ 3 files changed, 133 insertions(+), 2 deletions(-) diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs index f83da17cf6..e7af10d489 100644 --- a/vm-migration/src/tls.rs +++ b/vm-migration/src/tls.rs @@ -18,7 +18,9 @@ //! other migration streams. All data must pass through rustls; direct I/O on the //! underlying socket would bypass TLS processing and break the connection. +use std::io::{self, Read, Write}; use std::net::TcpStream; +use std::os::fd::{AsFd, BorrowedFd}; use std::path::Path; use std::result; use std::sync::Arc; @@ -30,6 +32,8 @@ use rustls::{ ClientConfig, ClientConnection, RootCertStore, ServerConfig, ServerConnection, StreamOwned, }; use thiserror::Error; +use vm_memory::bitmap::BitmapSlice; +use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; use crate::MigratableError; @@ -70,9 +74,17 @@ enum TlsStreamParticipant { /// Server/Client-agnostic TLS stream. pub struct TlsStream { stream: TlsStreamParticipant, + // We have to implement [`ReadVolatile`] and [`WriteVolatile`] for + // [`TlsStream`]. We use this buffer to avoid allocating a new buffer for + // every volatile read or write. + buf: Vec, } impl TlsStream { + /// The maximum size of [`TlsStream::buf`]. This keeps the reusable buffer + /// from growing without bound. + const BUF_SIZE: usize = 64 /* KiB */ << 10; + /// Creates a client [`TlsStream`]. 
/// /// The client verifies the server certificate against `ca-cert.pem` and the @@ -118,6 +130,7 @@ impl TlsStream { Ok(Self { stream: TlsStreamParticipant::Client(tls), + buf: Vec::new(), }) } @@ -148,10 +161,120 @@ impl TlsStream { Ok(Self { stream: TlsStreamParticipant::Server(tls), + buf: Vec::new(), }) } } +impl Read for TlsStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match &mut self.stream { + TlsStreamParticipant::Client(s) => Read::read(s, buf), + TlsStreamParticipant::Server(s) => Read::read(s, buf), + } + } +} + +impl Write for TlsStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + match &mut self.stream { + TlsStreamParticipant::Client(s) => Write::write(s, buf), + TlsStreamParticipant::Server(s) => Write::write(s, buf), + } + } + + fn flush(&mut self) -> io::Result<()> { + match &mut self.stream { + TlsStreamParticipant::Client(s) => Write::flush(s), + TlsStreamParticipant::Server(s) => Write::flush(s), + } + } +} + +// Reading from or writing to these FDs would break the connection, because +// those reads or writes wouldn't go through rustls. But the FD is necessary to +// listen for incoming connections. +impl AsFd for TlsStream { + fn as_fd(&self) -> BorrowedFd<'_> { + match &self.stream { + TlsStreamParticipant::Client(s) => s.get_ref().as_fd(), + TlsStreamParticipant::Server(s) => s.get_ref().as_fd(), + } + } +} + +impl ReadVolatile for TlsStream { + fn read_volatile( + &mut self, + vs: &mut VolatileSlice, + ) -> result::Result { + let len = vs.len().min(Self::BUF_SIZE); + + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let n = { + let (stream, buf) = (&mut self.stream, &mut self.buf[..len]); + + match stream { + TlsStreamParticipant::Client(s) => Read::read(s, buf), + TlsStreamParticipant::Server(s) => Read::read(s, buf), + } + .map_err(VolatileMemoryError::IOError)? 
+ }; + + if n == 0 { + return Ok(0); + } + + vs.copy_from(&self.buf[..n]); + self.buf.clear(); + Ok(n) + } +} + +impl WriteVolatile for TlsStream { + fn write_volatile( + &mut self, + vs: &VolatileSlice, + ) -> Result { + let len = vs.len().min(Self::BUF_SIZE); + + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = vs.copy_to(&mut buf[..len]); + + if n == 0 { + return Ok(0); + } + + let n = { + let stream = &mut self.stream; + + match stream { + TlsStreamParticipant::Client(s) => Write::write(s, buf), + TlsStreamParticipant::Server(s) => Write::write(s, buf), + } + .map_err(VolatileMemoryError::IOError)? + }; + + self.buf.clear(); + Ok(n) + } +} + /// Carries a server-TLS-config. Intended to be turned into a [`TlsStream`] /// when paired with a [`TcpStream`]. #[derive(Debug)] diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 4355836735..2b91244f7a 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1461,9 +1461,9 @@ impl Vmm { // Proceed with sending memory file descriptors over UNIX socket vm.send_memory_fds(unix_socket)?; } - SocketStream::Tcp(_tcp_socket) => { + _ => { return Err(MigratableError::MigrateSend(anyhow!( - "--local option is not supported with TCP sockets", + "--local option is only supported with UNIX sockets", ))); } } diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 45e0f753db..4763d9949e 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -26,6 +26,7 @@ use vm_memory::{ VolatileSlice, WriteVolatile, }; use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; +use vm_migration::tls::TlsStream; use vm_migration::{MigratableError, Snapshot}; use vmm_sys_util::eventfd::EventFd; @@ -107,6 +108,7 @@ impl AsFd for ReceiveListener { pub(crate) enum SocketStream { Unix(UnixStream), Tcp(TcpStream), + Tls(Box), } impl Read for SocketStream { @@ -114,6 +116,7 @@ impl Read for 
SocketStream { match self { SocketStream::Unix(stream) => stream.read(buf), SocketStream::Tcp(stream) => stream.read(buf), + SocketStream::Tls(stream) => stream.read(buf), } } } @@ -123,6 +126,7 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.write(buf), SocketStream::Tcp(stream) => stream.write(buf), + SocketStream::Tls(stream) => stream.write(buf), } } @@ -130,6 +134,7 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.flush(), SocketStream::Tcp(stream) => stream.flush(), + SocketStream::Tls(stream) => stream.flush(), } } } @@ -139,6 +144,7 @@ impl AsFd for SocketStream { match self { SocketStream::Unix(s) => s.as_fd(), SocketStream::Tcp(s) => s.as_fd(), + SocketStream::Tls(s) => s.as_fd(), } } } @@ -151,6 +157,7 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_volatile(buf), SocketStream::Tcp(s) => s.read_volatile(buf), + SocketStream::Tls(s) => s.read_volatile(buf), } } } @@ -163,6 +170,7 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_volatile(buf), SocketStream::Tcp(s) => s.write_volatile(buf), + SocketStream::Tls(s) => s.write_volatile(buf), } } } From a5a248b5c2537108bb484f3f1306805ec71607a8 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 14 Apr 2026 13:56:18 +0200 Subject: [PATCH 010/178] vmm: accept migration connections over TLS Extend ReceiveListener with a TLS-backed listener variant for migration receivers. Store the TCP listener together with the server TLS configuration, wrap accepted sockets in TlsStream::new_server(), and preserve the existing listener cloning and fd polling behavior so receive-side migration code can treat TLS listeners like the existing TCP and UNIX cases. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/tls.rs | 2 +- vmm/src/migration_transport.rs | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs index e7af10d489..41ad5fab7e 100644 --- a/vm-migration/src/tls.rs +++ b/vm-migration/src/tls.rs @@ -277,7 +277,7 @@ impl WriteVolatile for TlsStream { /// Carries a server-TLS-config. Intended to be turned into a [`TlsStream`] /// when paired with a [`TcpStream`]. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct TlsServerConfig { /// This config is shared between all server connections. config: Arc, diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 4763d9949e..87d00681af 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -26,7 +26,7 @@ use vm_memory::{ VolatileSlice, WriteVolatile, }; use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; -use vm_migration::tls::TlsStream; +use vm_migration::tls::{TlsServerConfig, TlsStream}; use vm_migration::{MigratableError, Snapshot}; use vmm_sys_util::eventfd::EventFd; @@ -42,6 +42,7 @@ pub(crate) const MAX_MIGRATION_CONNECTIONS: u32 = 128; pub(crate) enum ReceiveListener { Tcp(TcpListener), Unix(UnixListener), + Tls(TcpListener, TlsServerConfig), } impl ReceiveListener { @@ -58,6 +59,15 @@ impl ReceiveListener { .map(|(socket, _)| SocketStream::Unix(socket)) .context("Failed to accept Unix migration connection") .map_err(MigratableError::MigrateReceive), + ReceiveListener::Tls(listener, config) => listener + .accept() + .map(|(socket, _)| TlsStream::new_server(socket, config)) + .context("Failed to accept TCP connection") + .map_err(MigratableError::MigrateReceive)? 
+ .map(Box::new) + .map(SocketStream::Tls) + .context("Failed to accept TLS migration connection") + .map_err(MigratableError::MigrateReceive), } } @@ -91,6 +101,11 @@ impl ReceiveListener { .map(ReceiveListener::Unix) .context("Failed to clone Unix listener") .map_err(MigratableError::MigrateReceive), + ReceiveListener::Tls(listener, config) => listener + .try_clone() + .map(|listener| ReceiveListener::Tls(listener, config.clone())) + .context("Failed to clone TLS listener") + .map_err(MigratableError::MigrateReceive), } } } @@ -100,6 +115,7 @@ impl AsFd for ReceiveListener { match self { ReceiveListener::Tcp(listener) => listener.as_fd(), ReceiveListener::Unix(listener) => listener.as_fd(), + ReceiveListener::Tls(listener, _) => listener.as_fd(), } } } From 7da1e8ce3791dc3c1e332b990fbcb48a76e13c5e Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 15 Apr 2026 09:53:07 +0200 Subject: [PATCH 011/178] vmm: tighten migration URL validation For TLS we have to parse the hostname from the given migration URL. For that we have to make a few assumptions about the URL (e.g. it always has a port). To catch problems early, we tighten the URL validation. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/api/mod.rs | 132 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 115 insertions(+), 17 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index e4ee7235ad..91e3f02b30 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -271,6 +271,42 @@ pub struct VmReceiveMigrationData { pub receiver_url: String, } +/// Validates the host and port portion of a TCP migration URL. +/// +/// The expected format is `:` for hostnames and IPv4 addresses, or +/// `[]:` for IPv6 addresses. The host and port must both be +/// present, and the port must parse as a `u16`. 
+fn validate_tcp_migration_address(address: &str) -> Result<(), String> { + let (host, port) = if let Some(rest) = address.strip_prefix('[') { + let (host, rest) = rest + .split_once(']') + .ok_or_else(|| "missing closing ']' for bracketed IPv6 address".to_string())?; + + let port = rest + .strip_prefix(':') + .ok_or_else(|| "missing port separator after bracketed host".to_string())?; + + (host, port) + } else { + address + .rsplit_once(':') + .ok_or_else(|| "missing TCP port".to_string())? + }; + + if host.is_empty() { + return Err("host must not be empty".to_string()); + } + + if port.is_empty() { + return Err("port must not be empty".to_string()); + } + + port.parse::() + .map_err(|_| format!("invalid TCP port: {port}"))?; + + Ok(()) +} + #[derive(Copy, Clone, Default, Deserialize, Serialize, Debug, PartialEq, Eq)] /// The migration timeout strategy. /// @@ -440,26 +476,27 @@ impl VmSendMigrationData { } pub fn validate(&self) -> Result<(), VmSendMigrationConfigError> { - match self.destination_url.as_str() { - url if url - .strip_prefix("tcp:") - .is_some_and(|addr| !addr.is_empty()) => {} - url if url - .strip_prefix("unix:") - .is_some_and(|path| !path.is_empty()) => - { - if self.connections.get() > 1 { - return Err(VmSendMigrationConfigError::ValidationError( - "UNIX sockets and connections option cannot be used at the same time." - .to_string(), - )); - } - } - _ => { + if let Some(addr) = self.destination_url.strip_prefix("tcp:") { + validate_tcp_migration_address(addr).map_err(|e| { + VmSendMigrationConfigError::ValidationError(format!( + "destination_url must use tcp:: or unix:: {e}." + )) + })?; + } else if self + .destination_url + .strip_prefix("unix:") + .is_some_and(|path| !path.is_empty()) + { + if self.connections.get() > 1 { return Err(VmSendMigrationConfigError::ValidationError( - "destination_url must use tcp:: or unix:.".to_string(), + "UNIX sockets and connections option cannot be used at the same time." 
+ .to_string(), )); } + } else { + return Err(VmSendMigrationConfigError::ValidationError( + "destination_url must use tcp:: or unix:.".to_string(), + )); } if self.connections.get() > MAX_MIGRATION_CONNECTIONS { @@ -1758,6 +1795,57 @@ impl ApiAction for VmNmi { mod unit_tests { use super::*; + #[test] + fn test_validate_tcp_migration_address() { + for address in [ + "192.168.1.1:8080", + "destination.example:8080", + "[2001:db8::1]:8080", + "[::1]:0", + "localhost:65535", + ] { + validate_tcp_migration_address(address) + .unwrap_or_else(|e| panic!("expected {address} to be valid, got: {e}")); + } + + assert_eq!( + validate_tcp_migration_address("192.168.1.1").unwrap_err(), + "missing TCP port" + ); + assert_eq!( + validate_tcp_migration_address(":8080").unwrap_err(), + "host must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("host:").unwrap_err(), + "port must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("host:not-a-port").unwrap_err(), + "invalid TCP port: not-a-port" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1").unwrap_err(), + "missing closing ']' for bracketed IPv6 address" + ); + assert_eq!( + validate_tcp_migration_address("[]:8080").unwrap_err(), + "host must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1]").unwrap_err(), + "missing port separator after bracketed host" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1]:").unwrap_err(), + "port must not be empty" + ); + assert_eq!( + validate_tcp_migration_address("[2001:db8::1]:99999").unwrap_err(), + "invalid TCP port: 99999" + ); + } + #[test] fn test_vm_send_migration_data_parse() { // Fully specified @@ -1781,6 +1869,14 @@ mod unit_tests { assert_eq!(data.timeout_strategy, TimeoutStrategy::default()); assert_eq!(data.connections, VmSendMigrationData::default_connections()); + let data = VmSendMigrationData::parse("destination_url=tcp:[2001:db8::1]:8080") + .expect("IPv6 migration string should 
parse"); + assert_eq!(data.destination_url, "tcp:[2001:db8::1]:8080"); + + let data = VmSendMigrationData::parse("destination_url=tcp:destination.example:8080") + .expect("hostname migration string should parse"); + assert_eq!(data.destination_url, "tcp:destination.example:8080"); + // Missing destination_url is an error VmSendMigrationData::parse("local=on,downtime_ms=200").unwrap_err(); @@ -1817,6 +1913,8 @@ mod unit_tests { // Invalid destination URL scheme is rejected VmSendMigrationData::parse("destination_url=file:///tmp/migration").unwrap_err(); + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1").unwrap_err(); + VmSendMigrationData::parse("destination_url=tcp:[2001:db8::1]").unwrap_err(); // Local migration requires a UNIX socket destination VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,local=yes").unwrap_err(); From db5213feddae30f4df8f0aa6c8bfb257a603dd80 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 15 Apr 2026 10:16:33 +0200 Subject: [PATCH 012/178] vmm: add TLS API option to send migration call To enable TLS, the caller has to provide a path to a directory that contains the necessary files. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/api/mod.rs | 36 +++++++++++++++++++++-- vmm/src/api/openapi/cloud-hypervisor.yaml | 6 ++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 91e3f02b30..8c173b98de 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -35,6 +35,7 @@ pub mod http; use std::io; use std::num::{NonZeroU32, NonZeroU64}; +use std::path::PathBuf; use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use std::time::Duration; @@ -366,13 +367,17 @@ pub struct VmSendMigrationData { /// Must be between 1 and `MAX_MIGRATION_CONNECTIONS` inclusive. 
#[serde(default = "VmSendMigrationData::default_connections")] pub connections: NonZeroU32, + /// Path to the directory containing the TLS root CA certificate (ca-cert.pem), the TLS client certificate (client-cert.pem), and TLS client key (client-key.pem). + #[serde(default)] + pub tls_dir: Option, } impl VmSendMigrationData { pub const SYNTAX: &'static str = "VM send migration parameters \ \"destination_url=[,local=on|off,\ downtime_ms=,timeout_s=,\ - timeout_strategy=cancel|ignore,connections=]\""; + timeout_strategy=cancel|ignore,connections=,\ + tls_dir=]\""; // Same as QEMU. pub const DEFAULT_DOWNTIME: Duration = Duration::from_millis(300); @@ -400,7 +405,8 @@ impl VmSendMigrationData { .add("downtime_ms") .add("timeout_s") .add("timeout_strategy") - .add("connections"); + .add("connections") + .add("tls_dir"); parser .parse(migration) .map_err(VmSendMigrationConfigError::ParseError)?; @@ -452,6 +458,10 @@ impl VmSendMigrationData { })?, None => Self::default_connections(), }; + let tls_dir = parser + .convert::("tls_dir") + .map_err(VmSendMigrationConfigError::ParseError)? + .map(|path| PathBuf::from(&path)); let data = Self { destination_url, @@ -460,6 +470,7 @@ impl VmSendMigrationData { timeout_s, timeout_strategy, connections, + tls_dir, }; data.validate()?; @@ -493,6 +504,11 @@ impl VmSendMigrationData { .to_string(), )); } + if self.tls_dir.is_some() { + return Err(VmSendMigrationConfigError::ValidationError( + "UNIX sockets and TLS encryption cannot be used at the same time.".to_string(), + )); + } } else { return Err(VmSendMigrationConfigError::ValidationError( "destination_url must use tcp:: or unix:.".to_string(), @@ -520,6 +536,17 @@ impl VmSendMigrationData { } } + // The TLS implementation checks for all necessary files. Here we only + // check whether the path exists and points to a directory. 
+ if let Some(tls_dir) = &self.tls_dir + && !tls_dir.is_dir() + { + return Err(VmSendMigrationConfigError::ValidationError(format!( + "tls_dir must point to a directory. Path: {}", + tls_dir.display() + ))); + } + Ok(()) } } @@ -1936,12 +1963,14 @@ mod unit_tests { timeout_s: VmSendMigrationData::default_timeout_s(), timeout_strategy: Default::default(), connections: VmSendMigrationData::default_connections(), + tls_dir: None, } ); // Happy path, fully specified + let tls_dir = std::env::temp_dir(); let data = - VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4") + VmSendMigrationData::parse(&format!("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4,tls_dir={}", tls_dir.display())) .unwrap(); assert_eq!( data, @@ -1952,6 +1981,7 @@ mod unit_tests { timeout_s: NonZeroU64::new(900).unwrap(), timeout_strategy: TimeoutStrategy::Ignore, connections: NonZeroU32::new(4).unwrap(), + tls_dir: Some(tls_dir), } ); } diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index e8b72f7484..64ab6f8bf5 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1499,6 +1499,12 @@ components: The number of parallel TCP connections to use for migration. Must be between 1 and 128. Multiple connections are not supported with local UNIX-socket migration. + tls_dir: + type: string + description: > + Directory containing the TLS root CA certificate (ca-cert.pem), the TLS client + certificate (client-cert.pem), and TLS client key (client-key.pem). + TLS is only supported with tcp:: destination URLs. 
VmAddUserDevice: required: From cd1c00db8c71b24f53e2a2ebf2641ac2137274fb Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 15 Apr 2026 10:32:02 +0200 Subject: [PATCH 013/178] vmm: add TLS API option to receive migration call As we now have more than one parameter for the receive migration call, this commit also adds parsing and validation for those parameters. We maintain backwards compatibility by also correctly parsing the case where the caller only provides a URL. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- cloud-hypervisor/src/bin/ch-remote.rs | 18 +-- vmm/src/api/mod.rs | 137 ++++++++++++++++++++++ vmm/src/api/openapi/cloud-hypervisor.yaml | 6 + 3 files changed, 152 insertions(+), 9 deletions(-) diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index 236e7438e0..3fd399a77f 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -71,6 +71,8 @@ enum Error { ReadingFile(#[source] std::io::Error), #[error("Invalid disk size")] InvalidDiskSize(#[source] ByteSizedParseError), + #[error("Error parsing receive migration configuration")] + ReceiveMigrationConfig(#[from] vmm::api::VmReceiveMigrationConfigError), #[error("Error parsing send migration configuration")] SendMigrationConfig(#[from] vmm::api::VmSendMigrationConfigError), } @@ -534,7 +536,7 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .unwrap() .get_one::("receive_migration_config") .unwrap(), - ); + )?; simple_api_command( socket, "PUT", @@ -753,7 +755,7 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) .unwrap() .get_one::("receive_migration_config") .unwrap(), - ); + )?; proxy.api_vm_receive_migration(&receive_migration_data) } Some("create") => { @@ -941,12 +943,10 @@ fn coredump_config(destination_url: &str) -> String { serde_json::to_string(&coredump_config).unwrap() } -fn receive_migration_data(url: &str) -> 
String { - let receive_migration_data = vmm::api::VmReceiveMigrationData { - receiver_url: url.to_owned(), - }; - - serde_json::to_string(&receive_migration_data).unwrap() +fn receive_migration_data(config: &str) -> Result { + let receive_migration_data = + vmm::api::VmReceiveMigrationData::parse(config).map_err(Error::ReceiveMigrationConfig)?; + Ok(serde_json::to_string(&receive_migration_data).unwrap()) } fn send_migration_data(config: &str) -> Result { @@ -1069,7 +1069,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg( Arg::new("receive_migration_config") .index(1) - .help(""), + .help(vmm::api::VmReceiveMigrationData::SYNTAX), ), Command::new("remove-device") .about("Remove VFIO and PCI device") diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 8c173b98de..2ed360eeae 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -267,9 +267,22 @@ pub struct VmCoredumpData { } #[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[cfg_attr(test, derive(PartialEq))] pub struct VmReceiveMigrationData { /// URL for the reception of migration state pub receiver_url: String, + /// Directory containing the TLS server certificate (server-cert.pem), the TLS server key (server-key.pem), and the client TLS root CA certificate (ca-cert.pem). + #[serde(default)] + pub tls_dir: Option, +} + +#[derive(Debug, Error)] +pub enum VmReceiveMigrationConfigError { + #[error("Error parsing receive migration parameters")] + ParseError(#[source] OptionParserError), + + #[error("Error validating receive migration parameters")] + ValidationError(String), } /// Validates the host and port portion of a TCP migration URL. 
@@ -308,6 +321,90 @@ fn validate_tcp_migration_address(address: &str) -> Result<(), String> { Ok(()) } +impl VmReceiveMigrationData { + pub const SYNTAX: &'static str = "VM receive migration parameters \ + \"\" or \"receiver_url=[,tls_dir=]\""; + + pub fn parse(migration: &str) -> Result { + let uses_key_value_syntax = migration.split(',').any( + |part| matches!(part, p if p.starts_with("receiver_url=") || p.starts_with("tls_dir=")), + ); + + if !uses_key_value_syntax { + let data = Self { + receiver_url: migration.to_owned(), + tls_dir: None, + }; + + data.validate()?; + + return Ok(data); + } + + let mut parser = OptionParser::new(); + parser.add("receiver_url").add("tls_dir"); + parser + .parse(migration) + .map_err(VmReceiveMigrationConfigError::ParseError)?; + + let receiver_url = parser.get("receiver_url").ok_or_else(|| { + VmReceiveMigrationConfigError::ParseError(OptionParserError::InvalidSyntax( + "receiver_url is required".to_string(), + )) + })?; + let tls_dir = parser + .convert::("tls_dir") + .map_err(VmReceiveMigrationConfigError::ParseError)? + .map(|path| PathBuf::from(&path)); + + let data = Self { + receiver_url, + tls_dir, + }; + + data.validate()?; + + Ok(data) + } + + pub fn validate(&self) -> Result<(), VmReceiveMigrationConfigError> { + if let Some(addr) = self.receiver_url.strip_prefix("tcp:") { + validate_tcp_migration_address(addr).map_err(|e| { + VmReceiveMigrationConfigError::ValidationError(format!( + "receiver_url must use tcp:: or unix:: {e}." + )) + })?; + } else if self + .receiver_url + .strip_prefix("unix:") + .is_some_and(|path| !path.is_empty()) + { + if self.tls_dir.is_some() { + return Err(VmReceiveMigrationConfigError::ValidationError( + "UNIX sockets and TLS encryption cannot be used at the same time.".to_string(), + )); + } + } else { + return Err(VmReceiveMigrationConfigError::ValidationError( + "receiver_url must use tcp:: or unix:.".to_string(), + )); + } + + // The TLS implementation checks for all necessary files. 
Here we only + // check whether the path exists and points to a directory. + if let Some(tls_dir) = &self.tls_dir + && !tls_dir.is_dir() + { + return Err(VmReceiveMigrationConfigError::ValidationError(format!( + "tls_dir must point to a directory. Path: {}", + tls_dir.display() + ))); + } + + Ok(()) + } +} + #[derive(Copy, Clone, Default, Deserialize, Serialize, Debug, PartialEq, Eq)] /// The migration timeout strategy. /// @@ -1873,6 +1970,46 @@ mod unit_tests { ); } + #[test] + fn test_vm_receive_migration_data_parse() { + let data = VmReceiveMigrationData::parse("tcp:192.168.1.1:8080").unwrap(); + assert_eq!( + data, + VmReceiveMigrationData { + receiver_url: "tcp:192.168.1.1:8080".to_string(), + tls_dir: None, + } + ); + + let data = VmReceiveMigrationData::parse("tcp:[2001:db8::1]:8080").unwrap(); + assert_eq!(data.receiver_url, "tcp:[2001:db8::1]:8080"); + + let data = VmReceiveMigrationData::parse("tcp:destination.example:8080").unwrap(); + assert_eq!(data.receiver_url, "tcp:destination.example:8080"); + + let data = VmReceiveMigrationData::parse("unix:/tmp/ch=migrate.sock").unwrap(); + assert_eq!(data.receiver_url, "unix:/tmp/ch=migrate.sock"); + + let tls_dir = std::env::temp_dir(); + let data = VmReceiveMigrationData::parse(&format!( + "receiver_url=tcp:192.168.1.1:8080,tls_dir={}", + tls_dir.display() + )) + .unwrap(); + assert_eq!( + data, + VmReceiveMigrationData { + receiver_url: "tcp:192.168.1.1:8080".to_string(), + tls_dir: Some(tls_dir), + } + ); + + VmReceiveMigrationData::parse("receiver_url=file:///tmp/migration").unwrap_err(); + VmReceiveMigrationData::parse("tcp:192.168.1.1").unwrap_err(); + VmReceiveMigrationData::parse("tcp:[2001:db8::1]").unwrap_err(); + VmReceiveMigrationData::parse("receiver_url=unix:/tmp/sock,tls_dir=/tmp").unwrap_err(); + } + #[test] fn test_vm_send_migration_data_parse() { // Fully specified diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 64ab6f8bf5..91eb5af245 
100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1452,6 +1452,12 @@ components: properties: receiver_url: type: string + tls_dir: + type: string + description: > + Directory containing the TLS server certificate (server-cert.pem), the TLS + server key (server-key.pem), and the client TLS root CA certificate (ca-cert.pem). + TLS is only supported with tcp:: receiver URLs. TimeoutStrategy: type: string From 5e0484b140f5c832b2bf740170738a334fa2e2c8 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 15 Apr 2026 10:33:58 +0200 Subject: [PATCH 014/178] vmm: encrypt migration data with TLS if configured Wire in the code paths that activate TLS encryption if the necessary API arguments are provided. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 26 +++++++--- vmm/src/migration_transport.rs | 93 +++++++++++++++++++++++++++++++--- 2 files changed, 106 insertions(+), 13 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 2b91244f7a..250ace1f59 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1416,8 +1416,10 @@ impl Vmm { let mut ctx = OngoingMigrationContext::new(); // Set up the socket connection - let mut socket = - migration_transport::send_migration_socket(&send_data_migration.destination_url)?; + let mut socket = migration_transport::send_migration_socket( + &send_data_migration.destination_url, + send_data_migration.tls_dir.as_deref(), + )?; // Start the migration migration_transport::send_request_expect_ok( @@ -1496,6 +1498,7 @@ impl Vmm { let mut mem_send = migration_transport::SendAdditionalConnections::new( &send_data_migration.destination_url, send_data_migration.connections, + send_data_migration.tls_dir.as_deref(), &vm.guest_memory(), )?; @@ -2516,13 +2519,21 @@ impl RequestHandler for Vmm { &mut self, receive_data_migration: VmReceiveMigrationData, ) -> result::Result<(), MigratableError> { + receive_data_migration + .validate() + 
.context("Invalid receive migration configuration") + .map_err(MigratableError::MigrateReceive)?; + info!( - "Receiving migration: receiver_url = {}", - receive_data_migration.receiver_url + "Receiving migration: receiver_url={},tls={}", + receive_data_migration.receiver_url, + receive_data_migration.tls_dir.is_some() ); - let mut listener = - migration_transport::receive_migration_listener(&receive_data_migration.receiver_url)?; + let mut listener = migration_transport::receive_migration_listener( + &receive_data_migration.receiver_url, + receive_data_migration.tls_dir.as_deref(), + )?; // Accept the connection and get the socket let mut socket = listener.accept()?; @@ -2578,9 +2589,10 @@ impl RequestHandler for Vmm { .map_err(MigratableError::MigrateSend)?; info!( - "Sending migration: destination_url={},local={},downtime={}ms,timeout={}s,timeout_strategy={:?}", + "Sending migration: destination_url={},local={},tls={},downtime={}ms,timeout={}s,timeout_strategy={:?}", send_data_migration.destination_url, send_data_migration.local, + send_data_migration.tls_dir.is_some(), send_data_migration.downtime().as_millis(), send_data_migration.timeout().as_secs(), send_data_migration.timeout_strategy diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 87d00681af..d9e3e20077 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -9,7 +9,7 @@ use std::num::NonZeroU32; use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::result::Result; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc::{Receiver, Sender, SyncSender, TrySendError, channel, sync_channel}; @@ -507,6 +507,7 @@ impl SendAdditionalConnections { pub(crate) fn new( destination: &str, connections: NonZeroU32, + tls_dir: Option<&Path>, guest_memory: &GuestMemoryAtomic, ) -> Result { let mut threads = 
Vec::new(); @@ -533,7 +534,7 @@ impl SendAdditionalConnections { // the memory chunks to the workers, but does not send memory anymore. Thus in // this case we create one additional thread for each connection. for n in 0..configured_connections { - let mut socket = send_migration_socket(destination)?; + let mut socket = send_migration_socket(destination, tls_dir)?; let guest_memory = guest_memory.clone(); let message_rx = message_rx.clone(); let worker_error = worker_error.clone(); @@ -779,9 +780,40 @@ fn socket_url_to_path(url: &str) -> Result { .map(|s| s.into()) } +/// Extract the server name from a TCP address. This function assumes that +/// `tcp:` has already been stripped. +fn tcp_address_to_server_name(address: &str) -> Result<&str, anyhow::Error> { + if let Some(rest) = address.strip_prefix('[') { + let (host, port) = rest + .split_once(']') + .ok_or_else(|| anyhow!("Could not extract host from TCP address: {address}"))?; + + if host.is_empty() || !port.starts_with(':') || port.len() == 1 { + return Err(anyhow!( + "Could not extract host from TCP address: {address}" + )); + } + + Ok(host) + } else { + let (host, port) = address + .rsplit_once(':') + .ok_or_else(|| anyhow!("Could not extract host from TCP address: {address}"))?; + + if host.is_empty() || port.is_empty() { + return Err(anyhow!( + "Could not extract host from TCP address: {address}" + )); + } + + Ok(host) + } +} + /// Connect to a migration endpoint and return the established stream. 
pub(crate) fn send_migration_socket( destination_url: &str, + tls_dir: Option<&Path>, ) -> Result { if let Some(address) = destination_url.strip_prefix("tcp:") { info!("Connecting to TCP socket at {address}"); @@ -790,7 +822,18 @@ pub(crate) fn send_migration_socket( MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) })?; - Ok(SocketStream::Tcp(socket)) + if let Some(tls_dir) = tls_dir { + let server_name = tcp_address_to_server_name(address) + .context("Error extracting TLS server name from destination URL") + .map_err(MigratableError::MigrateSend)?; + TlsStream::new_client(socket, tls_dir, server_name) + .map(Box::new) + .map(SocketStream::Tls) + .context("Error creating TLS migration stream") + .map_err(MigratableError::MigrateSend) + } else { + Ok(SocketStream::Tcp(socket)) + } } else { let path = socket_url_to_path(destination_url).map_err(MigratableError::MigrateSend)?; info!("Connecting to UNIX socket at {path:?}"); @@ -806,12 +849,21 @@ pub(crate) fn send_migration_socket( /// Bind a migration listener for the receiver side. 
pub(crate) fn receive_migration_listener( receiver_url: &str, + tls_dir: Option<&Path>, ) -> Result { if let Some(address) = receiver_url.strip_prefix("tcp:") { - TcpListener::bind(address) - .map(ReceiveListener::Tcp) + let listener = TcpListener::bind(address) .context("Error binding to TCP socket") - .map_err(MigratableError::MigrateReceive) + .map_err(MigratableError::MigrateReceive)?; + + if let Some(tls_dir) = tls_dir { + let config = TlsServerConfig::new(tls_dir) + .context("Error creating TLS server config") + .map_err(MigratableError::MigrateReceive)?; + Ok(ReceiveListener::Tls(listener, config)) + } else { + Ok(ReceiveListener::Tcp(listener)) + } } else { let path = socket_url_to_path(receiver_url).map_err(MigratableError::MigrateReceive)?; UnixListener::bind(&path) @@ -972,3 +1024,32 @@ pub(crate) fn receive_memory_ranges( Ok(()) } + +#[cfg(test)] +mod tests { + use super::tcp_address_to_server_name; + + #[test] + fn test_tcp_address_to_server_name() { + assert_eq!( + tcp_address_to_server_name("example.com:1234").unwrap(), + "example.com" + ); + assert_eq!( + tcp_address_to_server_name("192.0.2.1:1234").unwrap(), + "192.0.2.1" + ); + assert_eq!( + tcp_address_to_server_name("[2001:db8::1]:1234").unwrap(), + "2001:db8::1" + ); + } + + #[test] + fn test_tcp_address_to_server_name_rejects_invalid_addresses() { + tcp_address_to_server_name("example.com").unwrap_err(); + tcp_address_to_server_name(":1234").unwrap_err(); + tcp_address_to_server_name("[2001:db8::1]").unwrap_err(); + tcp_address_to_server_name("[2001:db8::1]1234").unwrap_err(); + } +} From ab64813b9c85a44c4e23679a7b680ac449dbfec6 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Fri, 24 Apr 2026 09:15:01 +0200 Subject: [PATCH 015/178] docs: document live migration TLS encryption On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- docs/live_migration.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/live_migration.md 
b/docs/live_migration.md index 81eed06665..023a854f71 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -191,6 +191,45 @@ the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. +#### Encryption + +TCP migration can be protected with TLS by passing `tls_dir={path}` to +both `receive-migration` and `send-migration`. + +The destination host needs a directory containing: + +- `server-cert.pem`: the certificate presented by the destination +- `server-key.pem`: the private key for `server-cert.pem` +- `ca-cert.pem`: the CA certificate used to verify client certificates + +The source host needs a directory containing: + +- `ca-cert.pem`: the CA certificate used to verify the destination + certificate +- `client-cert.pem`: the certificate presented by the source +- `client-key.pem`: the private key for `client-cert.pem` + +Current TCP migration uses mutual TLS (mTLS) authentication. The source +verifies the destination certificate against `ca-cert.pem` and presents +`client-cert.pem` and `client-key.pem`. The destination presents +`server-cert.pem` and `server-key.pem`, and only accepts client +certificates that chain to `ca-cert.pem`. + +Example receiver command: + +```console +dst $ ch-remote --api-socket=/tmp/api receive-migration receiver_url=tcp:0.0.0.0:{port},tls_dir=/path/to/dst-tls +``` + +Example sender command: + +```console +src $ ch-remote --api-socket=/tmp/api send-migration destination_url=tcp:{dst}:{port},tls_dir=/path/to/src-tls +``` + +TLS encryption is only supported with `tcp:{dst}:{port}` migration +URLs, not with local UNIX-socket migration.
+ #### Migration Parameters Cloud Hypervisor supports additional parameters to control the From ce9340ed8987a0d11405e40138e976da9ec6065d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 15 Aug 2025 11:06:59 +0200 Subject: [PATCH 016/178] docs: update README for gardenlinux Cloud Hypervisor Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- README.md | 435 ++++-------------------------------------------------- 1 file changed, 27 insertions(+), 408 deletions(-) diff --git a/README.md b/README.md index 80048ab328..0ee4b3af4b 100644 --- a/README.md +++ b/README.md @@ -1,408 +1,27 @@ -- [1. What is Cloud Hypervisor?](#1-what-is-cloud-hypervisor) - - [Objectives](#objectives) - - [High Level](#high-level) - - [Architectures](#architectures) - - [Guest OS](#guest-os) -- [2. Getting Started](#2-getting-started) - - [Host OS](#host-os) - - [Use Pre-built Binaries](#use-pre-built-binaries) - - [Packages](#packages) - - [Building from Source](#building-from-source) - - [Booting Linux](#booting-linux) - - [Firmware Booting](#firmware-booting) - - [Custom Kernel and Disk Image](#custom-kernel-and-disk-image) - - [Building your Kernel](#building-your-kernel) - - [Disk image](#disk-image) - - [Booting the guest VM](#booting-the-guest-vm) -- [3. Status](#3-status) - - [Hot Plug](#hot-plug) - - [Device Model](#device-model) - - [Roadmap](#roadmap) -- [4. Relationship with _Rust VMM_ Project](#4-relationship-with-rust-vmm-project) - - [Differences with Firecracker and crosvm](#differences-with-firecracker-and-crosvm) -- [5. Community](#5-community) - - [Contribute](#contribute) - - [Slack](#slack) - - [Mailing list](#mailing-list) - - [Security issues](#security-issues) - -# 1. What is Cloud Hypervisor? - -Cloud Hypervisor is an open source Virtual Machine Monitor (VMM) that runs on -top of the [KVM](https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt) -hypervisor and the Microsoft Hypervisor (MSHV). 
- -The project focuses on running modern, _Cloud Workloads_, on specific, common, -hardware architectures. In this case _Cloud Workloads_ refers to those that are -run by customers inside a Cloud Service Provider. This means modern operating -systems with most I/O handled by -paravirtualised devices (e.g. _virtio_), no requirement for legacy devices, and -64-bit CPUs. - -Cloud Hypervisor is implemented in [Rust](https://www.rust-lang.org/) and is -based on the [Rust VMM](https://github.com/rust-vmm) crates. - -## Objectives - -### High Level - -- Runs on KVM or MSHV -- Minimal emulation -- Low latency -- Low memory footprint -- Low complexity -- High performance -- Small attack surface -- 64-bit support only -- CPU, memory, PCI hotplug -- Machine to machine migration - -### Architectures - -Cloud Hypervisor supports the `x86-64`, `AArch64` and `riscv64` -architectures, with functionality varying across these platforms. The -functionality differences between `x86-64` and `AArch64` are documented -in [#1125](https://github.com/cloud-hypervisor/cloud-hypervisor/issues/1125). -The `riscv64` architecture support is experimental and offers limited -functionality. For more details and instructions, please refer to [riscv -documentation](docs/riscv.md). - -### Guest OS - -Cloud Hypervisor supports `64-bit Linux` and Windows 10/Windows Server 2019. - -# 2. Getting Started - -The following sections describe how to build and run Cloud Hypervisor. - -## Prerequisites for AArch64 - -- AArch64 servers (recommended) or development boards equipped with the GICv3 - interrupt controller. - -## Host OS - -For required KVM functionality and adequate performance the recommended host -kernel version is 5.13. The majority of the CI currently tests with kernel -version 5.15. - -## Use Pre-built Binaries - -The recommended approach to getting started with Cloud Hypervisor is by using a -pre-built binary. 
Binaries are available for the [latest -release](https://github.com/cloud-hypervisor/cloud-hypervisor/releases/latest). -Use `cloud-hypervisor-static` for `x86-64` or `cloud-hypervisor-static-aarch64` -for `AArch64` platform. - -## Packages - -For convenience, packages are also available targeting some popular Linux -distributions. This is thanks to the [Open Build -Service](https://build.opensuse.org). The [OBS -README](https://github.com/cloud-hypervisor/obs-packaging) explains how to -enable the repository in a supported Linux distribution and install Cloud Hypervisor -and accompanying packages. Please report any packaging issues in the -[obs-packaging](https://github.com/cloud-hypervisor/obs-packaging) repository. - -## Building from Source - -Please see the [instructions for building from source](docs/building.md) if you -do not wish to use the pre-built binaries. - -## Booting Linux - -Cloud Hypervisor boots guests in one of two ways. The first is direct -kernel boot, where a kernel image is passed to `--kernel`. The x86-64 -kernel must be built with PVH support or be a bzImage. The second is -firmware boot, where a firmware image is passed to `--firmware` and -brings up the guest's normal boot loader. - -Two firmware options are supported, and which one works best depends -on the guest OS. [Rust Hypervisor -Firmware](https://github.com/cloud-hypervisor/rust-hypervisor-firmware) -is a lightweight Rust-based PVH firmware. The edk2 UEFI firmware is -called `CLOUDHV.fd` for x86-64 and `CLOUDHV_EFI.fd` for AArch64. -Prebuilt binaries for both are available at their respective releases -pages, [Rust Hypervisor -Firmware](https://github.com/cloud-hypervisor/rust-hypervisor-firmware/releases/latest) -and [our edk2 -fork](https://github.com/cloud-hypervisor/edk2/releases/latest). -The edk2 fork carries customizations required to boot AArch64 guests -on cloud-hypervisor. See [docs/uefi.md](docs/uefi.md) for differences -with upstream tianocore/edk2. 
- -### Firmware Booting - -Cloud Hypervisor supports booting disk images containing all needed components -to run cloud workloads, a.k.a. cloud images. - -The following sample commands will download an Ubuntu Cloud image, converting -it into a format that Cloud Hypervisor can use and a firmware to boot the image -with. - -```shell -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-amd64.img focal-server-cloudimg-amd64.raw -$ wget https://github.com/cloud-hypervisor/rust-hypervisor-firmware/releases/download/0.4.2/hypervisor-fw -``` - -The Ubuntu cloud images do not ship with a default password so it necessary to -use a `cloud-init` disk image to customise the image on the first boot. A basic -`cloud-init` image is generated by this [script](scripts/create-cloud-init.sh). -This seeds the image with a default username/password of `cloud/cloud123`. It -is only necessary to add this disk image on the first boot. Script also assigns -default IP address using `test_data/cloud-init/ubuntu/local/network-config` details -with `--net "mac=12:34:56:78:90:ab,tap="` option. Then the matching mac address -interface will be enabled as per `network-config` details. - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --firmware ./hypervisor-fw \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -If access to the firmware messages or interaction with the boot loader (e.g. -GRUB) is required then it necessary to switch to the serial console instead of -`virtio-console`. 
- -```shell -$ ./cloud-hypervisor \ - --kernel ./hypervisor-fw \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" \ - --serial tty \ - --console off -``` - -## Booting: `--firmware` vs `--kernel` - -The following scenarios are supported by Cloud Hypervisor to bootstrap a VM, i.e., -to load a payload/bootitem(s): - -- Provide firmware -- Provide kernel \[+ cmdline\]\ [+ initrd\] - -Please note that our Cloud Hypervisor firmware (`hypervisor-fw`) has a Xen PVH -boot entry, therefore it can also be booted via the `--kernel` parameter, as -seen in some examples. - -### Custom Kernel and Disk Image - -#### Building your Kernel - -Cloud Hypervisor also supports direct kernel boot. For x86-64, a `vmlinux` ELF kernel (compiled with PVH support) or a regular bzImage are supported. In order to support development there is a custom branch; however provided the required options are enabled any recent kernel will suffice. - -To build the kernel: - -```shell -# Clone the Cloud Hypervisor Linux branch -$ git clone --depth 1 https://github.com/cloud-hypervisor/linux.git -b ch-6.12.8 linux-cloud-hypervisor -$ pushd linux-cloud-hypervisor -$ make ch_defconfig -# Do native build of the x86-64 kernel -$ KCFLAGS="-Wa,-mx86-used-note=no" make bzImage -j `nproc` -# Do native build of the AArch64 kernel -$ make -j `nproc` -$ popd -``` - -For x86-64, the `vmlinux` kernel image will then be located at -`linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin`. -For AArch64, the `Image` kernel image will then be located at -`linux-cloud-hypervisor/arch/arm64/boot/Image`. - -#### Disk image - -For the disk image the same Ubuntu image as before can be used. This contains -an `ext4` root filesystem. 
- -```shell -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img # x86-64 -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-arm64.img # AArch64 -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-amd64.img focal-server-cloudimg-amd64.raw # x86-64 -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-arm64.img focal-server-cloudimg-arm64.raw # AArch64 -``` - -#### Booting the guest VM - -These sample commands boot the disk image using the custom kernel whilst also -supplying the desired kernel command line. - -- x86-64 - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cmdline "console=hvc0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -- AArch64 - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/arm64/boot/Image \ - --disk path=focal-server-cloudimg-arm64.raw path=/tmp/ubuntu-cloudinit.img \ - --cmdline "console=hvc0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -If earlier kernel messages are required the serial console should be used instead of `virtio-console`. 
- -- x86-64 - -```shell -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin \ - --console off \ - --serial tty \ - --disk path=focal-server-cloudimg-amd64.raw \ - --cmdline "console=ttyS0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -- AArch64 - -```shell -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/arm64/boot/Image \ - --console off \ - --serial tty \ - --disk path=focal-server-cloudimg-arm64.raw \ - --cmdline "console=ttyAMA0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -# 3. Status - -Cloud Hypervisor is under active development. The following stability -guarantees are currently made: - -* The API (including command line options) will not be removed or changed in a - breaking way without a minimum of 2 major releases notice. Where possible - warnings will be given about the use of deprecated functionality and the - deprecations will be documented in the release notes. - -* Point releases will be made between individual releases where there are - substantial bug fixes or security issues that need to be fixed. These point - releases will only include bug fixes. - -Currently the following items are **not** guaranteed across updates: - -* Snapshot/restore is not supported across different versions -* Live migration is not supported across different versions -* The following features are considered experimental and may change - substantially between releases: TDX, vfio-user, vDPA. - -Further details can be found in the [release documentation](docs/releases.md). 
- -As of 2023-01-03, the following cloud images are supported: - -- [Ubuntu Focal](https://cloud-images.ubuntu.com/focal/current/) (focal-server-cloudimg-{amd64,arm64}.img) -- [Ubuntu Jammy](https://cloud-images.ubuntu.com/jammy/current/) (jammy-server-cloudimg-{amd64,arm64}.img) -- [Ubuntu Noble](https://cloud-images.ubuntu.com/noble/current/) (noble-server-cloudimg-{amd64,arm64}.img) -- [Fedora 36](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/) ([Fedora-Cloud-Base-36-1.5.x86_64.raw.xz](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/x86_64/images/) / [Fedora-Cloud-Base-36-1.5.aarch64.raw.xz](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/aarch64/images/)) - -Direct kernel boot to userspace should work with a rootfs from most -distributions although you may need to enable exotic filesystem types in the -reference kernel configuration (e.g. XFS or btrfs.) - -## Hot Plug - -Cloud Hypervisor supports hotplug of CPUs, passthrough devices (VFIO), -`virtio-{net,block,pmem,fs,vsock}` and memory resizing. This -[document](docs/hotplug.md) details how to add devices to a running VM. - -## Device Model - -Details of the device model can be found in this -[documentation](docs/device_model.md). - -## Roadmap - -The project roadmap is tracked through a [GitHub -project](https://github.com/orgs/cloud-hypervisor/projects/6). - -# 4. Relationship with _Rust VMM_ Project - -In order to satisfy the design goal of having a high-performance, -security-focused hypervisor the decision was made to use the -[Rust](https://www.rust-lang.org/) programming language. The language's strong -focus on memory and thread safety makes it an ideal candidate for implementing -VMMs. - -Instead of implementing the VMM components from scratch, Cloud Hypervisor is -importing the [Rust VMM](https://github.com/rust-vmm) crates, and sharing code -and architecture together with other VMMs like e.g. 
Amazon's -[Firecracker](https://firecracker-microvm.github.io/) and Google's -[crosvm](https://chromium.googlesource.com/chromiumos/platform/crosvm/). - -Cloud Hypervisor embraces the _Rust VMM_ project's goals, which is to be able -to share and re-use as many virtualization crates as possible. - -## Differences with Firecracker and crosvm - -A large part of the Cloud Hypervisor code is based on either the Firecracker or -the crosvm project's implementations. Both of these are VMMs written in Rust -with a focus on safety and security, like Cloud Hypervisor. - -The goal of the Cloud Hypervisor project differs from the aforementioned -projects in that it aims to be a general purpose VMM for _Cloud Workloads_ and -not limited to container/serverless or client workloads. - -The Cloud Hypervisor community thanks the communities of both the Firecracker -and crosvm projects for their excellent work. - -# 5. Community - -The Cloud Hypervisor project follows the governance, and community guidelines -described in the [Community](https://github.com/cloud-hypervisor/community) -repository. - -## Contribute - -The project strongly believes in building a global, diverse and collaborative -community around the Cloud Hypervisor project. Anyone who is interested in -[contributing](CONTRIBUTING.md) to the project is welcome to participate. - -Contributing to a open source project like Cloud Hypervisor covers a lot more -than just sending code. Testing, documentation, pull request -reviews, bug reports, feature requests, project improvement suggestions, etc, -are all equal and welcome means of contribution. See the -[CONTRIBUTING](CONTRIBUTING.md) document for more details. 
- -## Slack - -Get an [invite to our Slack channel](https://join.slack.com/t/cloud-hypervisor/shared_invite/enQtNjY3MTE3MDkwNDQ4LWQ1MTA1ZDVmODkwMWQ1MTRhYzk4ZGNlN2UwNTI3ZmFlODU0OTcwOWZjMTkwZDExYWE3YjFmNzgzY2FmNDAyMjI), - [join us on Slack](https://cloud-hypervisor.slack.com/), and [participate in our community activities](https://cloud-hypervisor.slack.com/archives/C04R5DUQVBN). - -## Mailing list - -Please report bugs using the [GitHub issue -tracker](https://github.com/cloud-hypervisor/cloud-hypervisor/issues) but for -broader community discussions you may use our [mailing -list](https://lists.cloudhypervisor.org/g/dev/). - -## Security issues - -Please contact the maintainers listed in the MAINTAINERS.md file with security issues. +# Cloud Hypervisor Fork for SAP gardenlinux + +The `gardenlinux` branch is the branch from which our SAP colleagues +[build][sap-gl-ci] their Cloud Hypervisor packages. + +## Development Model + +- The `gardenlinux` branch is always what SAP builds. From SAP's side, we can + force push or rewrite history on that branch. +- We use branch protection for `gardenlinux`, PRs, CI, and code reviews +- With every new CHV release, we rename `gardenlinux` to `gardenlinux-vXX` and + create a new `gardenlinux` branch manually: + - use release as base and push it into the repo + - cherry-pick all commits from `gardenlinux-vXX` that are still relevant onto a + new branch and create a pull request against this fork + - adapt git commit history +- PoC Development: + - happens here (in [cyberus-technology/cloud-hypervisor](https://github.com/cyberus-technology/cloud-hypervisor)) + - open PR against `gardenlinux` + - Branch names **must not** follow the `gardenlinux-*` pattern + - We recommend `cyberus-fork-*` as branch pattern to better keep the overview.
+- Productization: + - happens upstream (in [cloud-hypervisor/cloud-hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor)) + - We recommend `productize-*` as branch pattern to better keep the overview. + + +[sap-gl-ci]: https://github.com/gardenlinux/package-cloud-hypervisor-gl/blob/main/prepare_source#L1 From 41c8341430a3b76bfb16061a3a1fe7812168a7dc Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 30 Mar 2026 09:51:04 +0200 Subject: [PATCH 017/178] misc: adjust AGENTS.md for our fork On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- AGENTS.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 94ad277331..2c6a5d4bf5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,6 +4,10 @@ This is a compact [AGENTS.md](https://agents.md/) file for Cloud Hypervisor. It is meant to help automated coding agents make useful changes that stay safe, reviewable, and compatible with the project's normal engineering constraints. +This checkout is a Cyberus Technology fork of Cloud Hypervisor. It is maintained +independently from upstream, while still following upstream contribution and +code-quality guidance unless fork-specific requirements say otherwise. + ## For LLMs ### Project Context @@ -14,6 +18,9 @@ reviewable, and compatible with the project's normal engineering constraints. - The main supported architectures are `x86_64` and `aarch64`; the main hypervisor backends are KVM and MSHV. `x86_64` with KVM gets the most regular exercise, but changes must not make the other first-class targets worse. +- Treat live migration and the vCPU lifecycle as first-class production areas. + Preserve deterministic state transfer, robust failure handling, correct device + and memory state, and explicit race-free vCPU state transitions. ### Change Guidelines @@ -30,6 +37,11 @@ reviewable, and compatible with the project's normal engineering constraints. 
- Preserve existing behavior unless the requested change explicitly needs a behavior change; refactors must preserve behavior. Call out compatibility or migration implications. +- Prefer simple solutions over unnecessary traits, excessive indirection, or + premature abstraction. +- Prefer `Result` over panics for recoverable production-path errors. Handle + syscall and KVM ioctl return values explicitly and include useful context in + error messages. - Do not invent APIs, behavior, or requirements. If something is uncertain, state the uncertainty and proceed only with minimal, explicit assumptions. @@ -39,6 +51,8 @@ reviewable, and compatible with the project's normal engineering constraints. comment with the invariants, and make sure the surrounding code upholds them. - Assume concurrency matters. Avoid races, unsynchronized shared state, and implicit ordering assumptions; prefer clear ownership and synchronization. +- Keep KVM code aligned with the kernel API. Do not rely on undocumented + behavior or ignore backend-specific failure modes. - Keep docs and comments short and useful. Document non-trivial invariants at struct definitions and critical state transitions. - Logging should be minimal and high signal. Use `info!` for important normal @@ -64,6 +78,11 @@ reviewable, and compatible with the project's normal engineering constraints. these code paths; otherwise the integration-test code is not included. Do not assume the tests can be run directly in a restricted agent environment; ask the developer to run them when real integration coverage is needed. +- For broader VM behavior, this fork also uses an external `libvirt-tests` suite + outside this repository. If a change likely needs that coverage, say so and + ask whether it should be run, skipped, or handled manually by the developer. + Only run it yourself if the developer provides the necessary instructions and + access details. 
### Commit and Patch Formatting @@ -78,4 +97,15 @@ reviewable, and compatible with the project's normal engineering constraints. - Temporary allowances such as `#[allow(unused)]` or ignored tests are only acceptable if resolved within the same commit series or paired with a clear TODO referencing a ticket. Ask the developer if in doubt. +- Commits need an `On-behalf-of: SAP $firstname.$lastname@sap.com` trailer, e.g.: + ``` + $component: $summary + + $body + On-behalf-of: SAP philipp.schuster@sap.com + Signed-off-by: Philipp Schuster + ``` + as our work is sponsored by SAP, which gets its money from the EU (Apeiro + project). The enforcing CI rule is in + `./scripts/gitlint/rules/on-behalf-of-marker.py` From 53be0f710b48094692c24d4f7c85fe6e790d3e48 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 30 Oct 2025 14:34:25 +0100 Subject: [PATCH 018/178] ci: enforce SAP commit style To check gitlint locally, one can run: gitlint --commits "HEAD~2..HEAD" which for example checks the last two commits. Although this is just our kinda private (but public) fork, people might cherry-pick commits from us for whatever reason. So we should have proper commit style. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- scripts/gitlint/rules/on-behalf-of-marker.py | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 scripts/gitlint/rules/on-behalf-of-marker.py diff --git a/scripts/gitlint/rules/on-behalf-of-marker.py b/scripts/gitlint/rules/on-behalf-of-marker.py new file mode 100644 index 0000000000..d08e334b17 --- /dev/null +++ b/scripts/gitlint/rules/on-behalf-of-marker.py @@ -0,0 +1,36 @@ +from gitlint.rules import LineRule, RuleViolation, CommitMessageTitle, CommitRule + +class BodyContainsOnBehalfOfSAPMarker(CommitRule): + """Enforce that each commit coming from an SAP contractor contains an + "On-behalf-of: SAP user@sap.com" marker.
+ """ + + # A rule MUST have a human friendly name + name = "body-requires-on-behalf-of-sap" + + # A rule MUST have a *unique* id + # We recommend starting with UC (for User-defined Commit-rule). + id = "UC-sap" + + # Lower-case list of contractors + contractors = [ + "@cyberus-technology.de" + ] + + # Marker followed by " name.surname@sap.com" + marker = "On-behalf-of: SAP" + + def validate(self, commit): + if "@sap.com" in commit.author_email.lower(): + return + + # Allow third-party open-source contributions + if not any(contractor in commit.author_email.lower() for contractor in self.contractors): + return + + for line in commit.message.body: + if line.startswith(self.marker) and "@sap.com" in line.lower(): + return + + msg = f"Body does not contain a '{self.marker} user@sap.com' line" + return [RuleViolation(self.id, msg, line_nr=1)] From 6f07b15576b9a17e140b70ed3d85b494aff68d91 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 28 Apr 2026 21:09:55 +0200 Subject: [PATCH 019/178] ci: remove irrelevant CI for this fork Remove irrelevant/annoying CI here to accelerate development. Further, we don't have the runners to run the integration tests, but at least we want to run the unit tests, clippy, etc. 
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- .github/workflows/ci.yaml | 272 --------------------- .github/workflows/docker-image.yaml | 65 ----- .github/workflows/integration-metrics.yaml | 22 -- .github/workflows/mshv-infra.yaml | 246 ------------------- .github/workflows/mshv-integration.yaml | 129 ---------- .github/workflows/release.yaml | 95 ------- 6 files changed, 829 deletions(-) delete mode 100644 .github/workflows/docker-image.yaml delete mode 100644 .github/workflows/integration-metrics.yaml delete mode 100644 .github/workflows/mshv-infra.yaml delete mode 100644 .github/workflows/mshv-integration.yaml delete mode 100644 .github/workflows/release.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e080c3fef9..34ee87f942 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -106,44 +106,6 @@ jobs: - name: Lint git commit messages run: | gitlint --commits "origin/$GITHUB_BASE_REF.." - lychee: - name: lychee - needs: [preflight] - if: needs.preflight.outputs.docs == 'true' || needs.preflight.outputs.full == 'true' - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Get changed files in PR - id: changed-files - uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6 - with: - base_sha: ${{ github.event.pull_request.base.sha }} - - name: Verify Changed Files - run: | - set -eufo pipefail - echo "--- tj-actions/changed-files Outputs ---" - echo "any_changed: ${{ steps.changed-files.outputs.any_changed }}" - echo "all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}" - echo "added_files: ${{ steps.changed-files.outputs.added_files }}" - echo "modified_files: ${{ steps.changed-files.outputs.modified_files }}" - echo "deleted_files: ${{ steps.changed-files.outputs.deleted_files }}" - echo "renamed_files: ${{ steps.changed-files.outputs.renamed_files }}" - echo 
"----------------------------------------" - if [ -n "${{ steps.changed-files.outputs.all_changed_files }}" ]; then - echo "Detected changes: all_changed_files output is NOT empty." - else - echo "No changes detected: all_changed_files output IS empty." - fi - - name: Link Availability Check (Diff Only) - if: ${{ steps.changed-files.outputs.all_changed_files != '' }} - uses: lycheeverse/lychee-action@8646ba30535128ac92d33dfc9133794bfdd9b411 # v2.8.0 - with: - args: --verbose --config .lychee.toml ${{ steps.changed-files.outputs.all_changed_files }} - failIfEmpty: false - fail: true taplo: name: taplo needs: [preflight] @@ -314,16 +276,10 @@ jobs: fail-fast: false matrix: rust: - - beta - stable target: - - aarch64-unknown-linux-gnu - - aarch64-unknown-linux-musl - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl include: - - rust: beta - experimental: true - rust: stable experimental: false steps: @@ -462,12 +418,8 @@ jobs: matrix: rust: - stable - - beta - - nightly - - "1.89.0" # MSRV — keep quoted. target: - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl steps: - name: Code checkout uses: actions/checkout@v6 @@ -510,223 +462,6 @@ jobs: run: cargo build --locked --all --release --target=${{ matrix.target }} - name: Check build did not modify any files run: test -z "$(git status --porcelain)" - # garm-jammy + gnu: runs on PR and MQ. Other 3 matrix entries are in - # integration-x86-64-mq (sibling, MQ-only, runs in parallel). - integration-x86-64-pr: - name: integration-x86-64-pr - needs: [preflight, dco, quality, build] - if: >- - needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - timeout-minutes: 80 - env: - # Our runner has 16 cores (nproc). - # We limit parallelism only to avoid exhausting disk space and memory - # resources, not to save CPU resources. 
- PARALLEL_INTEGRATION_TESTS_NUM: 12 - runs-on: garm-jammy-16 - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - run: | - set -eufo pipefail - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Prepare for VDPA - run: scripts/prepare_vdpa.sh - - name: Run unit tests - run: scripts/dev_cli.sh tests --unit --libc gnu - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc gnu - # MQ-only: the 3 matrix entries that integration-x86-64-pr does not cover. - integration-x86-64-mq: - name: integration-x86-64-mq - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - timeout-minutes: 80 - env: - # Our runner has 16 cores (nproc). - # We limit parallelism only to avoid exhausting disk space and memory - # resources, not to save CPU resources. - PARALLEL_INTEGRATION_TESTS_NUM: 12 - strategy: - fail-fast: false - matrix: - include: - - {runner: garm-jammy, libc: musl} - - {runner: garm-jammy-amd, libc: gnu} - - {runner: garm-jammy-amd, libc: musl} - # format() because `${{ matrix.runner }}-16` is not valid in runs-on. 
- runs-on: ${{ format('{0}-16', matrix.runner) }} - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - run: | - set -eufo pipefail - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Prepare for VDPA - run: scripts/prepare_vdpa.sh - - name: Run unit tests - run: scripts/dev_cli.sh tests --unit --libc ${{ matrix.libc }} - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc ${{ matrix.libc }} - integration-arm64: - name: integration-arm64 - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - timeout-minutes: 120 - env: - # Our runner has 80 cores (nproc). - # We limit parallelism only to avoid exhausting disk space and memory - # resources, not to save CPU resources. - PARALLEL_INTEGRATION_TESTS_NUM: 25 - runs-on: bookworm-arm64 - steps: - # arm64 runner user is "runner" (vfio's is "github-runner"). 
- - name: Fix workspace permissions - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run unit tests (musl) - run: scripts/dev_cli.sh tests --unit --libc musl - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests (musl) - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc musl - - name: Install Azure CLI - run: | - set -eufo pipefail - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=arm64] https://packages.microsoft.com/repos/azure-cli/ bookworm main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - shell: bash - run: | - set -eufo pipefail - IMG_BASENAME=windows-11-iot-enterprise-aarch64.raw - IMG_PATH=$HOME/workloads/$IMG_BASENAME - IMG_GZ_PATH=$HOME/workloads/$IMG_BASENAME.gz - IMG_GZ_BLOB_NAME=windows-11-iot-enterprise-aarch64-9-min.raw.gz - cp "scripts/$IMG_BASENAME.sha1" "$HOME/workloads/" - pushd "$HOME/workloads" - if sha1sum "$IMG_BASENAME.sha1" --check; then - exit - fi - popd - mkdir -p "$HOME/workloads" - az storage blob download --container-name private-images --file "$IMG_GZ_PATH" --name "$IMG_GZ_BLOB_NAME" --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - gzip -d "$IMG_GZ_PATH" - - name: Run Windows guest integration tests - timeout-minutes: 30 - run: scripts/dev_cli.sh tests --integration-windows --libc musl - integration-vfio: - name: integration-vfio - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - runs-on: vfio-nvidia - env: - 
AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - # vfio-nvidia runner user is "github-runner" (not "runner" like arm64). - - name: Fix workspace permissions - run: sudo chown -R github-runner:github-runner "${GITHUB_WORKSPACE}" - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run VFIO integration tests - timeout-minutes: 25 - run: scripts/dev_cli.sh tests --integration-vfio - # Most tests are failing with musl, see #6790 - # - name: Run VFIO integration tests for musl - # timeout-minutes: 25 - # run: scripts/dev_cli.sh tests --integration-vfio --libc musl - integration-windows: - name: integration-windows - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - runs-on: garm-jammy-16 - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - run: | - set -eufo pipefail - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Install Azure CLI - run: | - set -eufo pipefail - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ jammy main" 
| sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - run: | - set -eufo pipefail - mkdir $HOME/workloads - az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2025-amd64-1.raw" --name windows-server-2025-amd64-1.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - - name: Run Windows guest integration tests - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows - - name: Run Windows guest integration tests for musl - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows --libc musl - integration-rate-limiter: - name: integration-rate-limiter - needs: [preflight, dco, quality, build] - if: >- - github.event_name == 'merge_group' && needs.preflight.outputs.full == 'true' && needs.dco.result == 'success' && needs.quality.result == 'success' && needs.build.result == 'success' - runs-on: bare-metal-9950x - env: - AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run rate-limiter integration tests - timeout-minutes: 20 - run: scripts/dev_cli.sh tests --integration-rate-limiter # The single required-status check. Branch protection requires this one job. 
all-green: name: all-green @@ -738,13 +473,6 @@ jobs: - fuzz-build - gitlint - hadolint - - integration-arm64 - # VFIO worker is failing #8160 - # - integration-vfio - # See: #8211 - # - integration-windows - - integration-x86-64-mq - - integration-x86-64-pr - openapi - package-consistency - preflight diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml deleted file mode 100644 index 8636d35f00..0000000000 --- a/.github/workflows/docker-image.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: Cloud Hypervisor's Docker image update -on: - push: - branches: main - paths: resources/Dockerfile - pull_request: - paths: resources/Dockerfile -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -jobs: - main: - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v4 - - - name: Login to ghcr - uses: docker/login-action@v4 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - - name: Docker meta - id: meta - uses: docker/metadata-action@v6 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - # generate Docker tags based on the following events/attributes - tags: | - type=raw,value=20251114-0 - type=sha - - - name: Build and push - if: ${{ github.event_name == 'push' }} - uses: docker/build-push-action@v7 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: ${{ steps.meta.outputs.tags }} - - - name: Build only - if: ${{ github.event_name == 'pull_request' }} - uses: docker/build-push-action@v7 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - - - 
name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/integration-metrics.yaml b/.github/workflows/integration-metrics.yaml deleted file mode 100644 index 4e66f4b614..0000000000 --- a/.github/workflows/integration-metrics.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Cloud Hypervisor Tests (Metrics) -on: - push: - branches: - - main - -jobs: - build: - name: Tests (Metrics) - runs-on: bare-metal-9950x - env: - METRICS_PUBLISH_KEY: ${{ secrets.METRICS_PUBLISH_KEY }} - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run metrics tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --metrics -- --test-exclude micro_ -- --report-file /root/workloads/metrics.json - - name: Upload metrics report - run: 'curl -X PUT https://ch-metrics.azurewebsites.net/api/publishmetrics -H "x-functions-key: $METRICS_PUBLISH_KEY" -T ~/workloads/metrics.json' diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml deleted file mode 100644 index 89cb5f6fbc..0000000000 --- a/.github/workflows/mshv-infra.yaml +++ /dev/null @@ -1,246 +0,0 @@ -name: MSHV Infra Setup -on: - workflow_call: - inputs: - ARCH: - description: 'Architecture for the VM' - required: true - type: string - KEY: - description: 'SSH Key Name' - required: true - type: string - OS_DISK_SIZE: - description: 'OS Disk Size in GB' - required: true - type: number - RG: - description: 'Resource Group Name' - required: true - type: string - VM_SKU: - description: 'VM SKU' - required: true - type: string - secrets: - MI_CLIENT_ID: - required: true - RUNNER_RG: - required: true - STORAGE_ACCOUNT_PATHS: - required: true - ARCH_SOURCE_PATH: - required: true - USERNAME: - required: true - outputs: - RG_NAME: - description: 'Resource group of the VM' - value: ${{ jobs.infra-setup.outputs.RG_NAME }} - VM_NAME: - description: 'Name of the VM' - value: ${{ jobs.infra-setup.outputs.VM_NAME }} - PRIVATE_IP: - 
description: 'Private IP of the VM' - value: ${{ jobs.infra-setup.outputs.PRIVATE_IP }} -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true -jobs: - infra-setup: - name: ${{ inputs.ARCH }} VM Provision - runs-on: mshv - outputs: - RG_NAME: ${{ steps.rg-setup.outputs.RG_NAME }} - VM_NAME: ${{ steps.vm-setup.outputs.VM_NAME }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - steps: - - name: Install & login to AZ CLI - env: - MI_CLIENT_ID: ${{ secrets.MI_CLIENT_ID }} - run: | - set -eufo pipefail - echo "Installing Azure CLI if not already installed" - if ! command -v az &>/dev/null; then - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - else - echo "Azure CLI already installed" - fi - az --version - echo "Logging into Azure CLI using Managed Identity" - az login --identity --client-id "${MI_CLIENT_ID}" - - - name: Get Location - id: get-location - env: - SKU: ${{ inputs.VM_SKU }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -eufo pipefail - # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2) - if ! 
[[ "$SKU" =~ ^Standard_[A-Za-z]+([1-9][0-9]*) ]]; then - printf 'Cannot extract vCPU count from SKU: %q\n' "$SKU" - exit 1 - fi - vcpu=${BASH_REMATCH[1]} - - SUPPORTED_LOCATIONS=$(echo "$STORAGE_ACCOUNT_PATHS" | jq -r 'to_entries[] | .key') - - for location in $SUPPORTED_LOCATIONS; do - family=$(az vm list-skus --size "$SKU" --location "$location" --resource-type "virtualMachines" --query '[0].family' -o tsv) - if [[ -z "$family" ]]; then - echo "Cannot determine VM family for SKU: $SKU in $location" - continue - fi - - remaining=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json | - jq '(.limit | tonumber) - (.currentValue | tonumber) >= ($ARGS.positional[0] | tonumber)' --jsonargs "$vcpu") - if [[ "$remaining" = true ]]; then - echo "Sufficient quota found in $location" - echo "location=$location" >> "$GITHUB_OUTPUT" - exit 0 - fi - done - - echo "No location found with sufficient vCPU quota for SKU: $SKU" - exit 1 - - - name: Create Resource Group - id: rg-setup - env: - LOCATION: ${{ steps.get-location.outputs.location }} - RG: ${{ inputs.RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -eufo pipefail - echo "Creating Resource Group: $RG" - # Create the resource group - echo "Creating resource group in location: ${LOCATION}" - az group create --name "${RG}" --location "${LOCATION}" - echo "RG_NAME=${RG}" >> $GITHUB_OUTPUT - echo "Resource group created successfully." 
- - - name: Generate SSH Key - id: generate-ssh-key - env: - KEY: ${{ inputs.KEY }} - run: | - set -eufo pipefail - echo "Generating SSH key: $KEY" - mkdir -p ~/.ssh - ssh-keygen -t rsa -b 4096 -f ~/.ssh/"${KEY}" -N "" - - - name: Create VM - id: vm-setup - env: - KEY: ${{ inputs.KEY }} - LOCATION: ${{ steps.get-location.outputs.location }} - OS_DISK_SIZE: ${{ inputs.OS_DISK_SIZE }} - RG: ${{ inputs.RG }} - RUNNER_RG: ${{ secrets.RUNNER_RG }} - USERNAME: ${{ secrets.USERNAME }} - VM_SKU: ${{ inputs.VM_SKU }} - VM_IMAGE_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_image - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -eufo pipefail - echo "Creating $VM_SKU VM: $VM_NAME" - - # Extract subnet ID from the runner VM - echo "Retrieving subnet ID..." - SUBNET_ID=$(az network vnet list --resource-group "$RUNNER_RG" --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") - if [[ -z "${SUBNET_ID}" ]]; then - echo "ERROR: Failed to retrieve Subnet ID." - exit 1 - fi - - # Extract image ID from the runner VM - echo "Retrieving image ID..." - IMAGE_ID=$(az image show --resource-group "$RUNNER_RG" --name "$VM_IMAGE_NAME" --query "id" -o tsv) - if [[ -z "${IMAGE_ID}" ]]; then - echo "ERROR: Failed to retrieve Image ID." - exit 1 - fi - - # Create VM - az vm create \ - --resource-group "${RG}" \ - --name "${VM_NAME}" \ - --subnet "${SUBNET_ID}" \ - --size "${VM_SKU}" \ - --location "${LOCATION}" \ - --image "${IMAGE_ID}" \ - --os-disk-size-gb "${OS_DISK_SIZE}" \ - --public-ip-sku Standard \ - --storage-sku Premium_LRS \ - --public-ip-address "" \ - --admin-username "${USERNAME}" \ - --ssh-key-value ~/.ssh/"${KEY}".pub \ - --security-type Standard \ - --output json - - az vm boot-diagnostics enable --name "${VM_NAME}" --resource-group "${RG}" - - echo "VM_NAME=${VM_NAME}" >> "$GITHUB_OUTPUT" - echo "VM creation process completed successfully." 
- - - name: Get VM Private IP - id: get-vm-ip - env: - RG: ${{ inputs.RG }} - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -eufo pipefail - echo "Retrieving VM Private IP address..." - # Retrieve VM Private IP address - PRIVATE_IP=$(az vm show -g "${RG}" -n "${VM_NAME}" -d --query privateIps -o tsv) - if [[ -z "$PRIVATE_IP" ]]; then - echo "ERROR: Failed to retrieve private IP address." - exit 1 - fi - echo "PRIVATE_IP=$PRIVATE_IP" >> "$GITHUB_OUTPUT" - - - name: Wait for SSH availability - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - echo "Waiting for SSH to be accessible..." - timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/"${KEY}" -- "${USERNAME}@${PRIVATE_IP}" "exit" 2>/dev/null; do sleep 5; done' - echo "VM is accessible!" - - - name: Remove Old Host Key - env: - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - run: | - set -eufo pipefail - echo "Removing the old host key" - ssh-keygen -R "$PRIVATE_IP" - - - name: SSH into VM and Install Dependencies - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - set -eufo pipefail - ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF - set -eufo pipefail - echo "Logged in successfully." - echo "Installing dependencies..." - sudo tdnf install -y git moby-engine moby-cli clang llvm pkg-config make gcc glibc-devel - echo "Installing Rust..." 
- curl -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable --profile default -y - export PATH="\$HOME/.cargo/bin:\$PATH" - cargo --version - sudo mkdir -p /etc/docker/ - echo '{"default-ulimits":{"nofile":{"Hard":65535,"Name":"nofile","Soft":65535}}}' | sudo tee /etc/docker/daemon.json - sudo systemctl stop docker - sudo systemctl enable docker.service - sudo systemctl enable containerd.service - sudo systemctl start docker - sudo groupadd -f docker - sudo usermod -a -G docker "${USERNAME}" - sudo systemctl restart docker - EOF diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml deleted file mode 100644 index 437cf44f6c..0000000000 --- a/.github/workflows/mshv-integration.yaml +++ /dev/null @@ -1,129 +0,0 @@ -name: Cloud Hypervisor Tests (MSHV) (x86_64) -on: [pull_request_target, merge_group] -permissions: {} - -jobs: - infra-setup: - name: MSHV Infra Setup (x86_64) - uses: ./.github/workflows/mshv-infra.yaml - with: - ARCH: x86_64 - KEY: azure_key_${{ github.run_id }} - OS_DISK_SIZE: 512 - RG: MSHV-INTEGRATION-${{ github.run_id }} - VM_SKU: Standard_D16s_v5 - secrets: - MI_CLIENT_ID: ${{ secrets.MSHV_MI_CLIENT_ID }} - RUNNER_RG: ${{ secrets.MSHV_RUNNER_RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.MSHV_STORAGE_ACCOUNT_PATHS }} - ARCH_SOURCE_PATH: ${{ secrets.MSHV_X86_SOURCE_PATH }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - - run-tests: - name: Integration Tests (x86_64) - needs: infra-setup - if: ${{ always() && needs.infra-setup.result == 'success' }} - runs-on: mshv - steps: - - name: Run integration tests - timeout-minutes: 60 - env: - KEY: azure_key_${{ github.run_id }} - PR_NUMBER: ${{ github.event.pull_request.number }} - REPO_URL: https://github.com/cloud-hypervisor/cloud-hypervisor.git - REPO_DIR: cloud-hypervisor - PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} - RG: MSHV-${{ github.run_id }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - run: | - set -eufo pipefail - echo "Connecting to the VM via 
SSH..." - ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF - set -e - echo "Logged in successfully." - export PATH="\$HOME/.cargo/bin:\$PATH" - - if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then - git clone --depth 1 "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - git fetch origin pull/${{ github.event.pull_request.number }}/merge - git checkout FETCH_HEAD - else - git clone --depth 1 --single-branch --branch "${{ github.ref_name }}" "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - fi - - echo "Loading VDPA kernel modules..." - sudo modprobe vdpa - sudo modprobe vhost_vdpa - sudo modprobe vdpa_sim - sudo modprobe vdpa_sim_blk - sudo modprobe vdpa_sim_net - - echo "Creating VDPA devices..." - sudo vdpa dev add name vdpa-blk0 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_net - - echo "Setting permissions..." - for i in 0 1 2; do - dev="/dev/vhost-vdpa-\$i" - if [ -e "\$dev" ]; then - sudo chown \$USER:\$USER "\$dev" - sudo chmod 660 "\$dev" - else - echo "Warning: Device \$dev not found" - fi - done - - sudo ./scripts/dev_cli.sh tests --hypervisor mshv --integration - EOF - - - name: Dump dmesg - if: always() - continue-on-error: true - env: - KEY: azure_key_${{ github.run_id }} - PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - run: | - ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" sudo dmesg - - - name: Dump serial console logs - if: always() - continue-on-error: true - env: - RG_NAME: ${{ needs.infra-setup.outputs.RG_NAME }} - VM_NAME: ${{ needs.infra-setup.outputs.VM_NAME }} - run: | - set -eufo pipefail - az vm boot-diagnostics get-boot-log --name "${VM_NAME}" --resource-group "${RG_NAME}" | jq -r - - cleanup: - name: Cleanup - needs: run-tests - if: always() - runs-on: mshv - steps: - - name: Delete RG - env: - RG: MSHV-INTEGRATION-${{ github.run_id }} 
- run: | - if az group exists --name "${RG}"; then - az group delete --name "${RG}" --yes --no-wait - else - echo "Resource Group ${RG} does not exist. Skipping deletion." - fi - echo "Cleanup process completed." - - - name: Delete SSH Key - env: - KEY: azure_key_${{ github.run_id }} - run: | - if [ -f ~/.ssh/"${KEY}" ]; then - rm -f ~/.ssh/"${KEY}" ~/.ssh/"${KEY}.pub" - echo "SSH key deleted successfully." - else - echo "SSH key does not exist. Skipping deletion." - fi - echo "Cleanup process completed." diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index bc7c3e152e..0000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: Cloud Hypervisor Release -on: [create, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true -env: - GITHUB_TOKEN: ${{ github.token }} - -jobs: - release: - if: (github.event_name == 'create' && github.event.ref_type == 'tag') || github.event_name == 'merge_group' - name: Release ${{ matrix.platform.target }} - strategy: - fail-fast: false - matrix: - platform: - - target: x86_64-unknown-linux-gnu - args: --all --release --features mshv - name_ch: cloud-hypervisor - name_ch_remote: ch-remote - - target: x86_64-unknown-linux-musl - args: --all --release --features mshv - name_ch: cloud-hypervisor-static - name_ch_remote: ch-remote-static - - target: aarch64-unknown-linux-musl - args: --all --release - name_ch: cloud-hypervisor-static-aarch64 - name_ch_remote: ch-remote-static-aarch64 - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - - name: Install musl-gcc - if: contains(matrix.platform.target, 'musl') - run: sudo apt install -y musl-tools - - name: Create release directory - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: rsync 
-rv --exclude=.git . ../cloud-hypervisor-${{ github.event.ref }} - - name: Build ${{ matrix.platform.target }} - uses: houseabsolute/actions-rust-cross@v1 - with: - command: build - target: ${{ matrix.platform.target }} - args: ${{ matrix.platform.args }} - strip: true - toolchain: "1.89.0" - - name: Copy Release Binaries - if: github.event_name == 'create' && github.event.ref_type == 'tag' - shell: bash - run: | - cp target/${{ matrix.platform.target }}/release/cloud-hypervisor ./${{ matrix.platform.name_ch }} - cp target/${{ matrix.platform.target }}/release/ch-remote ./${{ matrix.platform.name_ch_remote }} - - name: Upload Release Artifacts - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: actions/upload-artifact@v7 - with: - name: Artifacts for ${{ matrix.platform.target }} - path: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - - name: Vendor - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - working-directory: ../cloud-hypervisor-${{ github.event.ref }} - run: | - mkdir ../vendor-cargo-home - export CARGO_HOME=$(realpath ../vendor-cargo-home) - mkdir .cargo - cargo vendor > .cargo/config.toml - - name: Create vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: tar cJf cloud-hypervisor-${{ github.event.ref }}.tar.xz ../cloud-hypervisor-${{ github.event.ref }} - - name: Upload cloud-hypervisor vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - id: upload-release-cloud-hypervisor-vendored-sources - uses: actions/upload-artifact@v7 - with: - path: cloud-hypervisor-${{ github.event.ref }}.tar.xz - name: cloud-hypervisor-${{ github.event.ref }}.tar.xz - - name: Create GitHub Release - if: github.event_name == 
'create' && github.event.ref_type == 'tag' - uses: softprops/action-gh-release@v3 - with: - draft: true - files: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - ./cloud-hypervisor-${{ github.event.ref }}.tar.xz From f83009efa02ca1317cb9aa9f60a8ba80ba71fc25 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 28 Jan 2026 13:50:22 +0100 Subject: [PATCH 020/178] build: flake: init Adds a flake configuration that enables building Cloud Hypervisor directly from this repository using Nix. This makes it possible to deploy and test Cloud Hypervisor on NixOS systems in real environments. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- .envrc | 1 + .reuse/dep5 | 2 +- chv.nix | 65 ++++++++++++++++++++++++++++++++++++++++++ flake.lock | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++ flake.nix | 76 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 .envrc create mode 100644 chv.nix create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..3550a30f2d --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.reuse/dep5 b/.reuse/dep5 index 0e17b4b7e2..e624ecf662 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -7,6 +7,6 @@ Files: docs/*.md *.md Copyright: 2024 License: CC-BY-4.0 -Files: scripts/* test_data/* *.toml .git* .editorconfig fuzz/Cargo.lock fuzz/.gitignore resources/linux-config-* vmm/src/api/openapi/cloud-hypervisor.yaml CODEOWNERS Cargo.lock +Files: scripts/* test_data/* *.toml .git* .editorconfig fuzz/Cargo.lock fuzz/.gitignore resources/linux-config-* vmm/src/api/openapi/cloud-hypervisor.yaml CODEOWNERS Cargo.lock flake.nix flake.lock chv.nix .envrc Copyright: 2024 License: Apache-2.0 diff --git a/chv.nix b/chv.nix new file mode 100644 index 0000000000..206888d1e1 --- /dev/null +++ b/chv.nix @@ -0,0 +1,65 @@ +# Builds Cloud Hypervisor with using 
crane. +# +# Uses a pragmatic release profile with debug-ability and faster +# compilation times in mind without sacrificing too much performance. + +{ + # helper from nixpkgs + lib, + openssl, + pkg-config, + # other helper + craneLib, + # other + meta, # meta of pkgs.cloud-hypervisor + src, # clean source +}: +let + commonArgs = { + inherit meta src; + # Since Nov 2025 (v50), Cloud Hypervisor has a virtual manifest and the + # main package was moved into a sub directory. + cargoToml = "${src}/cloud-hypervisor/Cargo.toml"; + + # Pragmatic release profile with debug-ability and faster + # compilation times in mind. + env = { + CARGO_PROFILE_RELEASE_DEBUG_ASSERTIONS = "true"; + CARGO_PROFILE_RELEASE_OPT_LEVEL = 2; + CARGO_PROFILE_RELEASE_OVERFLOW_CHECKS = "true"; + CARGO_PROFILE_RELEASE_LTO = "thin"; + + # Fix build. Reference: + # - https://github.com/sfackler/rust-openssl/issues/1430 + # - https://docs.rs/openssl/latest/openssl/ + OPENSSL_NO_VENDOR = true; + }; + + nativeBuildInputs = [ + pkg-config + ]; + buildInputs = [ + openssl + ]; + }; + + # Downloaded and compiled dependencies. + cargoArtifacts = craneLib.buildDepsOnly ( + commonArgs + // { + doCheck = false; + } + ); + + cargoPackageKvm = craneLib.buildPackage ( + commonArgs + // { + inherit cargoArtifacts; + # Don't execute tests here. Too expensive for local development with + # frequent rebuilds + little benefit. 
+ doCheck = false; + cargoExtraArgs = "--features kvm"; + } + ); +in +cargoPackageKvm diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000000..ba1d721934 --- /dev/null +++ b/flake.lock @@ -0,0 +1,81 @@ +{ + "nodes": { + "crane": { + "locked": { + "lastModified": 1779041105, + "narHash": "sha256-nnGD2f8OlAZT2i5OfwikJsw+ifWfiA4d6A8BWlgOXV0=", + "owner": "ipetkov", + "repo": "crane", + "rev": "10e6e3cb966f7cfcc789fe5eee7a85f3188ce08b", + "type": "github" + }, + "original": { + "owner": "ipetkov", + "ref": "master", + "repo": "crane", + "type": "github" + } + }, + "dried-nix-flakes": { + "locked": { + "lastModified": 1756139350, + "narHash": "sha256-pObQv94NclXVXjJV8sTiKwFes4fGEWpkNzDsXw5DqnY=", + "owner": "cyberus-technology", + "repo": "dried-nix-flakes", + "rev": "1b2ba62710c6c1d9eba0e8e3adc029cc2e9291a4", + "type": "github" + }, + "original": { + "owner": "cyberus-technology", + "repo": "dried-nix-flakes", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1778737229, + "narHash": "sha256-6xWoytx8jFW4PF1GjRm/i/53trbpKGfz6zjzQGBr4cI=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "d7a713c0b7e47c908258e71cba7a2d77cc8d71d5", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixos-25.11", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "crane": "crane", + "dried-nix-flakes": "dried-nix-flakes", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay" + } + }, + "rust-overlay": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1779074409, + "narHash": "sha256-6aXy8Ga41iLVM8ibddFU1O5+wYWcBGNEfZzZuL91eIc=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "2a77b5b1dc952f214e8102acdef1622b68515560", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000000..24b4671997 
--- /dev/null +++ b/flake.nix @@ -0,0 +1,76 @@ +{ + description = "Cyberus Hypervisor for SAP / Apeiro"; + + inputs = { + dried-nix-flakes.url = "github:cyberus-technology/dried-nix-flakes"; + nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-25.11"; + # Convenient Nix tooling to build Rust projects. + crane.url = "github:ipetkov/crane/master"; + # Get proper Rust toolchain, independent of pkgs.rustc. + rust-overlay = { + url = "github:oxalica/rust-overlay"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + }; + + outputs = + inputs: + let + dnf = (inputs.dried-nix-flakes.for inputs).override { + systems = [ "x86_64-linux" ]; + }; + inherit (dnf) + exportOutputs + ; + in + exportOutputs ( + { + self, + # Keep list sorted: + crane, + nixpkgs, + rust-overlay, + ... + }: + let + pkgs = nixpkgs.legacyPackages; + lib = pkgs.lib; + rust-bin = (rust-overlay.lib.mkRustBin { }) pkgs; + in + { + + formatter = pkgs.nixfmt-tree; + devShells.default = pkgs.mkShellNoCC { + inputsFrom = builtins.attrValues self.packages; + packages = with pkgs; [ + gitlint + rustup + ]; + }; + packages = + let + jsonFilter = path: _type: builtins.match ".*json$" path != null; + sourceFilter = path: type: (jsonFilter path type) || (craneLib.filterCargoSources path type); + src = lib.cleanSourceWith { + src = self; + filter = sourceFilter; + name = "source"; + }; + + rustToolchain = rust-bin.stable.latest.default; + craneLib = crane.mkLib pkgs; + craneLib' = craneLib.overrideToolchain rustToolchain; + + cloud-hypervisor = pkgs.callPackage ./chv.nix { + inherit (pkgs.cloud-hypervisor) meta; + inherit src; + craneLib = craneLib'; + }; + in + { + default = cloud-hypervisor; + inherit cloud-hypervisor; + }; + } + ); +} From 44dd0abdad8c26fa79e08171f8d73148731b8361 Mon Sep 17 00:00:00 2001 From: Julian Schindel Date: Thu, 9 Apr 2026 16:28:41 +0200 Subject: [PATCH 021/178] build: nix: add commit to version info Sets the `CH_EXTRA_VERSION` env var during compilation to add the git revision to the version output. 
On-behalf-of: SAP julian.schindel@sap.com Signed-off-by: Julian Schindel --- chv.nix | 4 ++++ flake.nix | 3 +++ 2 files changed, 7 insertions(+) diff --git a/chv.nix b/chv.nix index 206888d1e1..83b331ed8b 100644 --- a/chv.nix +++ b/chv.nix @@ -13,6 +13,7 @@ # other meta, # meta of pkgs.cloud-hypervisor src, # clean source + chExtraVersion, # Additional information to be appended to the version string. }: let commonArgs = { @@ -33,6 +34,9 @@ let # - https://github.com/sfackler/rust-openssl/issues/1430 # - https://docs.rs/openssl/latest/openssl/ OPENSSL_NO_VENDOR = true; + + # Sets additional information to be appended to the version string. + CH_EXTRA_VERSION = chExtraVersion; }; nativeBuildInputs = [ diff --git a/flake.nix b/flake.nix index 24b4671997..512e718cef 100644 --- a/flake.nix +++ b/flake.nix @@ -65,6 +65,9 @@ inherit (pkgs.cloud-hypervisor) meta; inherit src; craneLib = craneLib'; + + # Query the repo revision to pass the cloud-hypervisor to be printed in the version string. + chExtraVersion = self.dirtyRev or self.rev or "unknown-revision"; }; in { From 5b45689d35b5b51a931db924fe8a06507570a6ab Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 28 Jan 2026 13:50:35 +0100 Subject: [PATCH 022/178] build: ci/nix: build cloud-hypervisor On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- .github/workflows/build_nix.yaml | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/build_nix.yaml diff --git a/.github/workflows/build_nix.yaml b/.github/workflows/build_nix.yaml new file mode 100644 index 0000000000..586c985897 --- /dev/null +++ b/.github/workflows/build_nix.yaml @@ -0,0 +1,38 @@ +name: Cloud Hypervisor Build (Nix) +on: [push, pull_request, merge_group] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - name: Code checkout + uses: actions/checkout@v6 + with: 
+ fetch-depth: 0 + - uses: cachix/install-nix-action@v31 + # We restore Nix evaluation and Nix tarball cache, speeding up the CI. + # This does not cover any Nix artifacts from the Nix store. + - name: Restore Nix cache + uses: actions/cache@v5 + with: + path: ~/.cache/nix + key: nix-cache-${{ github.job }} + # Nix binary cache + - uses: DeterminateSystems/magic-nix-cache-action@main + # Dedicated step to separate all the + # "copying path '/nix/store/...' from 'https://cache.nixos.org'." + # messages from the actual build output. + - name: Prepare Nix Store + run: nix develop --command bash -c "nix --version" + - name: Check Nix format + run: nix fmt -- --ci + - name: Check Nix Flake + run: nix flake check -L + - name: Build Cloud Hypervisor + run: | + nix build -L .#default + nix build -L .#cloud-hypervisor From 74999912bda237dffdcbd51f91c1d399e0712aed Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 27 Aug 2025 10:33:53 +0200 Subject: [PATCH 023/178] build: cargo: add optimized-dev profile TL;DR: Fix for long rebuilds locally when testing things. The release profile is optimized for maximum performance, sacrificing build speed. As local development and testing requires frequent rebuilds, but the dev profile is way too slow for "real testing", this profile is a sweet spot and helps to investigate things. Instead of `cargo run --release`, one can now run `cargo run --profile optimized-dev`. # Measurements Measurements were done using `$ [cargo clean;] time cargo build --profile release|optimized-dev` and rustc 1.89. I've used the `time`-builtin from zsh. Note that user time is much higher as we have more threads (codegen units) now. The total time is much shorter, though. ## Clean Build Speedup of 56%. - `$ time cargo build --release`: `109,67s user 13,64s system 211% cpu 58,343 total` - `$ time cargo build --profile optimized-dev`: `185,41s user 14,92s system 528% cpu 37,876 total` ## Incremental Build Speedup of 153%.
- `$ time cargo build --release`: `37,58s user 1,53s system 117% cpu 33,356 total` - `$ time cargo build --profile optimized-dev`: `47,62s user 1,71s system 373% cpu 13,220 total` Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- Cargo.toml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index aa3cfa6cd1..77089fef55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,20 @@ lto = true opt-level = "s" strip = true +# Tradeof between performance and fast compilation times for local testing and +# development with frequent rebuilds. +[profile.optimized-dev] +codegen-units = 16 +inherits = "release" +lto = false +opt-level = 2 +strip = false + +# Optimize more for dependencies: They don't require frequent rebuilds. +[profile.optimized-dev.package."*"] +codegen-units = 1 +opt-level = 3 + [profile.profiling] debug = true inherits = "release" From e8ba7f3dbdfbc995bb120cf38addb0955e1023e8 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 1 Dec 2025 14:10:28 +0100 Subject: [PATCH 024/178] build: cargo: don't strip binary With debug symbols, we will get better backtraces and can improve our experience debugging. The only downside is larger binary size which is negligible in our case. There are no implications for the performance. Stripped: 3.9M Unstripped: 4.7M Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 77089fef55..718e45a80d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,6 @@ codegen-units = 1 lto = true opt-level = "s" -strip = true # Tradeof between performance and fast compilation times for local testing and # development with frequent rebuilds.
From 3caabf463c22ae6b96b728cf01d642c71c8c5df3 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 30 Mar 2026 14:35:50 +0200 Subject: [PATCH 025/178] main: print build version and date on startup This improves the quality of the logs when debugging issues. I've used the `jiff` time library as it is a well-known time library in the ecosystem. Now, the first logging message (level info!) looks somewhat like this: ```text Cloud Hypervisor starting: build version: v51.1-203-g7f0f1f5cb-dirty, date: 2026-03-30T14:42:30.00730185+02:00 ``` On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- Cargo.lock | 1 + Cargo.toml | 5 ++++- cloud-hypervisor/src/main.rs | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index d669be9b44..7b2621ad22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1244,6 +1244,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", + "windows-sys 0.52.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 718e45a80d..ad0549de1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -105,7 +105,10 @@ env_logger = "0.11.10" epoll = "4.4.0" flume = "0.12.0" itertools = "0.14.0" -jiff = { version = "0.2", default-features = false, features = ["std"] } +jiff = { version = "0.2.23", default-features = false, features = [ + "std", + "tz-system", +] } libc = "0.2.186" log = "0.4.29" rustls = { version = "0.23.38", default-features = false, features = [ diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 5de2e4c2df..42b9d1f397 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -685,6 +685,11 @@ fn start_vmm( .map_err(Error::EventMonitorThread)?; } + info!( + "Cloud Hypervisor starting: build version: {}, date: {}", + env!("BUILD_VERSION"), + jiff::Zoned::now().strftime("%Y-%m-%dT%H:%M:%S%.f%:z") + ); event!("vmm", "starting"); let vmm_thread_handle = vmm::start_vmm_thread( From 714551cb086e0a55408e974d17e36a5559749659
Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 19 Mar 2026 14:18:41 +0100 Subject: [PATCH 026/178] vmm: cpu: improve timeout handling in wait_until_signal_acknowledged() We've seen big VMs under massive load that regularly fired their `"vCPU thread did not respond in {count}ms to signal - retrying` warning message. So far, all such situations recovered themselves after ~600ms. To be more fail-safe for the production environment under load, we increase this timeout to 10s. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/cpu.rs | 61 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 1f31ff61ee..a360f032e0 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -19,6 +19,7 @@ use std::mem::size_of; use std::os::unix::thread::JoinHandleExt; use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; use std::sync::{Arc, Barrier, Mutex}; +use std::time::{Duration, Instant}; use std::{cmp, io, result, thread}; use acpi_tables::sdt::Sdt; @@ -798,30 +799,52 @@ impl VcpuState { } } - /// Blocks until the vCPU thread has acknowledged the signal. It retries to send - /// the signal every 10ms. Times out after 1000ms. + /// Blocks until the vCPU thread has acknowledged the signal. + /// + /// The signal is resent every ms until the vCPU thread acknowledges it. + /// A warning is emitted every 100ms while the acknowledgment is pending. + /// + /// The wait is bounded by a total timeout of 10 seconds. If the vCPU thread + /// does not acknowledge the signal within this time window, + /// [`Error::SignalAcknowledgeTimeout`] is returned. /// /// This is the counterpart of [`Self::signal_thread`]. 
fn wait_until_signal_acknowledged(&self) -> Result<()> { - if let Some(_handle) = self.handle.as_ref() { - let mut count = 0; - loop { - if self.vcpu_run_interrupted.load(Ordering::SeqCst) { - return Ok(()); - } - // This is more effective than thread::yield_now() at - // avoiding a priority inversion with the vCPU thread - thread::sleep(std::time::Duration::from_millis(1)); - count += 1; - if count >= 1000 { - return Err(Error::SignalAcknowledgeTimeout); - } else if count % 10 == 0 { - warn!("vCPU thread did not respond in {count}ms to signal - retrying"); - self.signal_thread(); - } + if self.handle.is_none() { + return Ok(()); + } + + let start = Instant::now(); + let timeout = Duration::from_secs(10); + let retry_interval = Duration::from_millis(1); + let warn_interval = Duration::from_millis(100); + + let mut next_warn = warn_interval; + loop { + if self.vcpu_run_interrupted.load(Ordering::SeqCst) { + return Ok(()); } + + // Re-signal: it is cheap and idempotent. + self.signal_thread(); + + let elapsed = start.elapsed(); + if elapsed >= timeout { + return Err(Error::SignalAcknowledgeTimeout); + } + + // Emit warning every 100ms + if elapsed >= next_warn { + warn!( + "vCPU thread did not respond in {}ms to signal - retrying (timeout: {}s)", + elapsed.as_millis(), + timeout.as_secs(), + ); + next_warn += warn_interval; + } + + thread::sleep(retry_interval); } - Ok(()) } fn join_thread(&mut self) -> Result<()> { From 9f6ca3d1a1e16e6ec68cfce39047a97388526aef Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 12 May 2026 10:28:32 +0200 Subject: [PATCH 027/178] vmm: increase logging level of precopy from debug -> info This is very helpful information in the field where libvirt just sets the info level. This retains the behavior that we already have at our customer. Further, our tests rely on that line to check the migration progresses. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 250ace1f59..a0dbabc117 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1232,7 +1232,7 @@ impl Vmm { ctx.update_metrics_before_transfer(iteration_begin, &iteration_table); if is_converged(ctx)? { - debug!("Precopy converged: {ctx}"); + info!("Precopy converged: {ctx}"); break Ok(iteration_table); } @@ -1243,7 +1243,7 @@ impl Vmm { ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); // Log progress of the current iteration - debug!("Precopy: {ctx}"); + info!("Precopy: {ctx}"); // Enables management software (e.g., libvirt) to easily track forward progress. event!( From 26c0baf6db1ce9dccbe5578f621b70ac2b76717b Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 5 May 2026 08:55:54 +0200 Subject: [PATCH 028/178] vmm: pci: rename pci_device_id -> bdf_device This is a temporary measure, as upstream decided on a different name than we did in our fork. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/vm_config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 6fefc8f06c..f612911244 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -281,7 +281,7 @@ pub struct PciDeviceCommonConfig { pub iommu: bool, #[serde(default)] pub pci_segment: u16, - #[serde(default)] + #[serde(default, alias = "pci_device_id", rename = "bdf_device")] pub pci_device_id: Option, } From f795cafbe1074f0f5eefba404f0f694c3ce0cf5c Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 12 May 2026 11:07:31 +0200 Subject: [PATCH 029/178] main: restore absolute timestamp behavior in logger This includes the timezone again.
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/src/logger.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cloud-hypervisor/src/logger.rs b/cloud-hypervisor/src/logger.rs index 2245256c7b..6c0b40b375 100644 --- a/cloud-hypervisor/src/logger.rs +++ b/cloud-hypervisor/src/logger.rs @@ -98,7 +98,7 @@ fn parse_format(fmt: &str) -> Result, Error> { } pub const DEFAULT_FORMAT: &str = - "cloud-hypervisor: {boottime}s: <{thread}> {level}:{location} -- {msg}"; + "cloud-hypervisor: {wallclock}: <{thread}> {level}:{location} -- {msg}"; pub struct Logger { output: Mutex>, @@ -135,9 +135,7 @@ impl log::Log for Logger { Token::Literal(s) => out.write_all(s.as_bytes()), // 10: 6 decimal places + sep => whole seconds in range `0..=999` properly aligned Token::BootTime => write!(&mut *out, "{duration_s:>10.6?}"), - Token::WallClock => { - write!(out, "{:.6}", jiff::Timestamp::now()) - } + Token::WallClock => write!(out, "{:.6}", jiff::Zoned::now()), Token::Pid => write!(&mut *out, "{}", self.pid), // SAFETY: gettid(2) always succeeds Token::Tid => write!(&mut *out, "{}", unsafe { libc::gettid() }), @@ -237,7 +235,7 @@ mod tests { fn parse_default_format_succeeds() { let tokens = parse_format(DEFAULT_FORMAT).unwrap(); // Default format has 5 tokens interleaved with literals. 
- assert!(tokens.iter().any(|t| matches!(t, Token::BootTime))); + assert!(tokens.iter().any(|t| matches!(t, Token::WallClock))); assert!(tokens.iter().any(|t| matches!(t, Token::Thread))); assert!(tokens.iter().any(|t| matches!(t, Token::Level))); assert!(tokens.iter().any(|t| matches!(t, Token::Location))); @@ -367,14 +365,13 @@ mod tests { let out = buf.contents(); let out = out.trim(); - assert_eq!(out.len(), 27, "got: {out}"); + assert_eq!(out.len(), 40, "got: {out}"); assert_eq!(&out[4..5], "-", "got: {out}"); assert_eq!(&out[7..8], "-", "got: {out}"); assert_eq!(&out[10..11], "T", "got: {out}"); assert_eq!(&out[13..14], ":", "got: {out}"); assert_eq!(&out[16..17], ":", "got: {out}"); assert_eq!(&out[19..20], ".", "got: {out}"); - assert!(out.ends_with('Z'), "got: {out}"); } #[test] From ead7662e2cf2105256041089b54cfc63b83b2e06 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 27 Jun 2025 13:20:10 +0200 Subject: [PATCH 030/178] misc: live-migration with virtio-net devices with network fds This allows attaching FDs provided by the management layer to virtio-net devices on the live-migration receiver side.
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- devices/src/ioapic.rs | 4 +- net_util/src/open_tap.rs | 9 +++- net_util/src/tap.rs | 17 +++++++- virtio-devices/src/net.rs | 26 ++++++++--- .../src/transport/pci_common_config.rs | 4 +- vmm/Cargo.toml | 2 + vmm/src/api/http/http_endpoint.rs | 43 +++++++++++++++---- vmm/src/api/mod.rs | 9 +++- vmm/src/config.rs | 35 ++++++++++++++- vmm/src/device_manager.rs | 10 +++++ vmm/src/lib.rs | 22 ++++++++-- 11 files changed, 156 insertions(+), 25 deletions(-) diff --git a/devices/src/ioapic.rs b/devices/src/ioapic.rs index ba05c1ed5b..9312ab1156 100644 --- a/devices/src/ioapic.rs +++ b/devices/src/ioapic.rs @@ -172,7 +172,7 @@ impl BusDevice for Ioapic { return None; } - debug!("IOAPIC_W @ offset 0x{offset:x}"); + trace!("IOAPIC_W @ offset 0x{offset:x}"); let value = LittleEndian::read_u32(data); @@ -250,7 +250,7 @@ impl Ioapic { } fn ioapic_write(&mut self, val: u32) { - debug!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); + trace!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); match self.reg_sel as u8 { IOAPIC_REG_VERSION => { diff --git a/net_util/src/open_tap.rs b/net_util/src/open_tap.rs index a5168d22a0..f04e248540 100644 --- a/net_util/src/open_tap.rs +++ b/net_util/src/open_tap.rs @@ -77,7 +77,14 @@ fn open_tap_rx_q_0( let tap = match if_name { Some(name) => Tap::open_named(name, num_rx_q, flags).map_err(Error::TapOpen)?, // Create a new Tap device in Linux, if none was specified. 
- None => Tap::new(num_rx_q).map_err(Error::TapOpen)?, + None => { + let tap = Tap::new(num_rx_q).map_err(Error::TapOpen)?; + log::info!( + "Created tap device: name={}, num_rx_q={num_rx_q}", + tap.if_name_as_str() + ); + tap + } }; // Don't overwrite ip configuration of existing interfaces: if tap_exists { diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 012c5b9442..8efb13ef1f 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -13,6 +13,7 @@ use std::os::raw::*; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use libc::{__c_anonymous_ifr_ifru, ifreq}; +use log::debug; use thiserror::Error; use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; @@ -70,6 +71,16 @@ pub struct Tap { if_name: CString, } +impl Drop for Tap { + fn drop(&mut self) { + debug!( + "Dropping Tap: if_name={}, FD={}", + self.if_name_as_str(), + self.tap_file.as_raw_fd() + ); + } +} + impl PartialEq for Tap { fn eq(&self, other: &Tap) -> bool { self.if_name == other.if_name @@ -117,6 +128,9 @@ fn ipv6_mask_to_prefix(mask: Ipv6Addr) -> Result { } impl Tap { + /// The default naming scheme for Tap devices that are created by Cloud Hypervisor. + pub const DEFAULT_NAME_SCHEME: &'static str = "vmtap%d"; + /// # Safety /// The caller should ensure to pass a valid file descriptor and valid /// arguments for the `ioctl()` syscall. @@ -176,6 +190,7 @@ impl Tap { if fd < 0 { return Err(Error::OpenTun(IoError::last_os_error())); } + debug!("Opening Tap device with given name: ifname={if_name}, fd={fd}"); // SAFETY: We just checked that the fd is valid. let tuntap = unsafe { File::from_raw_fd(fd) }; @@ -235,7 +250,7 @@ impl Tap { /// Create a new tap interface. 
pub fn new(num_queue_pairs: usize) -> Result { - Self::open_named("vmtap%d", num_queue_pairs, None) + Self::open_named(Self::DEFAULT_NAME_SCHEME, num_queue_pairs, None) } pub fn from_tap_fd(fd: RawFd, num_queue_pairs: usize) -> Result { diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index e8d1af1e50..451cfa99b2 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -16,7 +16,7 @@ use std::{result, thread}; use anyhow::anyhow; use event_monitor::event; -use log::{debug, error, info, warn}; +use log::{debug, error, info, trace, warn}; #[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ @@ -252,9 +252,9 @@ impl NetEpollHandler { if res { self.signal_used_queue(self.queue_index_base)?; - debug!("Signalling RX queue"); + trace!("Signalling RX queue"); } else { - debug!("Not signalling RX queue"); + trace!("Not signalling RX queue"); } Ok(()) } @@ -601,11 +601,12 @@ impl Net { for fd in fds.iter() { // Duplicate so that it can survive reboots // SAFETY: FFI call to dup. Trivially safe. - let fd = unsafe { libc::dup(*fd) }; - if fd < 0 { + let fd_duped = unsafe { libc::dup(*fd) }; + if fd_duped < 0 { return Err(Error::DuplicateTapFd(std::io::Error::last_os_error())); } - let tap = Tap::from_tap_fd(fd, num_queue_pairs).map_err(Error::TapError)?; + debug!("dup'ed fd {fd} => {fd_duped} for virtio-net device {id}"); + let tap = Tap::from_tap_fd(fd_duped, num_queue_pairs).map_err(Error::TapError)?; taps.push(tap); } @@ -649,6 +650,19 @@ impl Net { impl Drop for Net { fn drop(&mut self) { + // Get a comma-separated list of the interface names of the tap devices + // associated with this network device. 
+ let ifnames_str = self + .taps + .iter() + .map(|tap| tap.if_name_as_str()) + .collect::>(); + let ifnames_str = ifnames_str.join(","); + debug!( + "virtio-net device closed: id={}, ifnames=[{ifnames_str}]", + self.id + ); + if let Some(kill_evt) = self.common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 98ea81392c..0ba1175007 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -10,7 +10,7 @@ use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, Ordering}; use std::sync::{Arc, Mutex}; use byteorder::{ByteOrder, LittleEndian}; -use log::{debug, error, warn}; +use log::{debug, error, trace, warn}; use serde::{Deserialize, Serialize}; use virtio_queue::{Queue, QueueT}; use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; @@ -251,7 +251,7 @@ impl VirtioPciCommonConfig { } fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { - debug!("read_common_config_word: offset 0x{offset:x}"); + trace!("read_common_config_word: offset 0x{offset:x}"); match offset { 0x10 => self.msix_config.load(Ordering::Acquire), 0x12 => queues.len() as u16, // num_queues diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index 7a38fa5006..c9025fda6d 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -67,6 +67,8 @@ landlock = "0.4.4" libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } log = { workspace = true } +# Special fork of micro_http that combines HTTP traffic over a UNIX domain +# socket with UNIX' SCM_RIGHTS mechanism for transferring file descriptors. 
micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } mshv-bindings = { workspace = true, features = [ "fam-wrappers", diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 92b53ac68e..980b3e067a 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -6,11 +6,11 @@ //! # HTTP Endpoints of the Cloud Hypervisor API //! -//! ## Special Handling for Devices Backed by Network File Descriptors (FDs) (e.g., virtio-net) +//! ## Special Handling for Externally Provided File Descriptors (FDs) (e.g., virtio-net) //! //! Some of the HTTP handlers here implement special logic for devices -//! **backed by network FDs** to enable live-migration, state save/resume -//! (restore), and similar VM lifecycle events. +//! **backed by externally opened FDs** to enable live-migration, +//! state save/resume (restore), and similar VM lifecycle events. //! //! The utilized mechanism requires that the control software (e.g., libvirt) //! 
connects to Cloud Hypervisor by using a UNIX domain socket and that it @@ -48,8 +48,8 @@ use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmConfig, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, - VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, - VmShutdown, VmSnapshot, + VmReceiveMigrationData, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, + VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -429,13 +429,12 @@ vm_action_put_handler_body!(VmRemoveDevice); vm_action_put_handler_body!(VmResizeDisk); vm_action_put_handler_body!(VmResizeZone); vm_action_put_handler_body!(VmSnapshot); -vm_action_put_handler_body!(VmReceiveMigration); vm_action_put_handler_body!(VmSendMigration); #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] vm_action_put_handler_body!(VmCoredump); -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. impl PutHandler for VmAddNet { fn handle_request( @@ -459,6 +458,34 @@ impl PutHandler for VmAddNet { impl GetHandler for VmAddNet {} +// Special handling for externally provided FDs. +// See module description for more info. 
+impl PutHandler for VmReceiveMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let mut net_cfg: VmReceiveMigrationData = serde_json::from_slice(body.raw())?; + if !net_cfg.net_fds.is_empty() { + let mut cfgs = net_cfg.net_fds.iter_mut().collect::>(); + let cfgs = cfgs.as_mut_slice(); + attach_fds_to_cfgs(files, cfgs)?; + } + + self.send(api_notifier, api_sender, net_cfg) + .map_err(HttpError::ApiError) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmReceiveMigration {} + impl PutHandler for VmResize { fn handle_request( &'static self, @@ -487,7 +514,7 @@ impl PutHandler for VmResize { impl GetHandler for VmResize {} -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. impl PutHandler for VmRestore { fn handle_request( diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 2ed360eeae..c262bf8e2e 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -52,7 +52,7 @@ use vmm_sys_util::eventfd::EventFd; pub use self::dbus::start_dbus_thread; pub use self::http::{start_http_fd_thread, start_http_path_thread}; use crate::Error as VmmError; -use crate::config::RestoreConfig; +use crate::config::{RestoreConfig, RestoredNetConfig}; use crate::device_tree::DeviceTree; use crate::migration_transport::MAX_MIGRATION_CONNECTIONS; use crate::vm::{Error as VmError, VmState}; @@ -274,6 +274,9 @@ pub struct VmReceiveMigrationData { /// Directory containing the TLS server certificate (server-cert.pem), the TLS server key (server-key.pem), and the client TLS root CA certificate (ca-cert.pem). #[serde(default)] pub tls_dir: Option, + /// Map with new network FDs on the new host. 
+ #[serde(default)] + pub net_fds: Vec, } #[derive(Debug, Error)] @@ -334,6 +337,7 @@ impl VmReceiveMigrationData { let data = Self { receiver_url: migration.to_owned(), tls_dir: None, + net_fds: vec![], }; data.validate()?; @@ -360,6 +364,7 @@ impl VmReceiveMigrationData { let data = Self { receiver_url, tls_dir, + net_fds: vec![], }; data.validate()?; @@ -1978,6 +1983,7 @@ mod unit_tests { VmReceiveMigrationData { receiver_url: "tcp:192.168.1.1:8080".to_string(), tls_dir: None, + net_fds: vec![], } ); @@ -2001,6 +2007,7 @@ mod unit_tests { VmReceiveMigrationData { receiver_url: "tcp:192.168.1.1:8080".to_string(), tls_dir: Some(tls_dir), + net_fds: vec![], } ); diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 00bf251f3a..e8b00fc81d 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2558,6 +2558,27 @@ pub struct RestoredNetConfig { pub fds: Option>, } +impl RestoredNetConfig { + // Ensure all net devices from 'VmConfig' backed by FDs have a + // corresponding 'RestoreNetConfig' with a matched 'id' and expected + // number of FDs. + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + let found = vm_config + .net + .iter() + .flatten() + .any(|net| net.pci_common.id.as_ref() == Some(&self.id)); + + if found { + Ok(()) + } else { + Err(ValidationError::RestoreMissingRequiredNetId( + self.id.clone(), + )) + } + } +} + fn deserialize_restorednetconfig_fds<'de, D>( d: D, ) -> std::result::Result>, D::Error> @@ -3560,6 +3581,8 @@ impl VmConfig { /// To use this safely, the caller must guarantee that the input /// fds are all valid. pub unsafe fn add_preserved_fds(&mut self, mut fds: Vec) { + debug!("adding preserved FDs to VM list: {fds:?}"); + if fds.is_empty() { return; } @@ -3614,7 +3637,16 @@ impl Clone for VmConfig { .preserved_fds .as_ref() // SAFETY: FFI call with valid FDs - .map(|fds| fds.iter().map(|fd| unsafe { libc::dup(*fd) }).collect()), + .map(|fds| { + fds.iter() + .map(|fd| { + // SAFETY: Trivially safe. 
+ let fd_duped = unsafe { libc::dup(*fd) }; + warn!("Cloning VM config: duping preserved FD {fd} => {fd_duped}"); + fd_duped + }) + .collect() + }), landlock_rules: self.landlock_rules.clone(), #[cfg(feature = "ivshmem")] ivshmem: self.ivshmem.clone(), @@ -3626,6 +3658,7 @@ impl Clone for VmConfig { impl Drop for VmConfig { fn drop(&mut self) { if let Some(mut fds) = self.preserved_fds.take() { + debug!("Closing preserved FDs from VM: fds={fds:?}"); for fd in fds.drain(..) { // SAFETY: FFI call with valid FDs unsafe { libc::close(fd) }; diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index c5506ba0b8..0301c849ca 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2913,6 +2913,7 @@ impl DeviceManager { let (virtio_device, migratable_device) = if net_cfg.vhost_user { let socket = net_cfg.vhost_socket.as_ref().unwrap().clone(); + debug!("Creating virtio-net device with vhost-user backend: {socket}"); let vu_cfg = VhostUserConfig { socket, num_queues: net_cfg.num_queues, @@ -2955,6 +2956,7 @@ impl DeviceManager { let state = state_from_id(snapshot, id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?; let virtio_net = if let Some(ref tap_if_name) = net_cfg.tap { + debug!("Creating virtio-net device from Tap device: {tap_if_name}"); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2980,6 +2982,7 @@ impl DeviceManager { .map_err(DeviceManagerError::CreateVirtioNet)?, )) } else if let Some(fds) = &net_cfg.fds { + debug!("Creating virtio-net device from network FDs: {fds:?}"); let net = virtio_devices::Net::from_tap_fds( id.clone(), fds, @@ -3006,6 +3009,9 @@ impl DeviceManager { Arc::new(Mutex::new(net)) } else { + debug!( + "Creating virtio-net device: no ifname or FDs given, creating new Tap device" + ); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -4650,6 +4656,10 @@ impl DeviceManager { Ok(()) } + /// Notifies the VM for a hotplug. 
+ /// + /// This call doesn't wait for the vCPU receiving the + /// interrupt to acknowledge. pub fn notify_hotplug( &self, _notification_type: AcpiNotificationFlags, diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index a0dbabc117..2a1a015625 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -895,7 +895,7 @@ impl Vmm { listener: &ReceiveListener, state: ReceiveMigrationState, req: &Request, - _receive_data_migration: &VmReceiveMigrationData, + receive_data_migration: &VmReceiveMigrationData, ) -> std::result::Result { use ReceiveMigrationState::*; @@ -910,6 +910,21 @@ impl Vmm { memory_files: HashMap| -> std::result::Result { let memory_manager = self.vm_receive_config(req, socket, memory_files)?; + + if !receive_data_migration.net_fds.is_empty() { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + for restored_net in &receive_data_migration.net_fds { + for net_config in vm_config.net.iter_mut().flatten() { + // Only update net devices that are backed directly by file descriptors. + if net_config.pci_common.id.as_ref() == Some(&restored_net.id) + && net_config.fds.is_some() + { + net_config.fds.clone_from(&restored_net.fds); + } + } + } + } + let guest_memory = memory_manager.lock().unwrap().guest_memory(); // Create the additional-connection receiver even in the single-connection case. 
// At this point the receiver does not know whether the sender will use extra TCP @@ -2525,9 +2540,10 @@ impl RequestHandler for Vmm { .map_err(MigratableError::MigrateReceive)?; info!( - "Receiving migration: receiver_url={},tls={}", + "Receiving migration: receiver_url={},tls={},net_fds={:?}", receive_data_migration.receiver_url, - receive_data_migration.tls_dir.is_some() + receive_data_migration.tls_dir.is_some(), + &receive_data_migration.net_fds ); let mut listener = migration_transport::receive_migration_listener( From 07d1f92c3d2886ec924c7e6429fee741fd98fec8 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 10 Sep 2025 16:46:51 +0200 Subject: [PATCH 031/178] vmm: properly unset immediate_exit on -EINTR Also see [0] for more info. [0] https://docs.kernel.org/virt/kvm/api.html#the-kvm-run-structure Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- hypervisor/src/kvm/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 39f76dca31..eb45c01bb6 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -2533,7 +2533,11 @@ impl cpu::Vcpu for KvmVcpu { }, Err(ref e) => match e.errno() { - libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), + libc::EINTR => { + self.fd.set_kvm_immediate_exit(0); + Ok(cpu::VmExit::Ignore) + } + libc::EAGAIN => Ok(cpu::VmExit::Ignore), _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "VCPU error {e:?}" ))), From b3b0988512293a864512e0770c307cd509acb4cd Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 11 Sep 2025 09:30:55 +0200 Subject: [PATCH 032/178] vmm: vcpu: optimize lock usage No need to grab the lock multiple times in this short period of time. The lock is anyway held for the duration of the long operation (KVM_RUN). 
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/cpu.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index a360f032e0..4ce05ee9b5 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -1338,12 +1338,14 @@ impl CpuManager { #[cfg(feature = "kvm")] if matches!(hypervisor_type, HypervisorType::Kvm) { - vcpu.lock().unwrap().vcpu.set_immediate_exit(true); - if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { + let lock = vcpu.lock(); + let mut lock = lock.unwrap(); + lock.vcpu.set_immediate_exit(true); + if !matches!(lock.run(), Ok(VmExit::Ignore)) { error!("Unexpected VM exit on \"immediate_exit\" run"); break; } - vcpu.lock().unwrap().vcpu.set_immediate_exit(false); + lock.vcpu.set_immediate_exit(false); } vcpu_run_interrupted.store(true, Ordering::SeqCst); From 46f6d304c6e42357816faaae728ba24f781c68ea Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 30 Apr 2026 09:40:17 +0200 Subject: [PATCH 033/178] vmm: prerequisites for accessing KVM_RUN in the vCPU loop These are the prerequisites for the upcoming (quick and dirty) solution to the problem that we might miss some events. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- hypervisor/src/cpu.rs | 10 ++++++++++ hypervisor/src/kvm/mod.rs | 9 +++++++-- hypervisor/src/mshv/mod.rs | 7 +++++++ vmm/src/cpu.rs | 9 +++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index 044c81a2e8..fef327ffb3 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -10,6 +10,9 @@ // // +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; + use thiserror::Error; #[cfg(not(target_arch = "riscv64"))] use {anyhow::anyhow, vm_memory::GuestAddress}; @@ -608,4 +611,11 @@ pub trait Vcpu: Send + Sync { /// Trigger NMI interrupt /// fn nmi(&self) -> Result<()>; + /// Returns the underlying vCPU FD of KVM. 
+ /// + /// # SAFETY + /// This is safe as we only use this to map the KVM_RUN structure for the + /// signal handler and only use it from there. + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd; } diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index eb45c01bb6..76147e37bd 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -17,9 +17,9 @@ use std::mem::offset_of; #[cfg(feature = "sev_snp")] use std::os::fd::FromRawFd; use std::os::fd::OwnedFd; -#[cfg(any(feature = "sev_snp", feature = "tdx"))] +#[cfg(any(feature = "kvm", feature = "sev_snp"))] use std::os::unix::io::AsRawFd; -#[cfg(feature = "tdx")] +#[cfg(any(feature = "kvm", feature = "tdx"))] use std::os::unix::io::RawFd; use std::result; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] @@ -3226,6 +3226,11 @@ impl cpu::Vcpu for KvmVcpu { self.fd.set_kvm_immediate_exit(exit.into()); } + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } + /// /// Returns the details about TDX exit reason /// diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index d38aff9860..2fb9f8653d 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -49,6 +49,8 @@ pub mod x86_64; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; #[cfg(target_arch = "aarch64")] use std::sync::Mutex; @@ -1661,6 +1663,11 @@ impl cpu::Vcpu for MshvVcpu { Ok(()) } + + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + todo!() + } } impl MshvVcpu { diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 4ce05ee9b5..bbbf104245 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -16,6 +16,8 @@ use std::collections::BTreeMap; use std::io::Write; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use std::mem::size_of; +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; 
use std::os::unix::thread::JoinHandleExt; use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; use std::sync::{Arc, Barrier, Mutex}; @@ -671,6 +673,13 @@ impl Vcpu { .map_err(Error::VcpuSetGicrBaseAddr)?; Ok(()) } + + #[cfg(feature = "kvm")] + pub fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + // SAFETY: We happen to know that all current uses respect the safety contract. + // TODO find a better way to keep this safe and/or express its fragile state. + unsafe { self.vcpu.get_kvm_vcpu_raw_fd() } + } } impl Pausable for Vcpu {} From caff5a725e89eceaf47214f3222351f26a2a771a Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 10 Sep 2025 16:47:06 +0200 Subject: [PATCH 034/178] vmm: fix kicking vCPU out of KVM_RUN from signal handler A common scenario for a VMM to regain control over the vCPU thread from the hypervisor is to interrupt the vCPU. A use-case might be the `pause` API call of CHV. VMMs using KVM as hypervisor must use signals for this interception, i.e., a thread sends a signal to the vCPU thread. Sending and handling these signals is inherently racy because the signal sender does not know if the receiving thread is currently in the RUN_VCPU [0] call, or executing userspace VMM code. If we are in kernel space in KVM_RUN, things are easy as KVM just exits with -EINTR. For user-space this is more complicated. For example, it might happen that we receive a signal but the vCPU thread was about to go into the KVM_RUN system call as next instruction. There is no more opportunity to check for any pending signal flag or similar. KVM offers the `immediate_exit` flag [1] as part of the KVM_RUN structure for that. The signal handler of a vCPU is supposed to set this flag, to ensure that we do not miss any events. If the flag is set, KVM_RUN will exit immediately [2]. We will miss signals to the vCPU if the vCPU thread is in userspace VMM code and we do not use the `immediate_exit` flag. 
We must have access to the KVM_RUN data structure when the signal handler executes in a vCPU thread's context and set the `immediate_exit` [1] flag. This way, the next invocation of KVM_RUN exits immediately and the userspace VMM code can do the normal event handling. We must not use any shared locks between the normal vCPU thread VMM code and the signal handler, as otherwise we might end up in deadlocks. The signal handler therefore needs its dedicated mutable version of KVM_RUN. This commit introduces a (very hacky but good enough for a PoC) solution to this problem. [0] https://docs.kernel.org/virt/kvm/api.html#kvm-run [1] https://docs.kernel.org/virt/kvm/api.html#the-kvm-run-structure [2] https://elixir.bootlin.com/linux/v6.12/source/arch/x86/kvm/x86.c#L11566 Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- Cargo.lock | 1 + hypervisor/src/mshv/mod.rs | 2 +- vmm/Cargo.toml | 1 + vmm/src/cpu.rs | 64 ++++++++++++++++++++++++++++++++++++-- vmm/src/lib.rs | 10 ++++++ 5 files changed, 75 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7b2621ad22..86107bc64c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2713,6 +2713,7 @@ dependencies = [ "igvm", "igvm_defs", "iommufd-ioctls", + "kvm-bindings", "landlock", "libc", "linux-loader", diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 2fb9f8653d..644bc32368 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -1666,7 +1666,7 @@ impl cpu::Vcpu for MshvVcpu { #[cfg(feature = "kvm")] unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { - todo!() + unimplemented!() } } diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index c9025fda6d..865ce00449 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -63,6 +63,7 @@ hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } iommufd-ioctls = { workspace = true, optional = true } +kvm-bindings = { workspace = true } 
landlock = "0.4.4" libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index bbbf104245..f42d3adfb3 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -16,8 +16,6 @@ use std::collections::BTreeMap; use std::io::Write; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use std::mem::size_of; -#[cfg(feature = "kvm")] -use std::os::fd::RawFd; use std::os::unix::thread::JoinHandleExt; use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; use std::sync::{Arc, Barrier, Mutex}; @@ -83,6 +81,8 @@ use vm_migration::{ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::{SIGRTMIN, register_signal_handler}; use zerocopy::{FromBytes, Immutable, IntoBytes}; +#[cfg(feature = "kvm")] +use {kvm_bindings::kvm_run, std::cell::Cell, std::os::fd::RawFd, std::sync::RwLock}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ @@ -98,6 +98,16 @@ use crate::vm::physical_bits; use crate::vm_config::{CoreScheduling, CpusConfig}; use crate::{CPU_MANAGER_SNAPSHOT_ID, GuestMemoryMmap}; +#[cfg(feature = "kvm")] +thread_local! { + static KVM_RUN: Cell<*mut kvm_run> = const {Cell::new(core::ptr::null_mut())}; +} +#[cfg(feature = "kvm")] +/// Tell signal handler to not access certain stuff anymore during shutdown. +/// Otherwise => panics. +/// Better alternative would be to prevent signals there at all. +pub static IS_IN_SHUTDOWN: RwLock = RwLock::new(false); + #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] /// Extract the specified bits of a 64-bit integer. 
/// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, @@ -1217,6 +1227,28 @@ impl CpuManager { thread::Builder::new() .name(format!("vcpu{vcpu_id}")) .spawn(move || { + // init thread-local kvm_run structure + #[cfg(feature = "kvm")] + { + let raw_kvm_fd = vcpu.lock().unwrap().get_kvm_vcpu_raw_fd(); + + // SAFETY: We know the FD is valid and have the proper args. + let buffer = unsafe { + libc::mmap( + core::ptr::null_mut(), + 4096, + libc::PROT_WRITE | libc::PROT_READ, + libc::MAP_SHARED, + raw_kvm_fd, + 0, + ) + }; + assert!(!buffer.is_null()); + assert_ne!(buffer, libc::MAP_FAILED); + let kvm_run = buffer.cast::(); + KVM_RUN.set(kvm_run); + } + // Schedule the thread to run on the expected CPU set if let Some(cpuset) = cpuset.as_ref() { let cpuset: *const libc::cpu_set_t = cpuset; @@ -1308,7 +1340,35 @@ impl CpuManager { return; } + #[cfg(not(feature = "kvm"))] extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} + #[cfg(feature = "kvm")] + extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) { + // We do not need a self-pipe for safe UNIX signal handling here as in this + // signal handler, we only expect the same signal over and over again. While + // different signals can interrupt a signal being handled, the same signal + // again can't by default. Therefore, this is safe. + + // This lock prevents accessing thread locals when a signal is received + // in the teardown phase of the Rust standard library. Otherwise, we would + // panic. + // + // Masking signals would be a nicer approach but this is the pragmatic + // solution. + // + // We don't have lock contention in normal operation. When the writer + // sets the bool to true, the lock is only held for a couple of µs. 
+ let lock = IS_IN_SHUTDOWN.read().unwrap(); + if *lock { + return; + } + + let kvm_run = KVM_RUN.get(); + // SAFETY: the mapping is valid + let kvm_run = unsafe { + kvm_run.as_mut().expect("kvm_run should have been mapped as part of vCPU setup") }; + kvm_run.immediate_exit = 1; + } // This uses an async signal safe handler to kill the vcpu handles. register_signal_handler(SIGRTMIN(), handle_signal) .expect("Failed to register vcpu signal handler"); diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 2a1a015625..b6d6c8fc41 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -54,6 +54,8 @@ use crate::api::{ use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::GuestDebuggable; +#[cfg(feature = "kvm")] +use crate::cpu::IS_IN_SHUTDOWN; use crate::landlock::Landlock; use crate::memory_manager::MemoryManager; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -1540,6 +1542,14 @@ impl Vmm { vm.release_disk_locks() .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; + #[cfg(feature = "kvm")] + // Prevent signal handler to access thread local storage when signals are received + // close to the end when thread-local storage is already destroyed. 
+ { + let mut lock = IS_IN_SHUTDOWN.write().unwrap(); + *lock = true; + } + // Capture snapshot and send it let (vm_snapshot, snapshot_duration) = measure_ok(|| vm.snapshot())?; let (_, send_snapshot_duration) = From babc6632cbec80ebdf65ce8f33394d63fedc05d0 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 11 Sep 2025 08:48:41 +0200 Subject: [PATCH 035/178] vmm: temporarily make "resize" API call fail fast Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index b6d6c8fc41..9c7cb9daff 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2194,6 +2194,10 @@ impl RequestHandler for Vmm { ) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + if desired_vcpus.is_some() { + todo!("doesn't work currently with our thread-local KVM_RUN approach"); + } + if let Some(ref mut vm) = self.vm { vm.resize(desired_vcpus, desired_ram, desired_balloon) .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; From a34696160a5c73da1a557086ec11dfd68905d318 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 4 May 2026 20:51:26 +0200 Subject: [PATCH 036/178] vm-migration: mTLS -> TLS (make upstream compatible with our fork) This is needed as at our customer we deployed everything without mTLS. We need to find a migration path soon, tho. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- docs/live_migration.md | 9 --------- vm-migration/src/tls.rs | 24 ++++-------------------- 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/docs/live_migration.md b/docs/live_migration.md index 023a854f71..271df04aaf 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -200,20 +200,11 @@ The destination host needs a directory containing: - `server-cert.pem`: the certificate presented by the destination - `server-key.pem`: the private key for `server-cert.pem` -- `ca-cert.pem`: the CA certificate used to verify client certificates The source host needs a directory containing: - `ca-cert.pem`: the CA certificate used to verify the destination certificate -- `client-cert.pem`: the certificate presented by the source -- `client-key.pem`: the private key for `client-cert.pem` - -Current TCP migration uses mutual TLS (mTLS) authentication. The source -verifies the destination certificate against `ca-cert.pem` and presents -`client-cert.pem` and `client-key.pem`. The destination presents -`server-cert.pem` and `server-key.pem`, and only accepts client -certificates that chain to `ca-cert.pem`. Example receiver command: diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs index 41ad5fab7e..f9377728b0 100644 --- a/vm-migration/src/tls.rs +++ b/vm-migration/src/tls.rs @@ -6,12 +6,7 @@ //! TLS support for migration streams over TCP. //! //! This module wraps `rustls` to provide a blocking [`TlsStream`] for migration -//! traffic. [`TlsStream::new_client`] authenticates the server against -//! `ca-cert.pem` and the expected hostname, and presents `client-cert.pem` and -//! `client-key.pem` for mutual TLS (mTLS) authentication. [`TlsServerConfig`] loads -//! `server-cert.pem` and `server-key.pem`, trusts client certificates issued by -//! the CA in `ca-cert.pem`, and [`TlsStream::new_server`] uses that -//! 
configuration to establish the server side of the connection. +//! traffic. //! //! [`TlsStream`] implements [`Read`], [`Write`], [`ReadVolatile`], //! [`WriteVolatile`], and [`AsFd`] so it can be used by the transport layer like @@ -27,7 +22,7 @@ use std::sync::Arc; use rustls::pki_types::pem::PemObject; use rustls::pki_types::{CertificateDer, InvalidDnsNameError, PrivateKeyDer, ServerName}; -use rustls::server::{VerifierBuilderError, WebPkiClientVerifier}; +use rustls::server::VerifierBuilderError; use rustls::{ ClientConfig, ClientConnection, RootCertStore, ServerConfig, ServerConnection, StreamOwned, }; @@ -97,14 +92,10 @@ impl TlsStream { hostname: &str, ) -> result::Result { let root_store = load_root_store(&cert_dir.join("ca-cert.pem"))?; - let client_certs = load_cert_chain(&cert_dir.join("client-cert.pem"))?; - let client_key = load_private_key(&cert_dir.join("client-key.pem"))?; let config = ClientConfig::builder() .with_root_certificates(root_store) - .with_client_auth_cert(client_certs, client_key) - .map_err(TlsError::RustlsError) - .map_err(MigratableError::Tls)?; + .with_no_client_auth(); let config = Arc::new(config); let server_name = ServerName::try_from(hostname.to_string()) @@ -293,16 +284,9 @@ impl TlsServerConfig { pub fn new(cert_dir: &Path) -> result::Result { let server_certs = load_cert_chain(&cert_dir.join("server-cert.pem"))?; let server_key = load_private_key(&cert_dir.join("server-key.pem"))?; - // Trust anchors used to verify client certificates for mTLS. 
- let client_roots = Arc::new(load_root_store(&cert_dir.join("ca-cert.pem"))?); - - let client_verifier = WebPkiClientVerifier::builder(client_roots) - .build() - .map_err(TlsError::RustlsVerifierBuilderError) - .map_err(MigratableError::Tls)?; let config = ServerConfig::builder() - .with_client_cert_verifier(client_verifier) + .with_no_client_auth() .with_single_cert(server_certs, server_key) .map_err(TlsError::RustlsError) .map_err(MigratableError::Tls)?; From ed3e81e056fbb5f1add95dc6bb36ad58b37aaf1a Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 10 Jun 2025 11:38:41 +0200 Subject: [PATCH 037/178] vmm: add vCPU throttling (auto-converge) for pre-copy auto-converge (vCPU throttling) is a technique combined with precopy live-migration flows to migrate VMs with a high dirty rate (high working set with many writes). It is an alternative to postcopy migration, which is not yet implemented in Cloud Hypervisor. By throttling the vCPUs incrementally, the dirty rate drops and the VM migrates (converges) eventually. More specifically, the reduced dirty rate ensures that the configured downtime can be reached. The implementation is inspired by QEMU, but adapted to Cloud Hypervisor. Various discussions, intermediate steps, and experiments led to this final result. vCPU throttling was implemented with a dedicated thread and a manager for that thread. This thread utilizes the CpuManager's pause() and resume() in conjunction with (interruptible) sleeps to apply the current throttling percentage onto the vCPUs, thus the VM. The implementation is designed to not block or delay normal operation any longer than necessary. The proposed design relies on the recent improvements and fixes for CpuManager's pause() and resume(). For correctness, on each pause/resume cycle, the time for these actions is measured. This way, a dynamic timeslice can be used, guaranteeing the VM is indeed throttled at the intended percentage.
Although not supported yet by Cloud Hypervisor, this thread will support throttling cancellation when live-migrations are cancelled. This was intensively tested in an automated setup with thousands of live-migrations with VMs under load. - auto-converging starts always after two memory delta transfer iterations - every two iterations, it is increased (step size is 10%) - maximum throttling is 99% - the VM will get slower. At 99% throttling, it will be unsurprisingly barely usable. This is something users have to accept if they want to migrate their VMs running heavy workloads. Signed-off-by: Philipp Schuster Reviewed-by: Stefan Kober Reviewed-by: Oliver Anderson Reviewed-by: Thomas Prescher On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 42 ++- vmm/src/vcpu_throttling.rs | 605 +++++++++++++++++++++++++++++++++++++ vmm/src/vm.rs | 32 ++ 3 files changed, 676 insertions(+), 3 deletions(-) create mode 100644 vmm/src/vcpu_throttling.rs diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 9c7cb9daff..5aef732c08 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3,6 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // +/// Amount of iterations before auto-converging starts. +const AUTO_CONVERGE_ITERATION_DELAY: u64 = 2; +/// Step size in percent to increase the vCPU throttling. +const AUTO_CONVERGE_STEP_SIZE: u8 = 10; +/// Amount of iterations after that we increase vCPU throttling. +const AUTO_CONVERGE_ITERATION_INCREASE: u64 = 2; +/// Maximum vCPU throttling value. 
+const AUTO_CONVERGE_MAX: u8 = 99; + use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write, stdout}; @@ -104,6 +113,7 @@ mod sigwinch_listener; mod sync_utils; mod uffd; mod userfaultfd; +mod vcpu_throttling; pub mod vm; pub mod vm_config; @@ -1224,6 +1234,15 @@ impl Vmm { Ok((receive_duration, restore_duration)) } + fn can_increase_autoconverge_step(s: &MemoryMigrationContext) -> bool { + if (s.iteration as u64) < AUTO_CONVERGE_ITERATION_DELAY { + false + } else { + let iteration = s.iteration as u64 - AUTO_CONVERGE_ITERATION_DELAY; + iteration.is_multiple_of(AUTO_CONVERGE_ITERATION_INCREASE) + } + } + /// Performs the initial memory transmission (iteration zero) plus a /// variable number of memory iterations with the goal to eventually migrate /// the VM in a reasonably small downtime. @@ -1238,6 +1257,19 @@ impl Vmm { mem_send: &mut SendAdditionalConnections, ) -> result::Result { loop { + // todo: check if auto-converge is enabled at all? + if Self::can_increase_autoconverge_step(ctx) + && vm.throttle_percent() < AUTO_CONVERGE_MAX + { + let current_throttle = vm.throttle_percent(); + let new_throttle = current_throttle + AUTO_CONVERGE_STEP_SIZE; + let new_throttle = std::cmp::min(new_throttle, AUTO_CONVERGE_MAX); + info!("Increasing auto-converge: {new_throttle}%"); + if new_throttle != current_throttle { + vm.set_throttle_percent(new_throttle); + } + } + let iteration_begin = Instant::now(); let iteration_table = if ctx.iteration == 0 { @@ -1395,9 +1427,13 @@ impl Vmm { mem_send, )?; let downtime_begin = Instant::now(); - if vm.get_state() != VmState::Paused { - vm.pause()?; - } + // End throttle thread + info!("stopping vcpu thread"); + vm.stop_vcpu_throttling(); + info!("stopped vcpu thread"); + info!("pausing VM"); + vm.pause()?; + info!("paused VM"); // Send last batch of dirty pages: final iteration { diff --git a/vmm/src/vcpu_throttling.rs b/vmm/src/vcpu_throttling.rs new file mode 100644 index 0000000000..e8fd0d3b12 --- /dev/null 
+++ b/vmm/src/vcpu_throttling.rs @@ -0,0 +1,605 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! # vCPU throttling for Auto Converging +//! +//! vCPU throttling is crucial to reach a reasonable downtime when using a +//! precopy strategy for live-migration of VMs with memory-intensive workloads. +//! Auto converge means an increasing vCPU throttling over time until the memory +//! delta is small enough for the migration thread(s) to perform the switch-over +//! to the new host. +//! +//! Therefore, the migration thread(s) use this thread to help them reach their +//! goal. Next to typical lifecycle management, this thread must fulfill various +//! requirements to ensure a minimal downtime. +//! +//! ## Thread Requirements +//! - Needs to be able to gracefully wait for work. +//! - Must be able to exit gracefully. +//! - Must be able to cancel any work and return to its init state to support +//! live-migration cancellation and restart of live-migrations. +//! - Must not block the migration thread(s) whenever possible, to facilitate +//! fast live-migrations with short downtimes. +//! - Must be interruptible during a sleep phase to not block the migration +//! thread(s). +//! - Must not confuse or hinder the migration thread(s) regarding +//! pause()/resume() operations. Context: migration thread shuts down the +//! vCPUs for the handover. The throttle thread must not restart the vCPUs +//! again. + +use std::cell::Cell; +use std::cmp::min; +use std::sync::mpsc::RecvTimeoutError; +use std::sync::{Arc, Mutex, mpsc}; +use std::thread; +use std::thread::JoinHandle; +use std::time::{Duration, Instant}; + +use log::{debug, warn}; +use vm_migration::Pausable; + +use crate::cpu::CpuManager; + +/// The possible command of the thread, i.e., the current state. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum ThrottleCommand { + /// Waiting for next event. + Waiting, + /// Ongoing vCPU throttling. 
+ /// + /// The inner value shows the current throttling percentage in range `1..=99`. + Throttling(u8 /* `1..=99` */), + /// Thread is shutting down gracefully. + Exiting, +} + +/// Helper to adapt the throttling timeslice as we go, depending on the time it +/// takes to pause() and resume() all vCPUs. +#[derive(Debug)] +struct TimesliceContext { + current_timeslice: Duration, + /// Duration it took to pause() all vCPUs on the previous iteration. + previous_pause_duration: Duration, + /// Duration it took to resume() all vCPUs on the previous iteration. + previous_resume_duration: Duration, +} + +impl TimesliceContext { + /// The initial timeslice for a throttling cycle (vCPU pause & resume). + const INITIAL_TIMESLICE: Duration = Duration::from_millis(100); + + /// The minimal value for the operations. + /// + /// Any value smaller than this is upgraded to this to prevent math + /// exceptions during timing calculations. + const MIN_DURATION: Duration = Duration::from_millis(1); + + /// Maximum time slice. This should not be too big. + /// + /// Otherwise, for example: Assuming we have 10% throttling and + /// 2000ms time slice, then the VM will be unresponsive for + /// 200ms every 1800ms. This is not convenient. + const MAX_TIMESLICE: Duration = Duration::from_millis(800); + + /// Creates a new instance with [`Self::INITIAL_TIMESLICE`]. + fn new() -> Self { + Self { + current_timeslice: Self::INITIAL_TIMESLICE, + previous_pause_duration: Self::MIN_DURATION, + previous_resume_duration: Self::MIN_DURATION, + } + } + + /// Updates the timeslice. + fn update_timeslice(&mut self) { + // CpuManager::pause() plus CpuManager::resume() without additional delay is the shortest + // we can get.
+ let one_percent = self.previous_pause_duration + self.previous_resume_duration; + self.current_timeslice = one_percent * 100; + self.current_timeslice = min(self.current_timeslice, Self::MAX_TIMESLICE); + } + + /// Calculates the sleep durations for after the `pause()` and `resume()` operations with + /// the current `timeslice`. + /// + /// It uses the `timeslice` that was calculated on the previous + /// invocation of [`Self::update_timeslice`]. + fn calc_sleep_durations( + &mut self, + percentage: u64, + ) -> ( + Duration, /* after pause */ + Duration, /* after resume */ + ) { + assert!(percentage <= 100); + assert!(percentage > 0); + + let timeslice_ms = self.current_timeslice.as_millis() as u64; + let wait_ms_after_pause_ms = timeslice_ms * percentage / 100; + let wait_ms_after_resume_ms = timeslice_ms - wait_ms_after_pause_ms; + + let wait_ms_after_pause_ms = + wait_ms_after_pause_ms.saturating_sub(self.previous_pause_duration.as_millis() as u64); + let wait_ms_after_resume_ms = wait_ms_after_resume_ms + .saturating_sub(self.previous_resume_duration.as_millis() as u64); + + ( + Duration::from_millis(wait_ms_after_pause_ms), + Duration::from_millis(wait_ms_after_resume_ms), + ) + } + + /// Set the previous pause duration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_pause_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION; + } + + self.previous_pause_duration = duration; + } + + /// Set the duration it took to `resume()` all vCPUs on the previous iteration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_resume_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION; + } + self.previous_resume_duration = duration; + } +} + +/// Context of the vCPU throttle thread. 
+// The main justification for this dedicated type is to split the thread
+// functions from the higher-level control API.
+// TODO seccomp is missing
+pub struct ThrottleWorker {
+    // Join handle of the worker thread. It is `Some` from `spawn()` until
+    // `drop()` takes it in order to join the thread.
+    handle: Option>,
+}
+
+impl ThrottleWorker {
+    /// Name of the worker thread.
+    ///
+    /// This should not be named "vcpu*" as libvirt fails when
+    /// iterating the vCPU threads then. Fix this first in libvirt!
+    const THREAD_NAME: &'static str = "throttle-vcpu";
+
+    /// Executes the provided callback and then sleeps for up to
+    /// `sleep_duration`.
+    ///
+    /// The time to execute the callback itself is not taken into account
+    /// when sleeping for `sleep_duration`. Therefore, the callback is
+    /// supposed to be quick (a couple of milliseconds).
+    ///
+    /// The thread is interruptible during the sleep phase when the `receiver`
+    /// receives a new [`ThrottleCommand`].
+    ///
+    /// # Arguments
+    /// - `callback`: Function to run.
+    /// - `set_callback_duration`: Invoked once with the measured run time of
+    ///   `callback`, before the sleep phase starts.
+    /// - `sleep_duration`: Upper bound for the sleep phase that follows the
+    ///   callback. The run time of `callback` comes on top of it.
+    /// - `receiver`: Receiving end of the channel to the migration managing
+    ///   thread.
+    ///
+    /// # Returns
+    /// - `Some(command)` if a command arrived before the sleep phase ended.
+    /// - `None` if the sleep phase timed out without a new command.
+    ///
+    /// # Panics
+    /// Panics if the sending end of the channel was dropped.
+    fn execute_and_wait_interruptible(
+        callback: &impl Fn(),
+        mut set_callback_duration: impl FnMut(Duration),
+        sleep_duration: Duration,
+        receiver: &mpsc::Receiver,
+    ) -> Option {
+        let begin = Instant::now();
+        callback();
+        let cb_duration = begin.elapsed();
+        // Help to adjust the timeslice in the next cycle.
+        set_callback_duration(cb_duration);
+
+        // It might happen that sometimes we get interrupted during a sleep phase
+        // with a new higher throttle percentage but this is negligible. For an
+        // auto-converge cycle, there are typically only ~10 steps involved over
+        // a time frame from a couple of seconds up to a couple of minutes.
+        match receiver.recv_timeout(sleep_duration) {
+            Ok(next_task) => Some(next_task),
+            Err(RecvTimeoutError::Timeout) => None,
+            Err(RecvTimeoutError::Disconnected) => {
+                panic!("thread and channel should exit gracefully")
+            }
+        }
+    }
+
+    /// Executes one throttling step: either pause or resume of vCPUs.
+    ///
+    /// Runs the given callback, then waits for the specified duration, unless
+    /// interrupted by a new [`ThrottleCommand`].
+    ///
+    /// # Behavior
+    /// - Runs the provided `callback` immediately.
+    /// - Waits up to `duration` for new commands on the `receiver`.
+    /// - If no command arrives before the timeout, this step completes
+    ///   normally and returns `None`.
+    /// - If a [`ThrottleCommand::Throttling`] arrives, stores the new throttle
+    ///   percentage in `current_throttle` and returns `None` right away, so
+    ///   the remaining wait time of this step is skipped.
+    /// - If a [`ThrottleCommand::Waiting`] or [`ThrottleCommand::Exiting`]
+    ///   arrives, this command is forwarded to the caller.
+    ///
+    /// # Arguments
+    /// - `callback`: Function to run (e.g., pause or resume vCPUs).
+    /// - `set_callback_duration`: Invoked with the measured run time of
+    ///   `callback`.
+    /// - `duration`: Maximum time to wait for a new command after `callback`
+    ///   returned.
+    /// - `receiver`: Channel for receiving new [`ThrottleCommand`]s.
+    /// - `current_throttle`: Mutable reference to the current throttle
+    ///   percentage (updated on [`ThrottleCommand::Throttling`]).
+    ///
+    /// # Returns
+    /// - `None` if the throttling cycle should continue.
+    /// - `Some(ThrottleCommand::Waiting | ThrottleCommand::Exiting)` if
+    ///   throttling should stop.
+ fn throttle_step( + callback: &F, + set_callback_duration: impl FnMut(Duration), + duration: Duration, + receiver: &mpsc::Receiver, + current_throttle: &mut u64, + ) -> Option + where + F: Fn(), + { + let maybe_task = Self::execute_and_wait_interruptible( + callback, + set_callback_duration, + duration, + receiver, + ); + match maybe_task { + None => None, + Some(ThrottleCommand::Throttling(next)) => { + // A new throttle value is only applied at the end of a full + // throttling cycle. This is fine and negligible in a series of + // (tens of) thousands of cycles. + *current_throttle = next as u64; + None + } + Some(cmd @ (ThrottleCommand::Exiting | ThrottleCommand::Waiting)) => Some(cmd), + } + } + + /// Helper for [`Self::control_loop`] that runs the actual throttling loop. + /// + /// This function returns the next [`ThrottleCommand`] **only** if the thread + /// stopped the vCPU throttling. + fn throttle_loop( + receiver: &mpsc::Receiver, + initial_throttle: u8, + callback_pause_vcpus: &impl Fn(), + callback_resume_vcpus: &impl Fn(), + ) -> ThrottleCommand { + // The current throttle value, as long as the thread is throttling. + let mut current_throttle = initial_throttle as u64; + let mut timeslice_ctx = TimesliceContext::new(); + + loop { + // Catch logic bug: We should have exited in this case already. + assert_ne!(current_throttle, 0); + assert!(current_throttle < 100); + + let (wait_ms_after_pause, wait_ms_after_resume) = + timeslice_ctx.calc_sleep_durations(current_throttle); + + // pause vCPUs + if let Some(cmd) = Self::throttle_step( + callback_pause_vcpus, + |new_duration| timeslice_ctx.set_previous_pause_duration(new_duration), + wait_ms_after_pause, + receiver, + &mut current_throttle, + ) { + // TODO: future optimization + // Prevent unnecessary resume() here when the migration thread + // performs .pause() right after anyway. We could make .pause() and + // .resume() idempotent. 
+ callback_resume_vcpus(); + // We only exit here in case if ThrottleCommand::Waiting or ::Exiting + return cmd; + } + + // resume vCPUs + if let Some(cmd) = Self::throttle_step( + callback_resume_vcpus, + |new_duration| timeslice_ctx.set_previous_resume_duration(new_duration), + wait_ms_after_resume, + receiver, + &mut current_throttle, + ) { + // We only exit here in case if ThrottleCommand::Waiting or ::Exiting + return cmd; + } + + // Update timeslice for next cycle. This way, we can closely match the expected + // percentage for pause() and resume(). + timeslice_ctx.update_timeslice(); + } + } + + /// Implements the control loop of the thread. + /// + /// It wraps the actual throttling with the necessary thread lifecycle + /// management. + fn control_loop( + receiver: mpsc::Receiver, + callback_pause_vcpus: impl Fn() + Send + 'static, + callback_resume_vcpus: impl Fn() + Send + 'static, + ) -> impl Fn() { + move || { + // In the outer loop, we gracefully wait for commands. + 'control: loop { + let thread_task = receiver.recv().expect("channel should not be closed"); + match thread_task { + ThrottleCommand::Exiting => { + break 'control; + } + ThrottleCommand::Waiting => { + continue 'control; + } + ThrottleCommand::Throttling(initial_throttle) => { + let next_task = Self::throttle_loop( + &receiver, + initial_throttle, + &callback_pause_vcpus, + &callback_resume_vcpus, + ); + if next_task == ThrottleCommand::Exiting { + break 'control; + } + // else: thread is in Waiting state + } + } + } + debug!("thread exited gracefully"); + } + } + + /// Spawns a new thread. 
+    ///
+    /// The thread is named [`Self::THREAD_NAME`] and runs the closure built by
+    /// [`Self::control_loop`].
+    ///
+    /// # Panics
+    /// Panics if the thread cannot be spawned.
+    fn spawn(
+        receiver: mpsc::Receiver,
+        callback_pause_vcpus: impl Fn() + Send + 'static,
+        callback_resume_vcpus: impl Fn() + Send + 'static,
+    ) -> Self {
+        let handle = {
+            let thread_fn =
+                Self::control_loop(receiver, callback_pause_vcpus, callback_resume_vcpus);
+            thread::Builder::new()
+                .name(String::from(Self::THREAD_NAME))
+                .spawn(thread_fn)
+                .expect("should spawn thread")
+        };
+
+        Self {
+            handle: Some(handle),
+        }
+    }
+}
+
+impl Drop for ThrottleWorker {
+    /// Joins the worker thread.
+    ///
+    /// # Panics
+    /// Panics if the worker thread itself panicked.
+    fn drop(&mut self) {
+        // Note: The owner of the channel's sending end must have sent
+        // `ThrottleCommand::Exiting` before this runs. Otherwise `join()`
+        // waits on a worker that is still blocked receiving commands.
+        if let Some(handle) = self.handle.take() {
+            handle.join().expect("thread should have succeeded");
+        }
+    }
+}
+
+/// Handler for controlling the vCPU throttle thread.
+///
+/// vCPU throttling is needed for live-migration of memory-intensive workloads.
+/// The current design assumes that all vCPUs are throttled equally.
+///
+/// # Transitions
+/// - `Waiting` -> `Throttling(x %)`, `Exiting`
+/// - `Throttling(x %)` -> `Exiting`, `Waiting`, `Throttling(y %)`
+/// - `Exiting`
+pub struct ThrottleThreadHandle {
+    /// Sending end of the command channel to the worker thread.
+    state_sender: mpsc::Sender,
+    /// Current throttle value.
+    ///
+    /// This is the last throttle value that was sent to the
+    /// thread.
+    current_throttle: Cell,
+    /// The underlying thread handle. Option to have more control over when it is dropped.
+    throttle_thread: Option,
+}
+
+impl ThrottleThreadHandle {
+    /// Spawns a new thread and returns a handle to it.
+    ///
+    /// # Parameters
+    /// - `cpu_manager`: CPU manager to pause and resume vCPUs
+    pub fn new_from_cpu_manager(cpu_manager: &Arc>) -> Self {
+        // Both callbacks run on the worker thread. They panic there if the
+        // lock is poisoned or if pausing/resuming the vCPUs fails.
+        let callback_pause_vcpus = {
+            let cpu_manager = cpu_manager.clone();
+            Box::new(move || cpu_manager.lock().unwrap().pause().unwrap())
+        };
+
+        let callback_resume_vcpus = {
+            let cpu_manager = cpu_manager.clone();
+            Box::new(move || cpu_manager.lock().unwrap().resume().unwrap())
+        };
+
+        Self::new(callback_pause_vcpus, callback_resume_vcpus)
+    }
+
+    /// Spawns a new thread and returns a handle to it.
+    ///
+    /// The freshly spawned worker blocks until it receives its first command,
+    /// which corresponds to [`ThrottleCommand::Waiting`]. This function does
+    /// not synchronize with the worker; it returns right after spawning it.
+    ///
+    /// # Parameters
+    /// - `callback_pause_vcpus`: Function putting all vCPUs into pause state. The
+    ///   function must not perform any artificial delay itself.
+    /// - `callback_resume_vcpus`: Function putting all vCPUs back into running
+    ///   state. The function must not perform any artificial delay itself.
+    fn new(
+        callback_pause_vcpus: Box,
+        callback_resume_vcpus: Box,
+    ) -> Self {
+        // Channel used to send commands to the worker thread.
+        let (sender, receiver) = mpsc::channel::();
+
+        let thread = ThrottleWorker::spawn(receiver, callback_pause_vcpus, callback_resume_vcpus);
+
+        Self {
+            state_sender: sender,
+            // 0 % means "not throttling", i.e., the worker is waiting.
+            current_throttle: Cell::new(0),
+            throttle_thread: Some(thread),
+        }
+    }
+
+    /// Sets the throttle percentage to a value in range `0..=99` and updates
+    /// the thread's state.
+    ///
+    /// Setting the value back to `0` equals setting the thread back into
+    /// [`ThrottleCommand::Waiting`].
+    ///
+    /// In case of an ongoing throttling cycle (vCPU pause & resume), any new
+    /// throttling percentage will be applied no later than when the current cycle
+    /// ends.
+    ///
+    /// # Panics
+    /// Panics, if `percent_new` is not in range `0..=99`.
+    pub fn set_throttle_percent(&self, percent_new: u8) {
+        // 100 must be rejected as well: the worker's throttle loop asserts
+        // that the throttle value is strictly below 100. Letting 100 through
+        // here would panic the worker thread instead of the caller.
+        assert!(
+            percent_new < 100,
+            "setting a percentage of 100 or above is not allowed: {percent_new}%"
+        );
+
+        // We have no problematic race condition here as in normal operation
+        // there is exactly one thread calling these functions.
+        let percent_old = self.throttle_percent();
+
+        // Return early, no action needed.
+        if percent_old == percent_new {
+            return;
+        }
+
+        // 0 % is not a throttle value for the worker: it means "stop
+        // throttling and wait for the next command".
+        if percent_new == 0 {
+            self.state_sender
+                .send(ThrottleCommand::Waiting)
+                .expect("channel should not be closed");
+        } else {
+            self.state_sender
+                .send(ThrottleCommand::Throttling(percent_new))
+                .expect("channel should not be closed");
+        }
+
+        // Remember the value only after it was handed to the worker.
+        self.current_throttle.set(percent_new);
+    }
+
+    /// Get the current throttle percentage in range `0..=99`.
+    ///
+    /// Please note that the value is not synchronized.
+    pub fn throttle_percent(&self) -> u8 {
+        self.current_throttle.get()
+    }
+
+    /// Stops and terminates the thread gracefully.
+    ///
+    /// Waits for the thread to finish. This function **must** be called before
+    /// the migration thread(s) do anything with the CPU manager to prevent
+    /// odd states.
+    pub fn shutdown(&mut self) {
+        let begin = Instant::now();
+
+        {
+            // drop thread; ensure that the channel is still alive when it is dropped
+            if let Some(worker) = self.throttle_thread.take() {
+                self.state_sender
+                    .send(ThrottleCommand::Exiting)
+                    .expect("channel should not be closed");
+
+                // Ensure the sender is still living when this is dropped.
+ drop(worker); + } + } + + let elapsed = begin.elapsed(); + if elapsed > Duration::from_millis(20) { + warn!( + "shutting down thread takes too long ({} ms): this increases the downtime!", + elapsed.as_millis() + ); + } + } +} + +impl Drop for ThrottleThreadHandle { + fn drop(&mut self) { + self.shutdown(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicBool, Ordering}; + use std::thread::sleep; + + use super::*; + + // The test is successful if it does not get stuck. Then, the thread exits + // gracefully. + #[test] + fn test_vcpu_throttling_thread_lifecycle() { + for _ in 0..5 { + // State transitions: Waiting -> Exit + { + let mut handler = ThrottleThreadHandle::new(Box::new(|| {}), Box::new(|| {})); + + // The test is successful if it does not get stuck. + handler.shutdown(); + } + + // Dummy CpuManager + let cpus_throttled = Arc::new(AtomicBool::new(false)); + let callback_pause_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(true, Ordering::SeqCst); + assert!(!old); + }) + }; + let callback_resume_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(false, Ordering::SeqCst); + assert!(old); + }) + }; + + // State transitions: Waiting -> Throttle -> Waiting -> Throttle -> Exit + { + let mut handler = + ThrottleThreadHandle::new(callback_pause_vcpus, callback_resume_vcpus); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // Assume we aborted vCPU throttling (or the live-migration at all). + handler.set_throttle_percent(0 /* reset to waiting */); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // The test is successful if we don't have a panic here due to a + // closed channel. 
+ for _ in 0..10 { + handler.shutdown(); + sleep(Duration::from_millis(1)); + } + + // The test is successful if it does not get stuck. + drop(handler); + } + } + } +} diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 8660cfb2a6..d8edffe3d3 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -112,6 +112,7 @@ use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; target_arch = "x86_64" ))] use crate::sev::MeasuredBootInfo; +use crate::vcpu_throttling::ThrottleThreadHandle; #[cfg(feature = "fw_cfg")] use crate::vm_config::FwCfgConfig; use crate::vm_config::{ @@ -540,6 +541,7 @@ pub struct Vm { hypervisor: Arc, stop_on_boot: bool, load_payload_handle: Option>>, + vcpu_throttler: ThrottleThreadHandle, } impl Vm { @@ -699,6 +701,10 @@ impl Vm { VmState::Created }; + // TODO we could also spawn the thread when a migration with auto-converge starts. + // Probably this is the better design. + let vcpu_throttler = ThrottleThreadHandle::new_from_cpu_manager(&cpu_manager); + Ok(Vm { #[cfg(feature = "tdx")] kernel, @@ -718,6 +724,7 @@ impl Vm { hypervisor, stop_on_boot, load_payload_handle, + vcpu_throttler, }) } @@ -1334,6 +1341,31 @@ impl Vm { Ok(numa_nodes) } + /// Set's the throttle percentage to a value in range `0..=99`. + /// + /// Setting the value back to `0` brings the thread back into a waiting + /// state. + /// + /// # Panic + /// Panics, if `percent_new` is not in range `0..=99`. + pub fn set_throttle_percent(&self, percent: u8 /* 1..=99 */) { + self.vcpu_throttler.set_throttle_percent(percent); + } + + /// Get the current throttle percentage in range `0..=99`. + /// + /// Please note that the value is not synchronized. + pub fn throttle_percent(&self) -> u8 { + self.vcpu_throttler.throttle_percent() + } + + /// Stops and terminates the thread gracefully. + /// + /// Waits for the thread to finish. 
+ pub fn stop_vcpu_throttling(&mut self) { + self.vcpu_throttler.shutdown(); + } + #[allow(clippy::too_many_arguments)] pub fn new( vm_config: Arc>, From 37505a87a7a95ad224c7fba940de6f69b49e9a8d Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Thu, 11 Sep 2025 13:25:21 +0200 Subject: [PATCH 038/178] vmm: console: add tcp option In addition to configuration options like pty, file, tty, ... we allow setting the serial device to be accessed via some open TCP port on the host. Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- cloud-hypervisor/src/main.rs | 2 + fuzz/Cargo.lock | 208 +++++++++++++++++++++++++++++++++- fuzz/fuzz_targets/http_api.rs | 2 + vmm/src/config.rs | 29 ++++- vmm/src/console_devices.rs | 3 + vmm/src/lib.rs | 2 + vmm/src/vm_config.rs | 4 + 7 files changed, 244 insertions(+), 6 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 42b9d1f397..270e0e2ee8 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -1022,6 +1022,7 @@ mod unit_tests { file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -1029,6 +1030,7 @@ mod unit_tests { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 3b3011114f..5e168d1560 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -53,7 +53,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -64,7 +64,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -113,6 +113,28 @@ version = "1.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "bitfield-struct" version = "0.13.0" @@ -262,6 +284,15 @@ dependencies = [ "vmm-sys-util", ] +[[package]] +name = "cmake" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -396,6 +427,12 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -445,7 +482,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -508,6 +545,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures-core" 
version = "0.3.32" @@ -562,6 +605,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -1086,6 +1140,55 @@ dependencies = [ "syn", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "aws-lc-rs", + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -1252,6 +1355,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.117" @@ -1352,6 +1461,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "utf8parse" version = "0.2.2" @@ -1524,6 +1639,7 @@ version = "0.1.0" dependencies = [ "anyhow", "itertools", + "rustls", "serde", "serde_json", "thiserror", @@ -1557,6 +1673,7 @@ dependencies = [ "gdbstub_arch", "hypervisor", "iommufd-ioctls", + "kvm-bindings", "landlock", "libc", "linux-loader", @@ -1601,6 +1718,12 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "wasip2" version = "1.0.3+wasi-0.2.9" @@ -1726,6 +1849,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -1735,6 +1867,70 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + 
"windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "winnow" version = "1.0.2" @@ -1858,6 +2054,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zmij" version = "1.0.21" diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 07a1effa4d..34d24de77a 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -176,6 +176,7 @@ impl RequestHandler for StubApiRequestHandler { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -183,6 +184,7 @@ impl RequestHandler for StubApiRequestHandler { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, diff --git a/vmm/src/config.rs b/vmm/src/config.rs index e8b00fc81d..d0226b01b7 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -222,6 +222,9 @@ pub enum ValidationError { /// Missing file value for console #[error("Path missing when using file console mode")] ConsoleFileMissing, + /// Missing TCP address for console + #[error("Address missing when using TCP console mode")] + ConsoleTcpAddressMissing, /// Missing socket path for console #[error("Path missing when using socket console mode")] ConsoleSocketPathMissing, @@ -2141,7 +2144,7 @@ impl PmemConfig { } impl CommonConsoleConfig { - const VALUELESS_OPTIONS: &[&str] = &["off", "pty", "tty", "null"]; + const VALUELESS_OPTIONS: &[&str] = &["off", "pty", "tty", "null", "tcp"]; const VALUE_OPTIONS: &[&str] = &["file", "socket"]; fn parse(console: &str, map_err: impl Fn(OptionParserError) -> Error) -> Result { @@ -2153,6 +2156,7 @@ impl CommonConsoleConfig { let mut file: Option = None; let mut socket: Option = None; + let mut url: Option = None; let mut mode: ConsoleOutputMode = ConsoleOutputMode::Off; if parser.is_set("off") { @@ -2168,6 +2172,13 @@ impl CommonConsoleConfig { Some(PathBuf::from(parser.get("file").ok_or( Error::Validation(ValidationError::ConsoleFileMissing), )?)); + } else if 
parser.is_set("tcp") { + mode = ConsoleOutputMode::Tcp; + url = Some( + parser + .get("tcp") + .ok_or(Error::Validation(ValidationError::ConsoleTcpAddressMissing))?, + ); } else if parser.is_set("socket") { mode = ConsoleOutputMode::Socket; socket = Some(PathBuf::from(parser.get("socket").ok_or( @@ -2177,7 +2188,12 @@ impl CommonConsoleConfig { return Err(Error::ParseConsoleInvalidModeGiven); } - Ok(Self { mode, file, socket }) + Ok(Self { + mode, + file, + socket, + url, + }) } } @@ -4445,7 +4461,12 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" #[test] fn test_console_parsing() -> Result<()> { let console_config = |mode, file, socket, iommu| ConsoleConfig { - common: CommonConsoleConfig { file, mode, socket }, + common: CommonConsoleConfig { + file, + mode, + socket, + url: None, + }, pci_common: PciDeviceCommonConfig { iommu, ..Default::default() @@ -5084,6 +5105,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -5091,6 +5113,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index a1f3493fd2..a1e0624b48 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -227,6 +227,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleTransport::Null, ConsoleOutputMode::Null => ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, }, @@ -264,6 +265,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult ConsoleTransport::Null, ConsoleOutputMode::Null => ConsoleTransport::Null, ConsoleOutputMode::Off => 
ConsoleTransport::Off, }, @@ -290,6 +292,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleTransport::Null, ConsoleOutputMode::Null => ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, }, diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 5aef732c08..137f08164a 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2819,6 +2819,7 @@ mod unit_tests { file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, }, console: ConsoleConfig { @@ -2827,6 +2828,7 @@ mod unit_tests { // Caution: Don't use `Tty` to not mess with users terminal mode: ConsoleOutputMode::Off, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), }, diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index f612911244..a022a1a23b 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -542,6 +542,7 @@ pub enum ConsoleOutputMode { Tty, File, Socket, + Tcp, Null, } @@ -555,6 +556,7 @@ pub struct CommonConsoleConfig { pub mode: ConsoleOutputMode, #[serde(default)] pub socket: Option, + pub url: Option, } impl ApplyLandlock for CommonConsoleConfig { @@ -591,6 +593,7 @@ impl Default for SerialConfig { file: None, mode: ConsoleOutputMode::Null, socket: None, + url: None, }, } } @@ -622,6 +625,7 @@ impl Default for ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, socket: None, + url: None, }, pci_common: PciDeviceCommonConfig::default(), } From c068d9d7f1863a5c87606df1ea71e5c232955ba5 Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Thu, 11 Sep 2025 14:40:22 +0200 Subject: [PATCH 039/178] vmm: serial: add Tcp enum entry Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- vmm/src/serial_manager.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 27f359bec6..a1c980b443 100644 --- 
a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -93,9 +93,10 @@ pub enum EpollDispatch { File = 0, Kill = 1, Socket = 2, + Tcp = 3, Unknown, } -const EPOLL_EVENTS_LEN: usize = 4; +const EPOLL_EVENTS_LEN: usize = 5; impl From for EpollDispatch { fn from(v: u64) -> Self { @@ -104,6 +105,7 @@ impl From for EpollDispatch { 0 => File, 1 => Kill, 2 => Socket, + 3 => Tcp, _ => Unknown, } } @@ -347,6 +349,7 @@ impl SerialManager { reader = Some(unix_stream); serial.lock().unwrap().set_out(Some(Box::new(writer))); } + EpollDispatch::Tcp => {} EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; From 67cc1af54d39b50fc3949ca13df31f5da844fe22 Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Thu, 11 Sep 2025 16:01:26 +0200 Subject: [PATCH 040/178] vmm: serial: rename CloneUnixStream -> CloneStream Renaming the error makes it also usable for the new TCP socket support in the serial device. Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- vmm/src/serial_manager.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index a1c980b443..84d09ad2f7 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -69,9 +69,9 @@ pub enum Error { #[error("Error accepting connection")] AcceptConnection(#[source] io::Error), - /// Cannot clone the UnixStream - #[error("Error cloning UnixStream")] - CloneUnixStream(#[source] io::Error), + /// Cannot clone the Stream + #[error("Error cloning Stream")] + CloneStream(#[source] io::Error), /// Cannot shutdown the connection #[error("Error shutting down a connection")] @@ -333,7 +333,7 @@ impl SerialManager { let (unix_stream, _) = listener.accept().map_err(Error::AcceptConnection)?; let writer = - unix_stream.try_clone().map_err(Error::CloneUnixStream)?; + unix_stream.try_clone().map_err(Error::CloneStream)?; epoll::ctl( epoll_fd.as_raw_fd(), From 
eae34fe2eb4321ce6e1f491d077153a325d8cd1f Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Thu, 11 Sep 2025 15:01:17 +0200 Subject: [PATCH 041/178] vmm: serial: add serial tcp implementation Similar to the unix socket implementation, we allow a user to set a port where the serial is reachable. Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- vmm/src/console_devices.rs | 16 ++++++++- vmm/src/device_manager.rs | 10 ++++-- vmm/src/serial_manager.rs | 70 ++++++++++++++++++++++++++++++++++---- 3 files changed, 87 insertions(+), 9 deletions(-) diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index a1e0624b48..c720a2f6bf 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -12,6 +12,7 @@ use std::fs::{File, OpenOptions, read_link}; use std::mem::zeroed; +use std::net::TcpListener; use std::os::fd::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::net::UnixListener; @@ -40,6 +41,10 @@ pub enum ConsoleDeviceError { #[error("No socket option support for console device")] NoSocketOptionSupportForConsoleDevice, + /// Error parsing the TCP address + #[error("Wrong TCP address format: {0}")] + WrongTcpAddressFormat(std::string::String), + /// Error setting pty raw mode #[error("Error setting pty raw mode")] SetPtyRaw(#[source] vmm_sys_util::errno::Error), @@ -62,6 +67,7 @@ pub enum ConsoleTransport { Tty(Arc), Null, Socket(Arc), + Tcp(Arc), Off, } @@ -265,7 +271,15 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult ConsoleTransport::Null, + ConsoleOutputMode::Tcp => { + let url = vmconfig.serial.common.url.as_ref().unwrap(); + let socket_addr: std::net::SocketAddr = url + .parse() + .map_err(|_| ConsoleDeviceError::WrongTcpAddressFormat(url.to_string()))?; + let listener = TcpListener::bind(socket_addr) + .map_err(ConsoleDeviceError::CreateConsoleDevice)?; + ConsoleTransport::Tcp(Arc::new(listener)) + } ConsoleOutputMode::Null => 
ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, }, diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 0301c849ca..6957548b5b 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2439,6 +2439,9 @@ impl DeviceManager { ConsoleTransport::Socket(_) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } + ConsoleTransport::Tcp(_) => { + return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); + } ConsoleTransport::Null => Endpoint::Null, ConsoleTransport::Off => return Ok(None), }; @@ -2520,6 +2523,7 @@ impl DeviceManager { | ConsoleTransport::Null | ConsoleTransport::Pty(_) | ConsoleTransport::Socket(_) => None, + ConsoleTransport::Tcp(_) => None, }; if !matches!(console_info.serial, ConsoleTransport::Off) { @@ -2527,7 +2531,8 @@ impl DeviceManager { self.serial_manager = match console_info.serial { ConsoleTransport::Pty(_) | ConsoleTransport::Tty(_) - | ConsoleTransport::Socket(_) => { + | ConsoleTransport::Socket(_) + | ConsoleTransport::Tcp(_) => { let serial_manager = SerialManager::new( serial, console_info.serial, @@ -2558,7 +2563,8 @@ impl DeviceManager { ConsoleTransport::Off | ConsoleTransport::Null | ConsoleTransport::Pty(_) - | ConsoleTransport::Socket(_) => None, + | ConsoleTransport::Socket(_) + | ConsoleTransport::Tcp(_) => None, }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 84d09ad2f7..387a89c150 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -5,7 +5,7 @@ use std::fs::File; use std::io::Read; -use std::net::Shutdown; +use std::net::{Shutdown, TcpStream}; use std::os::fd::OwnedFd; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::os::unix::net::UnixStream; @@ -181,13 +181,14 @@ impl SerialManager { } listener.as_raw_fd() } + ConsoleTransport::Tcp(ref listener) => listener.as_raw_fd(), _ => return 
Ok(None), }; - let in_event = if let ConsoleTransport::Socket(_) = transport { - EpollDispatch::Socket - } else { - EpollDispatch::File + let in_event = match &transport { + ConsoleTransport::Socket(_) => EpollDispatch::Socket, + ConsoleTransport::Tcp(_) => EpollDispatch::Tcp, + _ => EpollDispatch::File, }; epoll::ctl( @@ -264,6 +265,7 @@ impl SerialManager { let serial = self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); let mut reader: Option = None; + let mut reader_tcp: Option = None; // In case of PTY, we want to be able to detect a connection on the // other end of the PTY. This is done by detecting there's no event @@ -349,7 +351,40 @@ impl SerialManager { reader = Some(unix_stream); serial.lock().unwrap().set_out(Some(Box::new(writer))); } - EpollDispatch::Tcp => {} + EpollDispatch::Tcp => { + // New connection request arrived. + // Shutdown the previous connection, if any + if let Some(ref previous_reader) = reader_tcp { + previous_reader + .shutdown(Shutdown::Both) + .map_err(Error::AcceptConnection)?; + } + + let ConsoleTransport::Tcp(ref listener) = transport else { + unreachable!(); + }; + + // Events on the listening socket will be connection requests. + // Accept them, create a reader and a writer. 
+ let (tcp_stream, _) = + listener.accept().map_err(Error::AcceptConnection)?; + let writer = + tcp_stream.try_clone().map_err(Error::CloneStream)?; + + epoll::ctl( + epoll_fd.as_raw_fd(), + epoll::ControlOptions::EPOLL_CTL_ADD, + tcp_stream.as_raw_fd(), + epoll::Event::new( + epoll::Events::EPOLLIN, + EpollDispatch::File as u64, + ), + ) + .map_err(Error::Epoll)?; + + reader_tcp = Some(tcp_stream); + serial.lock().unwrap().set_out(Some(Box::new(writer))); + } EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; @@ -376,6 +411,29 @@ impl SerialManager { 0 } } + ConsoleTransport::Tcp(_) => { + if let Some(mut serial_reader) = reader_tcp.as_ref() + { + let count = serial_reader + .read(&mut input) + .map_err(Error::ReadInput)?; + if count == 0 { + info!("Remote end closed serial socket"); + serial_reader + .shutdown(Shutdown::Both) + .map_err(Error::ShutdownConnection)?; + reader_tcp = None; + serial + .as_ref() + .lock() + .unwrap() + .set_out(None); + } + count + } else { + 0 + } + } ConsoleTransport::Pty(file) | ConsoleTransport::Tty(file) => (&**file) .read(&mut input) From 6d3b70a69f5f3662a05a877e61551aef522f1334 Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Tue, 16 Sep 2025 14:13:46 +0200 Subject: [PATCH 042/178] vmm: serial: implement writer combiner When using the TCP serial mode, we also want to write the serial output to a file. We use a generic write combiner that would allow us adding even more writers. 
Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- vmm/src/serial_manager.rs | 72 ++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 387a89c150..4ea4e2b602 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::collections::HashMap; use std::fs::File; -use std::io::Read; +use std::io::{Read, Write}; use std::net::{Shutdown, TcpStream}; use std::os::fd::OwnedFd; use std::os::unix::io::{AsRawFd, FromRawFd}; @@ -111,6 +112,58 @@ impl From for EpollDispatch { } } +/// A thread-safe writer that fans out to multiple keyed writers. Allows for +/// bundling different kinds of writers for the serial device, e.g. writing to +/// a TCP socket and a file. +#[derive(Clone)] +pub struct FanoutWriter { + writers: Arc>>>, +} + +impl FanoutWriter { + pub fn new() -> Self { + FanoutWriter { + writers: Arc::new(Mutex::new(HashMap::new())), + } + } + + pub fn add_writer(&self, key: String, writer: W) { + let mut writers = self.writers.lock().unwrap(); + writers.insert(key, Box::new(writer)); + } + + pub fn remove_writer(&self, key: &str) -> Option> { + let mut writers = self.writers.lock().unwrap(); + writers.remove(key) + } +} + +impl Write for FanoutWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + let mut writers = self.writers.lock().unwrap(); + let mut result: io::Result = Ok(buf.len()); + + for (i, w) in writers.values_mut().enumerate() { + let r = w.write(buf); + if i == 0 { + result = r; + } else { + r?; + } + } + + result + } + + fn flush(&mut self) -> io::Result<()> { + let mut writers = self.writers.lock().unwrap(); + for w in writers.values_mut() { + w.flush()?; + } + Ok(()) + } +} + pub struct SerialManager { #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, @@ -279,8 +332,15 @@ impl SerialManager { 
.name("serial-manager".to_string()) .spawn(move || { std::panic::catch_unwind(AssertUnwindSafe(move || { + let write_distributor = FanoutWriter::new(); + let mut events = [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; + serial + .as_ref() + .lock() + .unwrap() + .set_out(Some(Box::new(write_distributor.clone()))); loop { let num_events = @@ -358,6 +418,7 @@ impl SerialManager { previous_reader .shutdown(Shutdown::Both) .map_err(Error::AcceptConnection)?; + write_distributor.remove_writer("tcp"); } let ConsoleTransport::Tcp(ref listener) = transport else { @@ -381,9 +442,8 @@ impl SerialManager { ), ) .map_err(Error::Epoll)?; - reader_tcp = Some(tcp_stream); - serial.lock().unwrap().set_out(Some(Box::new(writer))); + write_distributor.add_writer("tcp".into(), writer); } EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { @@ -423,11 +483,7 @@ impl SerialManager { .shutdown(Shutdown::Both) .map_err(Error::ShutdownConnection)?; reader_tcp = None; - serial - .as_ref() - .lock() - .unwrap() - .set_out(None); + write_distributor.remove_writer("tcp"); } count } else { From 4c396f3c08ac45c2ea88d39ce267bf419ad14bd4 Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Wed, 17 Sep 2025 08:42:29 +0200 Subject: [PATCH 043/178] vmm: config: allow additional file when mode TCP When using the newly added TCP serial mode, we allow specifying an additional file to log into. This allows users to access the complete bootlog of a VM, as the TCP socket does not buffer anything. 
Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- vmm/src/config.rs | 6 ++++++ vmm/src/console_devices.rs | 10 ++++++++-- vmm/src/device_manager.rs | 8 ++++---- vmm/src/serial_manager.rs | 40 +++++++++++++++++++++++++------------- vmm/src/vm_config.rs | 3 ++- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index d0226b01b7..8fb8341c53 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2179,6 +2179,12 @@ impl CommonConsoleConfig { .get("tcp") .ok_or(Error::Validation(ValidationError::ConsoleTcpAddressMissing))?, ); + if parser.is_set("file") { + file = + Some(PathBuf::from(parser.get("file").ok_or( + Error::Validation(ValidationError::ConsoleFileMissing), + )?)); + } } else if parser.is_set("socket") { mode = ConsoleOutputMode::Socket; socket = Some(PathBuf::from(parser.get("socket").ok_or( diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index c720a2f6bf..672d4277b3 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -67,7 +67,7 @@ pub enum ConsoleTransport { Tty(Arc), Null, Socket(Arc), - Tcp(Arc), + Tcp(Arc, Option>), Off, } @@ -278,7 +278,13 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 6957548b5b..ff71462571 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2439,7 +2439,7 @@ impl DeviceManager { ConsoleTransport::Socket(_) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } - ConsoleTransport::Tcp(_) => { + ConsoleTransport::Tcp(_, _) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } ConsoleTransport::Null => Endpoint::Null, @@ -2523,7 +2523,7 @@ impl DeviceManager { | ConsoleTransport::Null | ConsoleTransport::Pty(_) | ConsoleTransport::Socket(_) => None, - 
ConsoleTransport::Tcp(_) => None, + ConsoleTransport::Tcp(_, _) => None, }; if !matches!(console_info.serial, ConsoleTransport::Off) { @@ -2532,7 +2532,7 @@ impl DeviceManager { ConsoleTransport::Pty(_) | ConsoleTransport::Tty(_) | ConsoleTransport::Socket(_) - | ConsoleTransport::Tcp(_) => { + | ConsoleTransport::Tcp(_, _) => { let serial_manager = SerialManager::new( serial, console_info.serial, @@ -2564,7 +2564,7 @@ impl DeviceManager { | ConsoleTransport::Null | ConsoleTransport::Pty(_) | ConsoleTransport::Socket(_) - | ConsoleTransport::Tcp(_) => None, + | ConsoleTransport::Tcp(_, _) => None, }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 4ea4e2b602..830dc30d77 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -234,13 +234,13 @@ impl SerialManager { } listener.as_raw_fd() } - ConsoleTransport::Tcp(ref listener) => listener.as_raw_fd(), + ConsoleTransport::Tcp(ref listener, _) => listener.as_raw_fd(), _ => return Ok(None), }; let in_event = match &transport { ConsoleTransport::Socket(_) => EpollDispatch::Socket, - ConsoleTransport::Tcp(_) => EpollDispatch::Tcp, + ConsoleTransport::Tcp(_, _) => EpollDispatch::Tcp, _ => EpollDispatch::File, }; @@ -332,15 +332,21 @@ impl SerialManager { .name("serial-manager".to_string()) .spawn(move || { std::panic::catch_unwind(AssertUnwindSafe(move || { - let write_distributor = FanoutWriter::new(); + let write_distributor = match &transport { + ConsoleTransport::Tcp(_, _) => { + let distributor = FanoutWriter::new(); + serial + .as_ref() + .lock() + .unwrap() + .set_out(Some(Box::new(distributor.clone()))); + Some(distributor) + } + _ => None, + }; let mut events = [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; - serial - .as_ref() - .lock() - .unwrap() - .set_out(Some(Box::new(write_distributor.clone()))); loop { let num_events = @@ -418,10 +424,12 @@ impl 
SerialManager { previous_reader .shutdown(Shutdown::Both) .map_err(Error::AcceptConnection)?; - write_distributor.remove_writer("tcp"); + if let Some(distributor) = &write_distributor { + distributor.remove_writer("tcp"); + } } - let ConsoleTransport::Tcp(ref listener) = transport else { + let ConsoleTransport::Tcp(ref listener, _) = transport else { unreachable!(); }; @@ -443,7 +451,9 @@ impl SerialManager { ) .map_err(Error::Epoll)?; reader_tcp = Some(tcp_stream); - write_distributor.add_writer("tcp".into(), writer); + if let Some(distributor) = &write_distributor { + distributor.add_writer("tcp".into(), writer); + } } EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { @@ -471,7 +481,7 @@ impl SerialManager { 0 } } - ConsoleTransport::Tcp(_) => { + ConsoleTransport::Tcp(_, _) => { if let Some(mut serial_reader) = reader_tcp.as_ref() { let count = serial_reader @@ -483,7 +493,11 @@ impl SerialManager { .shutdown(Shutdown::Both) .map_err(Error::ShutdownConnection)?; reader_tcp = None; - write_distributor.remove_writer("tcp"); + if let Some(distributor) = + &write_distributor + { + distributor.remove_writer("tcp"); + } } count } else { diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index a022a1a23b..6702772dfa 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -583,7 +583,8 @@ pub struct SerialConfig { } impl SerialConfig { - pub const SYNTAX: &str = "Control serial port: \"off|null|pty|tty|file=|socket=\""; + pub const SYNTAX: &str = + "Control serial port: \"off|null|pty|tty|file=|socket=|tcp=\""; } impl Default for SerialConfig { From 73a98d0fe93d6f4f6be174c8065c2bc898d5f837 Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Wed, 17 Sep 2025 09:37:46 +0200 Subject: [PATCH 044/178] vmm: serial: additional log to file Use the user provided file path that can be additionally specified when TCP mode is selected for serial. 
Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- vmm/src/serial_manager.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 830dc30d77..f191f54af0 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -333,8 +333,11 @@ impl SerialManager { .spawn(move || { std::panic::catch_unwind(AssertUnwindSafe(move || { let write_distributor = match &transport { - ConsoleTransport::Tcp(_, _) => { + ConsoleTransport::Tcp(_, file_opt) => { let distributor = FanoutWriter::new(); + if let Some(file) = file_opt { + distributor.add_writer("file".into(), Arc::clone(file)); + } serial .as_ref() .lock() From 8a65636020bd2df43e2dd7e182a0d7aa94aa19dd Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Mon, 29 Sep 2025 11:00:25 +0200 Subject: [PATCH 045/178] vmm: serial_manager: use TypeId trait for key type in FanoutWriter Signed-off-by: Stefan Kober On-behalf-of: SAP stefan.kober@sap.com --- vmm/src/serial_manager.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index f191f54af0..638a2ff37d 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::any::TypeId; use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write}; @@ -117,7 +118,7 @@ impl From for EpollDispatch { /// a TCP socket and a file. 
#[derive(Clone)] pub struct FanoutWriter { - writers: Arc>>>, + writers: Arc>>>, } impl FanoutWriter { @@ -127,14 +128,14 @@ impl FanoutWriter { } } - pub fn add_writer(&self, key: String, writer: W) { + pub fn add_writer(&self, writer: W) { let mut writers = self.writers.lock().unwrap(); - writers.insert(key, Box::new(writer)); + writers.insert(TypeId::of::(), Box::new(writer)); } - pub fn remove_writer(&self, key: &str) -> Option> { + pub fn remove_writer(&self, id: TypeId) -> Option> { let mut writers = self.writers.lock().unwrap(); - writers.remove(key) + writers.remove(&id) } } @@ -336,7 +337,7 @@ impl SerialManager { ConsoleTransport::Tcp(_, file_opt) => { let distributor = FanoutWriter::new(); if let Some(file) = file_opt { - distributor.add_writer("file".into(), Arc::clone(file)); + distributor.add_writer(Arc::clone(file)); } serial .as_ref() @@ -428,7 +429,7 @@ impl SerialManager { .shutdown(Shutdown::Both) .map_err(Error::AcceptConnection)?; if let Some(distributor) = &write_distributor { - distributor.remove_writer("tcp"); + distributor.remove_writer(TypeId::of::()); } } @@ -455,7 +456,7 @@ impl SerialManager { .map_err(Error::Epoll)?; reader_tcp = Some(tcp_stream); if let Some(distributor) = &write_distributor { - distributor.add_writer("tcp".into(), writer); + distributor.add_writer(writer); } } EpollDispatch::File => { @@ -499,7 +500,9 @@ impl SerialManager { if let Some(distributor) = &write_distributor { - distributor.remove_writer("tcp"); + distributor.remove_writer( + TypeId::of::(), + ); } } count From fef3e3f0cd90f20c04cc49209ab532fc266f87f5 Mon Sep 17 00:00:00 2001 From: Stefan Kober Date: Tue, 21 Oct 2025 15:11:07 +0200 Subject: [PATCH 046/178] vmm: api: receive_migration can get a tcp_serial_url The TCP serial mode utilizes a port on the host and needs to listen on a specific IP. If that IP or port are not available on the receiver host when migrating, we need to be able to set a new configuration for the destination host. 
As a shortcut, we add a tcp_serial_url parameter to the receive migration API call. The caller can specify a new value, that will lead to an update of the VM config on the receiver side. On-behalf-of: SAP stefan.kober@sap.com Signed-off-by: Stefan Kober --- vmm/src/api/mod.rs | 22 ++++++++++++++++++---- vmm/src/lib.rs | 19 ++++++++++++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index c262bf8e2e..2c53493294 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -277,6 +277,10 @@ pub struct VmReceiveMigrationData { /// Map with new network FDs on the new host. #[serde(default)] pub net_fds: Vec, + /// Optional URL if the TCP serial configuration must be changed during + /// migration. Example: "192.168.1.1:2222". + #[serde(default)] + pub tcp_serial_url: Option, } #[derive(Debug, Error)] @@ -326,7 +330,7 @@ fn validate_tcp_migration_address(address: &str) -> Result<(), String> { impl VmReceiveMigrationData { pub const SYNTAX: &'static str = "VM receive migration parameters \ - \"\" or \"receiver_url=[,tls_dir=]\""; + \"\" or \"receiver_url=[,tls_dir=,tcp_serial_url=]\""; pub fn parse(migration: &str) -> Result { let uses_key_value_syntax = migration.split(',').any( @@ -338,6 +342,7 @@ impl VmReceiveMigrationData { receiver_url: migration.to_owned(), tls_dir: None, net_fds: vec![], + tcp_serial_url: None, }; data.validate()?; @@ -346,7 +351,10 @@ impl VmReceiveMigrationData { } let mut parser = OptionParser::new(); - parser.add("receiver_url").add("tls_dir"); + parser + .add("receiver_url") + .add("tls_dir") + .add("tcp_serial_url"); parser .parse(migration) .map_err(VmReceiveMigrationConfigError::ParseError)?; @@ -360,11 +368,15 @@ impl VmReceiveMigrationData { .convert::("tls_dir") .map_err(VmReceiveMigrationConfigError::ParseError)? 
.map(|path| PathBuf::from(&path)); + let tcp_serial_url = parser + .convert::("tcp_serial_url") + .map_err(VmReceiveMigrationConfigError::ParseError)?; let data = Self { receiver_url, tls_dir, net_fds: vec![], + tcp_serial_url, }; data.validate()?; @@ -1984,6 +1996,7 @@ mod unit_tests { receiver_url: "tcp:192.168.1.1:8080".to_string(), tls_dir: None, net_fds: vec![], + tcp_serial_url: None, } ); @@ -1998,8 +2011,8 @@ mod unit_tests { let tls_dir = std::env::temp_dir(); let data = VmReceiveMigrationData::parse(&format!( - "receiver_url=tcp:192.168.1.1:8080,tls_dir={}", - tls_dir.display() + "receiver_url=tcp:192.168.1.1:8080,tls_dir={},tcp_serial_url=1.2.3.4:6789", + tls_dir.display(), )) .unwrap(); assert_eq!( @@ -2008,6 +2021,7 @@ mod unit_tests { receiver_url: "tcp:192.168.1.1:8080".to_string(), tls_dir: Some(tls_dir), net_fds: vec![], + tcp_serial_url: Some("1.2.3.4:6789".to_string()), } ); diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 137f08164a..f0b12cce69 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -921,7 +921,12 @@ impl Vmm { |socket: &mut SocketStream, memory_files: HashMap| -> std::result::Result { - let memory_manager = self.vm_receive_config(req, socket, memory_files)?; + let memory_manager = self.vm_receive_config( + req, + socket, + memory_files, + receive_data_migration.tcp_serial_url.clone(), + )?; if !receive_data_migration.net_fds.is_empty() { let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); @@ -1064,6 +1069,7 @@ impl Vmm { req: &Request, socket: &mut T, existing_memory_files: HashMap, + tcp_serial_url: Option, ) -> std::result::Result>, MigratableError> where T: Read, @@ -1088,6 +1094,12 @@ impl Vmm { let config = vm_migration_config.vm_config.clone(); self.vm_config = Some(vm_migration_config.vm_config); + + if let Some(tcp_serial_url) = tcp_serial_url { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + vm_config.serial.common.url = Some(tcp_serial_url); + } + self.console_info = 
Some(pre_create_console_devices(self).map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error creating console devices: {e:?}")) })?); @@ -2590,10 +2602,11 @@ impl RequestHandler for Vmm { .map_err(MigratableError::MigrateReceive)?; info!( - "Receiving migration: receiver_url={},tls={},net_fds={:?}", + "Receiving migration: receiver_url={},tls={},net_fds={:?}, tcp_url={:?}", receive_data_migration.receiver_url, receive_data_migration.tls_dir.is_some(), - &receive_data_migration.net_fds + &receive_data_migration.net_fds, + &receive_data_migration.tcp_serial_url, ); let mut listener = migration_transport::receive_migration_listener( From 59763b39a022d740f377d94485bd521f561c0f63 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Feb 2026 15:28:03 +0100 Subject: [PATCH 047/178] arch: x86_64: refactor SMBIOS helpers Split the System Information write into helper functions and reuse the string writer so the table layout and inputs are unchanged. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- arch/src/x86_64/mod.rs | 10 ++-- arch/src/x86_64/smbios.rs | 116 ++++++++++++++++++++++++-------------- vmm/src/config.rs | 5 +- vmm/src/vm.rs | 9 +-- vmm/src/vm_config.rs | 2 +- 5 files changed, 87 insertions(+), 55 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 09577b436c..f9193c22bf 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -960,7 +960,7 @@ pub fn configure_system( rsdp_addr: Option, serial_number: Option<&str>, uuid: Option<&str>, - oem_strings: Option<&[&str]>, + oem_strings: Vec, topology: Option<(u16, u16, u16, u16)>, ) -> super::Result<()> { // Write EBDA address to location where ACPICA expects to find it @@ -1514,7 +1514,7 @@ mod unit_tests { Some(layout::RSDP_POINTER), None, None, - None, + Vec::new(), None, ); config_err.unwrap_err(); @@ -1538,7 +1538,7 @@ mod unit_tests { None, None, None, - None, + Vec::new(), None, ) .unwrap(); @@ -1567,7 +1567,7 @@ mod unit_tests { 
None, None, None, - None, + Vec::new(), None, ) .unwrap(); @@ -1582,7 +1582,7 @@ mod unit_tests { None, None, None, - None, + Vec::new(), None, ) .unwrap(); diff --git a/arch/src/x86_64/smbios.rs b/arch/src/x86_64/smbios.rs index 6f1139888b..cb8b331bb8 100644 --- a/arch/src/x86_64/smbios.rs +++ b/arch/src/x86_64/smbios.rs @@ -47,6 +47,8 @@ const OEM_STRINGS: u8 = 11; const END_OF_TABLE: u8 = 127; const PCI_SUPPORTED: u64 = 1 << 7; const IS_VIRTUAL_MACHINE: u8 = 1 << 4; +pub const DEFAULT_SYSTEM_MANUFACTURER: &str = "Cloud Hypervisor"; +pub const DEFAULT_SYSTEM_PRODUCT_NAME: &str = "cloud-hypervisor"; fn compute_checksum(v: &T) -> u8 { let v: *const T = v; @@ -59,8 +61,7 @@ fn compute_checksum(v: &T) -> u8 { (!checksum).wrapping_add(1) } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct Smbios30Entrypoint { signature: [u8; 5usize], @@ -75,8 +76,7 @@ struct Smbios30Entrypoint { physptr: u64, } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosBiosInfo { r#type: u8, @@ -92,8 +92,7 @@ struct SmbiosBiosInfo { characteristics_ext2: u8, } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosSysInfo { r#type: u8, @@ -109,8 +108,7 @@ struct SmbiosSysInfo { family: u8, } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosOemStrings { r#type: u8, @@ -119,8 +117,7 @@ struct SmbiosOemStrings { count: u8, } -#[repr(C)] -#[repr(packed)] +#[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosEndOfTable { r#type: u8, @@ -163,11 +160,73 @@ fn write_string( Ok(curptr) } +fn write_opt_string( + mem: &GuestMemoryMmap, + s: Option<&str>, + cur: GuestAddress, +) -> Result { + if let Some(v) = s { + write_string(mem, v, cur) + } else { + Ok(cur) + } +} + +fn write_string_terminator( + mem: &GuestMemoryMmap, + cur: GuestAddress, + has_strings: bool, +) -> Result { + // SMBIOS DSP0134 §6.1.3: if all string-reference 
fields are 0, follow the + // formatted section with two null bytes (empty string-set). + if has_strings { + write_and_incr(mem, 0u8, cur) + } else { + let cur = write_and_incr(mem, 0u8, cur)?; + write_and_incr(mem, 0u8, cur) + } +} + +fn write_type1_system( + mem: &GuestMemoryMmap, + curptr: &mut GuestAddress, + handle: &mut u16, + serial_number: Option<&str>, + uuid: Option<&str>, +) -> Result<()> { + *handle += 1; + + let uuid_number = uuid + .map(Uuid::parse_str) + .transpose() + .map_err(|e| Error::ParseUuid(e, uuid.unwrap().to_string()))? + .unwrap_or(Uuid::nil()); + let serial_idx = serial_number.map(|_| 3).unwrap_or_default(); + + let smbios_sysinfo = SmbiosSysInfo { + r#type: SYSTEM_INFORMATION, + length: mem::size_of::() as u8, + handle: *handle, + manufacturer: 1, // First string written in this section + product_name: 2, // Second string written in this section + serial_number: serial_idx, + uuid: uuid_number.to_bytes_le(), + ..Default::default() + }; + + *curptr = write_and_incr(mem, smbios_sysinfo, *curptr)?; + *curptr = write_string(mem, DEFAULT_SYSTEM_MANUFACTURER, *curptr)?; + *curptr = write_string(mem, DEFAULT_SYSTEM_PRODUCT_NAME, *curptr)?; + *curptr = write_opt_string(mem, serial_number, *curptr)?; + *curptr = write_and_incr(mem, 0u8, *curptr)?; + Ok(()) +} + pub fn setup_smbios( mem: &GuestMemoryMmap, serial_number: Option<&str>, uuid: Option<&str>, - oem_strings: Option<&[&str]>, + oem_strings: Vec, ) -> Result { let physptr = GuestAddress(SMBIOS_START) .checked_add(mem::size_of::() as u64) @@ -193,34 +252,9 @@ pub fn setup_smbios( curptr = write_and_incr(mem, 0u8, curptr)?; } - { - handle += 1; + write_type1_system(mem, &mut curptr, &mut handle, serial_number, uuid)?; - let uuid_number = uuid - .map(Uuid::parse_str) - .transpose() - .map_err(|e| Error::ParseUuid(e, uuid.unwrap().to_string()))? 
- .unwrap_or(Uuid::nil()); - let smbios_sysinfo = SmbiosSysInfo { - r#type: SYSTEM_INFORMATION, - length: mem::size_of::() as u8, - handle, - manufacturer: 1, // First string written in this section - product_name: 2, // Second string written in this section - serial_number: serial_number.map(|_| 3).unwrap_or_default(), // 3rd string - uuid: uuid_number.to_bytes_le(), // set uuid - ..Default::default() - }; - curptr = write_and_incr(mem, smbios_sysinfo, curptr)?; - curptr = write_string(mem, "Cloud Hypervisor", curptr)?; - curptr = write_string(mem, "cloud-hypervisor", curptr)?; - if let Some(serial_number) = serial_number { - curptr = write_string(mem, serial_number, curptr)?; - } - curptr = write_and_incr(mem, 0u8, curptr)?; - } - - if let Some(oem_strings) = oem_strings { + if !oem_strings.is_empty() { handle += 1; let smbios_oemstrings = SmbiosOemStrings { @@ -233,10 +267,10 @@ pub fn setup_smbios( curptr = write_and_incr(mem, smbios_oemstrings, curptr)?; for s in oem_strings { - curptr = write_string(mem, s, curptr)?; + curptr = write_string(mem, &s, curptr)?; } - curptr = write_and_incr(mem, 0u8, curptr)?; + curptr = write_string_terminator(mem, curptr, true)?; } { @@ -299,7 +333,7 @@ mod unit_tests { fn entrypoint_checksum() { let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); - setup_smbios(&mem, None, None, None).unwrap(); + setup_smbios(&mem, None, None, Vec::new()).unwrap(); let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 8fb8341c53..f917c87bf5 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -892,7 +892,8 @@ impl PlatformConfig { let oem_strings = parser .convert::("oem_strings") .map_err(Error::ParsePlatform)? - .map(|v| v.0); + .map(|v| v.0) + .unwrap_or_default(); let iommufd = parser .convert::("iommufd") .map_err(Error::ParsePlatform)? 
@@ -5039,7 +5040,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" iommu_address_width_bits: MAX_IOMMU_ADDRESS_WIDTH_BITS, serial_number: None, uuid: None, - oem_strings: None, + oem_strings: Vec::new(), iommufd: false, vfio_p2p_dma: default_platformconfig_vfio_p2p_dma(), #[cfg(feature = "tdx")] diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index d8edffe3d3..62e9a391a6 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -1907,11 +1907,8 @@ impl Vm { .unwrap() .platform .as_ref() - .and_then(|p| p.oem_strings.clone()); - - let oem_strings = oem_strings - .as_deref() - .map(|strings| strings.iter().map(|s| s.as_ref()).collect::>()); + .map(|p| p.oem_strings.clone()) + .unwrap_or_default(); let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); @@ -1925,7 +1922,7 @@ impl Vm { rsdp_addr, serial_number.as_deref(), uuid.as_deref(), - oem_strings.as_deref(), + oem_strings, topology, ) .map_err(Error::ConfigureSystem)?; diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 6702772dfa..239c682066 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -130,7 +130,7 @@ pub struct PlatformConfig { #[serde(default)] pub uuid: Option, #[serde(default)] - pub oem_strings: Option>, + pub oem_strings: Vec, #[cfg(feature = "tdx")] #[serde(default)] pub tdx: bool, From 7a1d3310fbf12e69b08c5810d68fb1beddd3cf6a Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Feb 2026 15:34:16 +0100 Subject: [PATCH 048/178] vmm: plumb legacy SMBIOS config Add a small SMBIOS config that carries serial_number, uuid, and OEM strings, and pass it from platform config into x86_64 setup. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- arch/src/x86_64/mod.rs | 16 +++------------- arch/src/x86_64/smbios.rs | 21 +++++++++++++-------- vmm/src/vm.rs | 25 +++---------------------- vmm/src/vm_config.rs | 18 ++++++++++++++++++ 4 files changed, 37 insertions(+), 43 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index f9193c22bf..38d9f1183f 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -28,6 +28,7 @@ use linux_loader::loader::elf::start_info::{ hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, }; use log::{debug, error, info}; +pub use smbios::SmbiosConfig; use thiserror::Error; use vm_memory::{ Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, @@ -958,9 +959,7 @@ pub fn configure_system( _num_cpus: u32, setup_header: Option, rsdp_addr: Option, - serial_number: Option<&str>, - uuid: Option<&str>, - oem_strings: Vec, + smbios: Option<&SmbiosConfig>, topology: Option<(u16, u16, u16, u16)>, ) -> super::Result<()> { // Write EBDA address to location where ACPICA expects to find it @@ -968,8 +967,7 @@ pub fn configure_system( .write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER) .map_err(Error::EbdaSetup)?; - let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings) - .map_err(Error::SmbiosSetup)?; + let size = smbios::setup_smbios(guest_mem, smbios).map_err(Error::SmbiosSetup)?; // Place the MP table after the SMIOS table aligned to 16 bytes let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size); @@ -1514,8 +1512,6 @@ mod unit_tests { Some(layout::RSDP_POINTER), None, None, - Vec::new(), - None, ); config_err.unwrap_err(); @@ -1538,8 +1534,6 @@ mod unit_tests { None, None, None, - Vec::new(), - None, ) .unwrap(); @@ -1567,8 +1561,6 @@ mod unit_tests { None, None, None, - Vec::new(), - None, ) .unwrap(); @@ -1582,8 +1574,6 @@ mod unit_tests { None, None, None, - Vec::new(), - None, ) .unwrap(); } diff 
--git a/arch/src/x86_64/smbios.rs b/arch/src/x86_64/smbios.rs index cb8b331bb8..eb99de94d9 100644 --- a/arch/src/x86_64/smbios.rs +++ b/arch/src/x86_64/smbios.rs @@ -50,6 +50,13 @@ const IS_VIRTUAL_MACHINE: u8 = 1 << 4; pub const DEFAULT_SYSTEM_MANUFACTURER: &str = "Cloud Hypervisor"; pub const DEFAULT_SYSTEM_PRODUCT_NAME: &str = "cloud-hypervisor"; +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SmbiosConfig { + pub serial_number: Option, + pub uuid: Option, + pub oem_strings: Vec, +} + fn compute_checksum(v: &T) -> u8 { let v: *const T = v; // SAFETY: we are only reading the bytes within the size of the `T` reference `v`. @@ -222,12 +229,10 @@ fn write_type1_system( Ok(()) } -pub fn setup_smbios( - mem: &GuestMemoryMmap, - serial_number: Option<&str>, - uuid: Option<&str>, - oem_strings: Vec, -) -> Result { +pub fn setup_smbios(mem: &GuestMemoryMmap, smbios: Option<&SmbiosConfig>) -> Result { + let serial_number = smbios.and_then(|cfg| cfg.serial_number.as_deref()); + let uuid = smbios.and_then(|cfg| cfg.uuid.as_deref()); + let oem_strings: &[String] = smbios.map_or(&[] as &[String], |cfg| cfg.oem_strings.as_slice()); let physptr = GuestAddress(SMBIOS_START) .checked_add(mem::size_of::() as u64) .ok_or(Error::NotEnoughMemory)?; @@ -267,7 +272,7 @@ pub fn setup_smbios( curptr = write_and_incr(mem, smbios_oemstrings, curptr)?; for s in oem_strings { - curptr = write_string(mem, &s, curptr)?; + curptr = write_string(mem, s, curptr)?; } curptr = write_string_terminator(mem, curptr, true)?; @@ -333,7 +338,7 @@ mod unit_tests { fn entrypoint_checksum() { let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); - setup_smbios(&mem, None, None, Vec::new()).unwrap(); + setup_smbios(&mem, None).unwrap(); let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 62e9a391a6..b408a8fb99 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -1885,30 +1885,13 @@ 
impl Vm { let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); - let serial_number = self + let smbios = self .config .lock() .unwrap() .platform .as_ref() - .and_then(|p| p.serial_number.clone()); - - let uuid = self - .config - .lock() - .unwrap() - .platform - .as_ref() - .and_then(|p| p.uuid.clone()); - - let oem_strings = self - .config - .lock() - .unwrap() - .platform - .as_ref() - .map(|p| p.oem_strings.clone()) - .unwrap_or_default(); + .and_then(|p| p.smbios_config()); let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); @@ -1920,9 +1903,7 @@ impl Vm { boot_vcpus, entry_addr.setup_header, rsdp_addr, - serial_number.as_deref(), - uuid.as_deref(), - oem_strings, + smbios.as_ref(), topology, ) .map_err(Error::ConfigureSystem)?; diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 239c682066..895a4c92b3 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -143,6 +143,24 @@ pub struct PlatformConfig { pub vfio_p2p_dma: bool, } +#[cfg(target_arch = "x86_64")] +impl PlatformConfig { + pub fn smbios_config(&self) -> Option { + let smbios = arch::x86_64::SmbiosConfig { + serial_number: self.serial_number.clone(), + uuid: self.uuid.clone(), + oem_strings: self.oem_strings.clone(), + }; + + if smbios.serial_number.is_none() && smbios.uuid.is_none() && smbios.oem_strings.is_empty() + { + None + } else { + Some(smbios) + } + } +} + pub const DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT: u32 = 1; fn default_pci_segment_aperture_weight() -> u32 { From 287c2c0c63999b2990fb1f5fc48b73bae1a7bb38 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Feb 2026 15:44:24 +0100 Subject: [PATCH 049/178] vmm: platform: add structured SMBIOS config Extend SMBIOS System Information with manufacturer, product, version, family, sku, serial, and uuid fields, add a chassis asset tag, and pass a structured SMBIOS config from --platform into arch setup. Keep OEM strings and legacy serial_number/uuid options working for compatibility. 
The platform option naming follows `dmidecode -s <keyword>`. Fields: - system_manufacturer - system_product_name - system_version - system_family - system_sku_number - system_serial_number - system_uuid - chassis_asset_tag On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- arch/src/x86_64/mod.rs | 2 +- arch/src/x86_64/smbios.rs | 156 ++++++++++++++++-- .../tests/common/tests_wrappers.rs | 6 +- vmm/src/api/openapi/cloud-hypervisor.yaml | 16 ++ vmm/src/config.rs | 130 +++++++++++++-- vmm/src/vm_config.rs | 55 +++++- 6 files changed, 326 insertions(+), 39 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 38d9f1183f..e5f4d48204 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -28,7 +28,7 @@ use linux_loader::loader::elf::start_info::{ hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, }; use log::{debug, error, info}; -pub use smbios::SmbiosConfig; +pub use smbios::{SmbiosChassisConfig, SmbiosConfig, SmbiosSystem}; use thiserror::Error; use vm_memory::{ Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, diff --git a/arch/src/x86_64/smbios.rs b/arch/src/x86_64/smbios.rs index eb99de94d9..eacfd2a67f 100644 --- a/arch/src/x86_64/smbios.rs +++ b/arch/src/x86_64/smbios.rs @@ -35,6 +35,9 @@ pub enum Error { /// Failure to parse uuid, uuid format may be error #[error("Failure to parse uuid: {1}")] ParseUuid(#[source] uuid::Error, String), + /// SMBIOS string index overflow (u8 limit reached).
+ #[error("SMBIOS string index overflow (u8 limit reached)")] + TooManyStrings, } pub type Result = result::Result; @@ -44,6 +47,7 @@ const SM3_MAGIC_IDENT: &[u8; 5usize] = b"_SM3_"; const BIOS_INFORMATION: u8 = 0; const SYSTEM_INFORMATION: u8 = 1; const OEM_STRINGS: u8 = 11; +const SYSTEM_ENCLOSURE: u8 = 3; const END_OF_TABLE: u8 = 127; const PCI_SUPPORTED: u64 = 1 << 7; const IS_VIRTUAL_MACHINE: u8 = 1 << 4; @@ -52,9 +56,29 @@ pub const DEFAULT_SYSTEM_PRODUCT_NAME: &str = "cloud-hypervisor"; #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct SmbiosConfig { + pub system: Option, + pub chassis: Option, + pub oem_strings: Vec, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SmbiosSystem { + pub manufacturer: Option, + pub product_name: Option, + pub version: Option, pub serial_number: Option, pub uuid: Option, - pub oem_strings: Vec, + pub sku_number: Option, + pub family: Option, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SmbiosChassisConfig { + pub manufacturer: Option, + pub chassis_type: Option, + pub version: Option, + pub serial_number: Option, + pub asset_tag: Option, } fn compute_checksum(v: &T) -> u8 { @@ -124,6 +148,33 @@ struct SmbiosOemStrings { count: u8, } +/// SMBIOS Chassis Table (Type 3) as defined in DMTF SMBIOS 3.9.0: +/// https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.9.0.pdf +/// Note: trailing fields are omitted, so this structure is not complete. 
+#[repr(C, packed)] +#[derive(Default, Copy, Clone)] +struct SmbiosChassis { + r#type: u8, + length: u8, + handle: u16, + manufacturer: u8, + chassis_type: u8, + version: u8, + serial_number: u8, + asset_tag: u8, + bootup_state: u8, + power_supply_state: u8, + thermal_state: u8, + security_status: u8, + oem_defined: u32, + height: u8, + number_of_power_cords: u8, + contained_element_count: u8, + contained_element_record_length: u8, + // followed by contained element records (optional, variable-length) + // followed by sku_number: u8, rack_type: u8, rack_height: u8 +} + #[repr(C, packed)] #[derive(Default, Copy, Clone)] struct SmbiosEndOfTable { @@ -141,6 +192,8 @@ unsafe impl ByteValued for SmbiosSysInfo {} // SAFETY: data structure only contain a series of integers unsafe impl ByteValued for SmbiosOemStrings {} // SAFETY: data structure only contain a series of integers +unsafe impl ByteValued for SmbiosChassis {} +// SAFETY: data structure only contain a series of integers unsafe impl ByteValued for SmbiosEndOfTable {} fn write_and_incr( @@ -194,44 +247,115 @@ fn write_string_terminator( } } +fn alloc_index(next: &mut u8, present: bool) -> Result { + if !present { + return Ok(0); + } + + let idx = *next; + if idx == 0 { + // wrapped around, next starts always initially at 1 + return Err(Error::TooManyStrings); + } + + *next = next.wrapping_add(1); + Ok(idx) +} + fn write_type1_system( mem: &GuestMemoryMmap, curptr: &mut GuestAddress, handle: &mut u16, - serial_number: Option<&str>, - uuid: Option<&str>, + system: Option<&SmbiosSystem>, ) -> Result<()> { *handle += 1; + let manufacturer = system + .and_then(|s| s.manufacturer.as_deref()) + .unwrap_or(DEFAULT_SYSTEM_MANUFACTURER); + let product = system + .and_then(|s| s.product_name.as_deref()) + .unwrap_or(DEFAULT_SYSTEM_PRODUCT_NAME); + let version = system.and_then(|s| s.version.as_deref()); + let serial = system.and_then(|s| s.serial_number.as_deref()); + let uuid = system.and_then(|s| s.uuid.as_deref()); + 
let sku = system.and_then(|s| s.sku_number.as_deref()); + let family = system.and_then(|s| s.family.as_deref()); + let uuid_number = uuid .map(Uuid::parse_str) .transpose() .map_err(|e| Error::ParseUuid(e, uuid.unwrap().to_string()))? .unwrap_or(Uuid::nil()); - let serial_idx = serial_number.map(|_| 3).unwrap_or_default(); - let smbios_sysinfo = SmbiosSysInfo { + let mut next = 1u8; + let manufacturer_idx = alloc_index(&mut next, true)?; + let product_idx = alloc_index(&mut next, true)?; + let version_idx = alloc_index(&mut next, version.is_some())?; + let serial_idx = alloc_index(&mut next, serial.is_some())?; + let sku_idx = alloc_index(&mut next, sku.is_some())?; + let family_idx = alloc_index(&mut next, family.is_some())?; + + let sys = SmbiosSysInfo { r#type: SYSTEM_INFORMATION, length: mem::size_of::() as u8, handle: *handle, - manufacturer: 1, // First string written in this section - product_name: 2, // Second string written in this section + manufacturer: manufacturer_idx, + product_name: product_idx, + version: version_idx, serial_number: serial_idx, uuid: uuid_number.to_bytes_le(), + sku: sku_idx, + family: family_idx, ..Default::default() }; - *curptr = write_and_incr(mem, smbios_sysinfo, *curptr)?; - *curptr = write_string(mem, DEFAULT_SYSTEM_MANUFACTURER, *curptr)?; - *curptr = write_string(mem, DEFAULT_SYSTEM_PRODUCT_NAME, *curptr)?; - *curptr = write_opt_string(mem, serial_number, *curptr)?; + *curptr = write_and_incr(mem, sys, *curptr)?; + *curptr = write_string(mem, manufacturer, *curptr)?; + *curptr = write_string(mem, product, *curptr)?; + *curptr = write_opt_string(mem, version, *curptr)?; + *curptr = write_opt_string(mem, serial, *curptr)?; + *curptr = write_opt_string(mem, sku, *curptr)?; + *curptr = write_opt_string(mem, family, *curptr)?; *curptr = write_and_incr(mem, 0u8, *curptr)?; Ok(()) } +fn write_type3_chassis( + mem: &GuestMemoryMmap, + curptr: &mut GuestAddress, + handle: &mut u16, + chassis: &SmbiosChassisConfig, +) -> Result<()> { 
+ *handle += 1; + + let asset_tag = chassis.asset_tag.as_deref(); + let mut next = 1u8; + let asset_idx = alloc_index(&mut next, asset_tag.is_some())?; + + let ch = SmbiosChassis { + r#type: SYSTEM_ENCLOSURE, + length: mem::size_of::() as u8, + handle: *handle, + manufacturer: 0, + chassis_type: 0, + version: 0, + serial_number: 0, + asset_tag: asset_idx, + contained_element_count: 0, + contained_element_record_length: 0, + ..Default::default() + }; + + *curptr = write_and_incr(mem, ch, *curptr)?; + *curptr = write_opt_string(mem, asset_tag, *curptr)?; + *curptr = write_string_terminator(mem, *curptr, asset_tag.is_some())?; + Ok(()) +} + pub fn setup_smbios(mem: &GuestMemoryMmap, smbios: Option<&SmbiosConfig>) -> Result { - let serial_number = smbios.and_then(|cfg| cfg.serial_number.as_deref()); - let uuid = smbios.and_then(|cfg| cfg.uuid.as_deref()); + let system = smbios.and_then(|cfg| cfg.system.as_ref()); + let chassis = smbios.and_then(|cfg| cfg.chassis.as_ref()); let oem_strings: &[String] = smbios.map_or(&[] as &[String], |cfg| cfg.oem_strings.as_slice()); let physptr = GuestAddress(SMBIOS_START) .checked_add(mem::size_of::() as u64) @@ -257,7 +381,11 @@ pub fn setup_smbios(mem: &GuestMemoryMmap, smbios: Option<&SmbiosConfig>) -> Res curptr = write_and_incr(mem, 0u8, curptr)?; } - write_type1_system(mem, &mut curptr, &mut handle, serial_number, uuid)?; + write_type1_system(mem, &mut curptr, &mut handle, system)?; + + if let Some(chassis) = chassis { + write_type3_chassis(mem, &mut curptr, &mut handle, chassis)?; + } if !oem_strings.is_empty() { handle += 1; diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 693702b552..27e4c1f272 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2219,7 +2219,7 @@ pub(crate) fn _test_dmi_serial_number(guest: &Guest) { let mut child = GuestCommand::new(guest) .default_cpus() .default_memory() - 
.default_kernel_cmdline_with_platform(Some("serial_number=a=b;c=d")) + .default_kernel_cmdline_with_platform(Some("system_serial_number=a=b;c=d")) .default_disks() .default_net() .capture_output() @@ -2248,7 +2248,9 @@ pub(crate) fn _test_dmi_uuid(guest: &Guest) { let mut child = GuestCommand::new(guest) .default_cpus() .default_memory() - .default_kernel_cmdline_with_platform(Some("uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a")) + .default_kernel_cmdline_with_platform(Some( + "system_uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a", + )) .default_disks() .default_net() .capture_output() diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 91eb5af245..27ee20b696 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -784,14 +784,30 @@ components: iommu_address_width: type: integer format: uint8 + system_serial_number: + type: string serial_number: type: string + system_uuid: + type: string uuid: type: string oem_strings: type: array items: type: string + system_manufacturer: + type: string + system_product_name: + type: string + system_version: + type: string + system_family: + type: string + system_sku_number: + type: string + chassis_asset_tag: + type: string tdx: type: boolean default: false diff --git a/vmm/src/config.rs b/vmm/src/config.rs index f917c87bf5..e8527cb738 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -835,9 +835,11 @@ impl PlatformConfig { static SYNTAX: LazyLock = LazyLock::new(|| { let mut syntax = "Platform configuration parameters \ \"num_pci_segments=,iommu_segments=,\ - iommu_address_width=,serial_number=,\ - uuid=,oem_strings=,iommufd=on|off,\ - vfio_p2p_dma=on|off" + iommu_address_width=,iommufd=on|off,vfio_p2p_dma=on|off,system_manufacturer=,\ + system_product_name=,system_version=,\ + system_serial_number=,system_uuid=,\ + system_sku_number=,system_family=,\ + oem_strings=,chassis_asset_tag=" .to_string(); if cfg!(feature = "tdx") { 
@@ -857,16 +859,61 @@ impl PlatformConfig { } pub fn parse(platform: &str) -> Result { + struct StringField { + key: &'static str, + apply: fn(&mut PlatformConfig, String), + } + + const SMBIOS_STRING_FIELDS: &[StringField] = &[ + StringField { + key: "system_manufacturer", + apply: |config, value| config.system_manufacturer = Some(value), + }, + StringField { + key: "system_product_name", + apply: |config, value| config.system_product_name = Some(value), + }, + StringField { + key: "system_version", + apply: |config, value| config.system_version = Some(value), + }, + StringField { + key: "system_serial_number", + apply: |config, value| config.system_serial_number = Some(value), + }, + StringField { + key: "system_uuid", + apply: |config, value| config.system_uuid = Some(value), + }, + StringField { + key: "system_sku_number", + apply: |config, value| config.system_sku_number = Some(value), + }, + StringField { + key: "system_family", + apply: |config, value| config.system_family = Some(value), + }, + StringField { + key: "chassis_asset_tag", + apply: |config, value| config.chassis_asset_tag = Some(value), + }, + ]; + let mut parser = OptionParser::new(); parser .add("num_pci_segments") .add("iommu_segments") .add("iommu_address_width") + .add("oem_strings") .add("serial_number") .add("uuid") .add("oem_strings") .add("iommufd") - .add("vfio_p2p_dma"); + .add("vfio_p2p_dma") + .add("uuid"); + for field in SMBIOS_STRING_FIELDS { + parser.add(field.key); + } #[cfg(feature = "tdx")] parser.add("tdx"); #[cfg(feature = "sev_snp")] @@ -885,10 +932,6 @@ impl PlatformConfig { .convert("iommu_address_width") .map_err(Error::ParsePlatform)? .unwrap_or(MAX_IOMMU_ADDRESS_WIDTH_BITS); - let serial_number = parser - .convert("serial_number") - .map_err(Error::ParsePlatform)?; - let uuid = parser.convert("uuid").map_err(Error::ParsePlatform)?; let oem_strings = parser .convert::("oem_strings") .map_err(Error::ParsePlatform)? 
@@ -916,20 +959,71 @@ impl PlatformConfig { .map_err(Error::ParsePlatform)? .unwrap_or(Toggle(false)) .0; - Ok(PlatformConfig { + + let mut platform_config = PlatformConfig { num_pci_segments, iommu_segments, iommu_address_width_bits, - serial_number, - uuid, + system_serial_number: None, + system_uuid: None, oem_strings, + system_manufacturer: None, + system_product_name: None, + system_version: None, + system_family: None, + system_sku_number: None, + chassis_asset_tag: None, iommufd, - vfio_p2p_dma, #[cfg(feature = "tdx")] tdx, #[cfg(feature = "sev_snp")] sev_snp, - }) + vfio_p2p_dma, + }; + + for field in SMBIOS_STRING_FIELDS { + if let Some(value) = parser + .convert::(field.key) + .map_err(Error::ParsePlatform)? + { + (field.apply)(&mut platform_config, value); + } + } + + let legacy_serial_number = parser + .convert::("serial_number") + .map_err(Error::ParsePlatform)?; + platform_config.system_serial_number = platform_config + .system_serial_number + .or(legacy_serial_number); + + let legacy_uuid = parser + .convert::("uuid") + .map_err(Error::ParsePlatform)?; + platform_config.system_uuid = platform_config.system_uuid.or(legacy_uuid); + #[cfg(feature = "tdx")] + let tdx = parser + .convert::("tdx") + .map_err(Error::ParsePlatform)? + .unwrap_or(Toggle(false)) + .0; + #[cfg(feature = "sev_snp")] + let sev_snp = parser + .convert::("sev_snp") + .map_err(Error::ParsePlatform)? 
+ .unwrap_or(Toggle(false)) + .0; + + #[cfg(feature = "tdx")] + { + platform_config.tdx = tdx; + } + #[cfg(feature = "sev_snp")] + { + platform_config.sev_snp = sev_snp; + } + + Ok(platform_config) } pub fn validate(&self) -> ValidationResult<()> { @@ -5038,11 +5132,17 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" num_pci_segments: MAX_NUM_PCI_SEGMENTS, iommu_segments: None, iommu_address_width_bits: MAX_IOMMU_ADDRESS_WIDTH_BITS, - serial_number: None, - uuid: None, + system_serial_number: None, + system_uuid: None, oem_strings: Vec::new(), iommufd: false, vfio_p2p_dma: default_platformconfig_vfio_p2p_dma(), + system_manufacturer: None, + system_product_name: None, + system_version: None, + system_family: None, + system_sku_number: None, + chassis_asset_tag: None, #[cfg(feature = "tdx")] tdx: false, #[cfg(feature = "sev_snp")] diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 895a4c92b3..72ce2c567b 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -125,12 +125,24 @@ pub struct PlatformConfig { pub iommu_segments: Option>, #[serde(default = "default_platformconfig_iommu_address_width_bits")] pub iommu_address_width_bits: u8, + #[serde(default, alias = "serial_number")] + pub system_serial_number: Option, + #[serde(default, alias = "uuid")] + pub system_uuid: Option, #[serde(default)] - pub serial_number: Option, + pub oem_strings: Vec, #[serde(default)] - pub uuid: Option, + pub system_manufacturer: Option, #[serde(default)] - pub oem_strings: Vec, + pub system_product_name: Option, + #[serde(default)] + pub system_version: Option, + #[serde(default)] + pub system_family: Option, + #[serde(default)] + pub system_sku_number: Option, + #[serde(default)] + pub chassis_asset_tag: Option, #[cfg(feature = "tdx")] #[serde(default)] pub tdx: bool, @@ -146,14 +158,43 @@ pub struct PlatformConfig { #[cfg(target_arch = "x86_64")] impl PlatformConfig { pub fn smbios_config(&self) -> Option { + let has_system = [ + 
&self.system_serial_number, + &self.system_uuid, + &self.system_manufacturer, + &self.system_product_name, + &self.system_version, + &self.system_family, + &self.system_sku_number, + ] + .iter() + .any(|v| v.is_some()); + + let system = has_system.then_some(arch::x86_64::SmbiosSystem { + manufacturer: self.system_manufacturer.clone(), + product_name: self.system_product_name.clone(), + version: self.system_version.clone(), + serial_number: self.system_serial_number.clone(), + uuid: self.system_uuid.clone(), + sku_number: self.system_sku_number.clone(), + family: self.system_family.clone(), + }); + + let chassis = + self.chassis_asset_tag + .clone() + .map(|asset_tag| arch::x86_64::SmbiosChassisConfig { + asset_tag: Some(asset_tag), + ..Default::default() + }); + let smbios = arch::x86_64::SmbiosConfig { - serial_number: self.serial_number.clone(), - uuid: self.uuid.clone(), + system, + chassis, oem_strings: self.oem_strings.clone(), }; - if smbios.serial_number.is_none() && smbios.uuid.is_none() && smbios.oem_strings.is_empty() - { + if smbios.system.is_none() && smbios.chassis.is_none() && smbios.oem_strings.is_empty() { None } else { Some(smbios) From 29fab1950ed02265ddc5aa9aac1d11068206527c Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Feb 2026 11:53:00 +0100 Subject: [PATCH 050/178] vmm: deprecate legacy SMBIOS keys in API and CLI Mark serial_number/uuid as deprecated in the OpenAPI schema and emit warnings when those legacy --platform keys are used, while continuing to accept them for compatibility. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/api/openapi/cloud-hypervisor.yaml | 2 ++ vmm/src/config.rs | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 27ee20b696..f1f5cfc824 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -788,10 +788,12 @@ components: type: string serial_number: type: string + deprecated: true system_uuid: type: string uuid: type: string + deprecated: true oem_strings: type: array items: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index e8527cb738..b22c010e75 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -993,6 +993,9 @@ impl PlatformConfig { let legacy_serial_number = parser .convert::("serial_number") .map_err(Error::ParsePlatform)?; + if legacy_serial_number.is_some() { + warn!("'serial_number' in --platform is deprecated; use 'system_serial_number'."); + } platform_config.system_serial_number = platform_config .system_serial_number .or(legacy_serial_number); @@ -1000,6 +1003,9 @@ impl PlatformConfig { let legacy_uuid = parser .convert::("uuid") .map_err(Error::ParsePlatform)?; + if legacy_uuid.is_some() { + warn!("'uuid' in --platform is deprecated; use 'system_uuid'."); + } platform_config.system_uuid = platform_config.system_uuid.or(legacy_uuid); #[cfg(feature = "tdx")] let tdx = parser From 9e3afb7d69a0117c575a0b86c74c88952434588e Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 10 Feb 2026 13:18:11 +0100 Subject: [PATCH 051/178] arch: smbios: add tests for table serialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add unit tests that walk the SMBIOS binary layout in guest memory and verify structure ordering, string-set encoding, and error paths. 
Tests added: - smbios_chassis_empty_string_set_has_double_null: verify that a chassis with no strings emits the double-NUL terminator required by SMBIOS DSP0134 §6.1.3. - smbios_chassis_oem_strings_layout: verify the full chain (BIOS → System → Chassis → OEM → End) when a chassis asset tag and OEM strings are configured. - smbios_strings_terminators_default: verify the default table chain (BIOS → System → End) and check that string indices and string-set contents match for both structures. - smbios_strings_too_many: exercise alloc_index up to the u8 limit (255 strings) and verify the 256th is rejected. - smbios_uuid_invalid_rejected: ensure a malformed UUID string is rejected with Error::ParseUuid. - smbios_uuid_written_le: ensure the UUID is stored in little-endian byte order as required by SMBIOS Spec 7.2.1. - smbios_write_fails_with_too_small_memory: verify that setup_smbios fails with Error::WriteData when guest memory is too small to hold anything beyond the entry point. All tests also succeed when run with miri: cargo +nightly miri test -p arch smbios On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- arch/src/x86_64/smbios.rs | 225 +++++++++++++++++++++++++++++++++++++- 1 file changed, 222 insertions(+), 3 deletions(-) diff --git a/arch/src/x86_64/smbios.rs b/arch/src/x86_64/smbios.rs index eacfd2a67f..a59c39fef3 100644 --- a/arch/src/x86_64/smbios.rs +++ b/arch/src/x86_64/smbios.rs @@ -443,8 +443,55 @@ pub fn setup_smbios(mem: &GuestMemoryMmap, smbios: Option<&SmbiosConfig>) -> Res mod unit_tests { use super::*; + /// Collects all strings after a SMBIOS structure, stopping at the double-NUL terminator and returns next addr. + fn read_string_set(mem: &GuestMemoryMmap, addr: GuestAddress) -> (Vec, GuestAddress) { + let mut cur = addr; + let read_byte = |addr: GuestAddress| -> u8 { mem.read_obj(addr).unwrap() }; + + // SMBIOS string-set: NUL-terminated strings, terminated by an extra NUL. + // Empty string-set is exactly "\0\0". 
+ if read_byte(cur) == 0 { + let next = cur.checked_add(1).unwrap(); + assert_eq!(read_byte(next), 0); + return (Vec::new(), next.checked_add(1).unwrap()); + } + + let mut strings = Vec::new(); + loop { + let mut bytes = Vec::new(); + loop { + let b = read_byte(cur); + cur = cur.checked_add(1).unwrap(); + if b == 0 { + break; + } + bytes.push(b); + } + strings.push(String::from_utf8(bytes).unwrap()); + + // If the next byte is NUL, that's the extra terminator. + if read_byte(cur) == 0 { + cur = cur.checked_add(1).unwrap(); + break; + } + } + + (strings, cur) + } + + #[test] + fn entrypoint_checksum() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + + setup_smbios(&mem, None).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + + assert_eq!(compute_checksum(&smbios_ep), 0); + } + #[test] - fn struct_size() { + fn entrypoint_struct_size() { assert_eq!( mem::size_of::(), 0x18usize, @@ -463,13 +510,185 @@ mod unit_tests { } #[test] - fn entrypoint_checksum() { + fn smbios_chassis_empty_string_set_has_double_null() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + let smbios = SmbiosConfig { + chassis: Some(SmbiosChassisConfig::default()), + ..Default::default() + }; + + setup_smbios(&mem, Some(&smbios)).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(sys.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let chassis: SmbiosChassis = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(chassis.length as u64).unwrap(); + // SMBIOS DSP0134 §6.1.3: 
empty string-set ends with double NUL. + let b0: u8 = mem.read_obj(cur).unwrap(); + let b1: u8 = mem.read_obj(cur.checked_add(1).unwrap()).unwrap(); + assert_eq!(b0, 0); + assert_eq!(b1, 0); + cur = cur.checked_add(2).unwrap(); + + let end: SmbiosEndOfTable = mem.read_obj(cur).unwrap(); + assert_eq!(end.r#type, END_OF_TABLE); + } + + #[test] + fn smbios_chassis_oem_strings_layout() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + + let smbios = SmbiosConfig { + chassis: Some(SmbiosChassisConfig { + asset_tag: Some("rack1".to_string()), + ..Default::default() + }), + oem_strings: vec!["o1".to_string(), "o2".to_string()], + ..Default::default() + }; + + setup_smbios(&mem, Some(&smbios)).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(sys.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let chassis: SmbiosChassis = mem.read_obj(cur).unwrap(); + assert_eq!(chassis.r#type, SYSTEM_ENCLOSURE); + assert_eq!(chassis.asset_tag, 1); + cur = cur.checked_add(chassis.length as u64).unwrap(); + let (chassis_strings, next) = read_string_set(&mem, cur); + assert_eq!(chassis_strings, vec!["rack1"]); + cur = next; + + let oem: SmbiosOemStrings = mem.read_obj(cur).unwrap(); + assert_eq!(oem.r#type, OEM_STRINGS); + assert_eq!(oem.count, 2); + cur = cur.checked_add(oem.length as u64).unwrap(); + let (oem_strings, next) = read_string_set(&mem, cur); + assert_eq!(oem_strings, vec!["o1", "o2"]); + cur = next; + + let end: SmbiosEndOfTable = mem.read_obj(cur).unwrap(); + assert_eq!(end.r#type, END_OF_TABLE); + } + + #[test] + fn 
smbios_strings_terminators_default() { let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); setup_smbios(&mem, None).unwrap(); let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + assert_eq!(bios.r#type, BIOS_INFORMATION); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (bios_strings, next) = read_string_set(&mem, cur); + assert_eq!(bios_strings, vec!["cloud-hypervisor", "0"]); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + assert_eq!(sys.r#type, SYSTEM_INFORMATION); + assert_eq!(sys.manufacturer, 1); + assert_eq!(sys.product_name, 2); + assert_eq!(sys.version, 0); + assert_eq!(sys.serial_number, 0); + assert_eq!(sys.sku, 0); + assert_eq!(sys.family, 0); + cur = cur.checked_add(sys.length as u64).unwrap(); + let (sys_strings, next) = read_string_set(&mem, cur); + assert_eq!( + sys_strings, + vec![DEFAULT_SYSTEM_MANUFACTURER, DEFAULT_SYSTEM_PRODUCT_NAME] + ); + cur = next; - assert_eq!(compute_checksum(&smbios_ep), 0); + let end: SmbiosEndOfTable = mem.read_obj(cur).unwrap(); + assert_eq!(end.r#type, END_OF_TABLE); + } + + #[test] + fn smbios_strings_too_many() { + let mut next = 1u8; + for _ in 0..255 { + alloc_index(&mut next, true).unwrap(); + } + let err = alloc_index(&mut next, true).unwrap_err(); + assert!(matches!(err, Error::TooManyStrings)); + } + + #[test] + fn smbios_uuid_invalid_rejected() { + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + let smbios = SmbiosConfig { + system: Some(SmbiosSystem { + uuid: Some("not-a-uuid".to_string()), + ..Default::default() + }), + ..Default::default() + }; + + let err = setup_smbios(&mem, Some(&smbios)).unwrap_err(); + assert!(matches!(err, Error::ParseUuid(_, _))); + } + + #[test] + fn smbios_uuid_written_le() { + let mem = 
GuestMemoryMmap::from_ranges(&[(GuestAddress(SMBIOS_START), 4096)]).unwrap(); + let uuid_str = "00112233-4455-6677-8899-aabbccddeeff"; + let smbios = SmbiosConfig { + system: Some(SmbiosSystem { + uuid: Some(uuid_str.to_string()), + ..Default::default() + }), + ..Default::default() + }; + + setup_smbios(&mem, Some(&smbios)).unwrap(); + + let smbios_ep: Smbios30Entrypoint = mem.read_obj(GuestAddress(SMBIOS_START)).unwrap(); + let mut cur = GuestAddress(smbios_ep.physptr); + + let bios: SmbiosBiosInfo = mem.read_obj(cur).unwrap(); + cur = cur.checked_add(bios.length as u64).unwrap(); + let (_, next) = read_string_set(&mem, cur); + cur = next; + + let sys: SmbiosSysInfo = mem.read_obj(cur).unwrap(); + assert_eq!(sys.uuid, Uuid::parse_str(uuid_str).unwrap().to_bytes_le()); + } + + #[test] + fn smbios_write_fails_with_too_small_memory() { + let mem = GuestMemoryMmap::from_ranges(&[( + GuestAddress(SMBIOS_START), + mem::size_of::(), + )]) + .unwrap(); + + let err = setup_smbios(&mem, None).unwrap_err(); + assert!(matches!(err, Error::WriteData)); + } } From 90d53efd4617bb2e102f12576cac191138f4679b Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Tue, 10 Feb 2026 16:48:32 +0100 Subject: [PATCH 052/178] vmm: Allow to modify host numa settings It is possible to migrate a VM to a host that might have a different NUMA configuration. In such a case, we need to adjust the mapping of guest memory to host nodes. This patch therefore adds a new field to `VmReceiveMigrationData` to allow providing the VMM with information about changing `MemoryZoneConfig`s.
Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com --- vmm/src/api/mod.rs | 11 ++++++++-- vmm/src/lib.rs | 45 ++++++++++++++++++++++++++++++++++++--- vmm/src/memory_manager.rs | 5 ++++- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 2c53493294..f994d73515 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -57,8 +57,8 @@ use crate::device_tree::DeviceTree; use crate::migration_transport::MAX_MIGRATION_CONNECTIONS; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, - UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, MemoryZoneConfig, NetConfig, + PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; /// API errors are sent back from the VMM API server through the ApiResponse. @@ -281,6 +281,9 @@ pub struct VmReceiveMigrationData { /// migration. Example: "192.168.1.1:2222". #[serde(default)] pub tcp_serial_url: Option, + /// Optional memory zone reconfiguration data. 
+ #[serde(default)] + pub zones: Vec, } #[derive(Debug, Error)] @@ -343,6 +346,7 @@ impl VmReceiveMigrationData { tls_dir: None, net_fds: vec![], tcp_serial_url: None, + zones: vec![], }; data.validate()?; @@ -377,6 +381,7 @@ impl VmReceiveMigrationData { tls_dir, net_fds: vec![], tcp_serial_url, + zones: vec![], }; data.validate()?; @@ -1997,6 +2002,7 @@ mod unit_tests { tls_dir: None, net_fds: vec![], tcp_serial_url: None, + zones: vec![], } ); @@ -2022,6 +2028,7 @@ mod unit_tests { tls_dir: Some(tls_dir), net_fds: vec![], tcp_serial_url: Some("1.2.3.4:6789".to_string()), + zones: vec![], } ); diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index f0b12cce69..e6fc77f79c 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -76,8 +76,8 @@ use crate::migration_transport::{ use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::vm::{Error as VmError, Vm, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, - UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, MemoryZoneConfig, NetConfig, + PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; mod acpi; @@ -926,6 +926,7 @@ impl Vmm { socket, memory_files, receive_data_migration.tcp_serial_url.clone(), + receive_data_migration.zones.clone(), )?; if !receive_data_migration.net_fds.is_empty() { @@ -1070,6 +1071,7 @@ impl Vmm { socket: &mut T, existing_memory_files: HashMap, tcp_serial_url: Option, + zones: Vec, ) -> std::result::Result>, MigratableError> where T: Read, @@ -1100,6 +1102,42 @@ impl Vmm { vm_config.serial.common.url = Some(tcp_serial_url); } + // Adopt host nodes. + if !zones.is_empty() { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + if let Some(config_zones) = &mut vm_config.memory.zones { + for zone in zones { + // We currently only support to move MemoryZones to different host nodes. 
We therefore ensure that + // there exists a memory zone in the new config that matches the same size and ID for each memory + // zone of the old config. + if let Some(matched_zone) = config_zones.iter_mut().find(|z| z.id == zone.id) { + if matched_zone.size != zone.size { + return Err(MigratableError::MigrateReceive(anyhow!( + "Size update of memory zone with ID {} not allowed. Tried to resize from {:018x?} to {:018x?}", + zone.id, + zone.size, + matched_zone.size + ))); + } + // Override the host numa node + matched_zone.host_numa_node = zone.host_numa_node; + } else { + // We did not find a match for a memory zone that was defined in the old config, so we cannot + // update it. + return Err(MigratableError::MigrateReceive(anyhow!( + "Failed to associate new memory zone information with ID {} to an existing zone", + zone.id + ))); + } + } + } else { + // MemoryZoneConfigs were provided but the initial config didn't contain any + return Err(MigratableError::MigrateReceive(anyhow!( + "Updating memory zone data is forbidden as VM was instantiated without any zones" + ))); + } + } + self.console_info = Some(pre_create_console_devices(self).map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error creating console devices: {e:?}")) })?); @@ -2602,11 +2640,12 @@ impl RequestHandler for Vmm { .map_err(MigratableError::MigrateReceive)?; info!( - "Receiving migration: receiver_url={},tls={},net_fds={:?}, tcp_url={:?}", + "Receiving migration: receiver_url={},tls={},net_fds={:?}, tcp_url={:?}, zones={:?}", receive_data_migration.receiver_url, receive_data_migration.tls_dir.is_some(), &receive_data_migration.net_fds, &receive_data_migration.tcp_serial_url, + &receive_data_migration.zones, ); let mut listener = migration_transport::receive_migration_listener( diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 61e21aad6b..41dbfc436c 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -1571,7 +1571,6 @@ impl MemoryManager { 
.filter(|r| r.2 == RegionType::Ram) .map(|r| (r.0, r.1)) .collect(); - let arch_mem_regions: Vec = arch_mem_regions .iter() .map(|(a, b, c)| ArchMemRegion { @@ -1967,6 +1966,10 @@ impl MemoryManager { // MPOL_BIND is the selected mode as it specifies a strict policy // that restricts memory allocation to the nodes specified in the // nodemask. + info!( + "Creating raw memory region: host-addr={:018x}, len={len}, mode={mode}, host-node={node}", + addr as u64 + ); Self::mbind(addr, len, mode, &nodemask, maxnode, flags) .map_err(Error::ApplyNumaPolicy)?; } From a904e9b9d9019e8f0ebcb5bbd206d157a998586d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 21 Nov 2025 10:48:15 +0100 Subject: [PATCH 053/178] vmm: migration: prepare EventFd for async migration events This is a pre-requisite for the following commit which puts the migration into a dedicated thread. It allows the VMM to react to migration events (success/failure). The commit series was inspired by @ljcore [0] but was changed quite significantly. 
[0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7038 Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index e6fc77f79c..b7d28abf14 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -276,6 +276,7 @@ pub enum EpollDispatch { ActivateVirtioDevices = 3, Debug = 4, GuestExit = 5, + CheckMigration = 6, Unknown, } @@ -289,6 +290,7 @@ impl From for EpollDispatch { 3 => ActivateVirtioDevices, 4 => Debug, 5 => GuestExit, + 6 => CheckMigration, _ => Unknown, } } @@ -655,6 +657,7 @@ pub struct Vmm { console_resize_pipe: Option>, console_info: Option, no_shutdown: bool, + check_migration_evt: EventFd, } /// Just a wrapper for the data that goes into @@ -827,6 +830,7 @@ impl Vmm { let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let guest_exit_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let activate_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; + let check_migration_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; epoll .add_event(&exit_evt, EpollDispatch::Exit) @@ -853,6 +857,10 @@ impl Vmm { .add_event(&debug_evt, EpollDispatch::Debug) .map_err(Error::Epoll)?; + epoll + .add_event(&check_migration_evt, EpollDispatch::CheckMigration) + .map_err(Error::Epoll)?; + Ok(Vmm { epoll, exit_evt, @@ -875,6 +883,7 @@ impl Vmm { console_resize_pipe: None, console_info: None, no_shutdown, + check_migration_evt, }) } @@ -1797,6 +1806,14 @@ impl Vmm { } } + /// Checks the migration result. + /// + /// This should be called when the migration thread indicated a state + /// change (and therefore, its termination). The function checks the result + /// of that thread and either shuts down the VMM on success or keeps the VM + /// and the VMM running on migration failure. 
+ fn check_migration_result(&mut self) {} + fn control_loop( &mut self, api_receiver: &Receiver, @@ -1897,6 +1914,14 @@ impl Vmm { } #[cfg(not(feature = "guest_debug"))] EpollDispatch::Debug => {} + EpollDispatch::CheckMigration => { + info!("VM migration check event"); + // Consume the event. + self.check_migration_evt + .read() + .map_err(Error::EventFdRead)?; + self.check_migration_result(); + } } } } From 034935aed4d41c0a8a9c6035b7b6fa46c2d3c8d7 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 21 Nov 2025 10:49:46 +0100 Subject: [PATCH 054/178] vmm: migration: handle in dedicated thread (make async) This puts the send-migration action into a dedicated thread. This means: 1. The send-migration call will return sooner (it only triggers the migration). 2. Other API calls will not be possible as the VM's ownership is transferred from the VMM to the migration thread. E.g., hotplugging won't work (which is good). 3. If the migration causes the VMM process to crash, this currently can't be observed. A mechanism to query the migration status doesn't exist. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 205 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 146 insertions(+), 59 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index b7d28abf14..f3112ec2a2 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -21,6 +21,7 @@ use std::panic::AssertUnwindSafe; use std::path::PathBuf; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; use std::sync::{Arc, Mutex}; +use std::thread::JoinHandle; use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; @@ -628,6 +629,69 @@ impl VmmVersionInfo { } } +/// Abstraction for the thread controlling and performing the live migration. +/// +/// The migration thread also takes ownership of the [`Vm`] from the [`Vmm`].
+struct MigrationWorker { + vm: Vm, + check_migration_evt: EventFd, + config: VmSendMigrationData, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: Arc, +} + +impl MigrationWorker { + /// Performs any final cleanup after failed live migrations. + /// + /// Helper for [`Self::migrate`]. + fn migrate_error_cleanup(&mut self) -> result::Result<(), MigratableError> { + // Stop logging dirty pages only for non-local migrations + if !self.config.local { + self.vm.stop_dirty_log()?; + } + + Ok(()) + } + + /// Migrate and cleanup. + fn migrate(&mut self) -> result::Result<(), MigratableError> { + debug!("start sending migration"); + event!("vm", "migration-started"); + Vmm::send_migration( + &mut self.vm, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.as_ref(), + &self.config, + ) + .inspect(|_| event!("vm", "migration-finished")) + .inspect_err(|_| { + event!("vm", "migration-failed"); + let e = self.migrate_error_cleanup(); + if let Err(e) = e { + error!("Failed to clean up after a failed live migration. VM might keep running but in an odd or possibly slowed-down state: {e}"); + } + })?; + + Ok(()) + } + + /// Perform the migration and communicate with the [`Vmm`] thread. + fn run(mut self) -> MigrationThreadOut { + debug!("migration thread is starting"); + + let res = self.migrate().inspect_err(|e| error!("migrate error: {e}")); + + // Notify VMM thread to get migration result by joining this thread. + self.check_migration_evt.write(1).unwrap(); + + debug!("migration thread is finished"); + MigrationThreadOut { + vm: self.vm, + migration_res: res, + } + } +} + pub struct VmmThreadHandle { pub thread_handle: thread::JoinHandle>, #[cfg(feature = "dbus_api")] @@ -635,6 +699,12 @@ pub struct VmmThreadHandle { pub http_api_handle: Option, } +/// Output value of [`MigrationWorker`]. 
+struct MigrationThreadOut { + vm: Vm, + migration_res: result::Result<(), MigratableError>, +} + pub struct Vmm { epoll: EpollContext, exit_evt: EventFd, @@ -646,6 +716,11 @@ pub struct Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, version: VmmVersionInfo, + /// The currently running [`Vm`] instance, if any. + /// + /// This is `Some` from the boot to the shutdown of a VM. In the special + /// case of an ongoing live-migration, this is temporarily `None` and held + /// by a guard to prevent modifications to the VM. vm: Option, vm_config: Option>>, seccomp_action: SeccompAction, @@ -658,6 +733,10 @@ pub struct Vmm { console_info: Option, no_shutdown: bool, check_migration_evt: EventFd, + /// Handle to the [`MigrationWorker`] thread. + /// + /// The handle will return the [`Vm`] back in any case. Further, the underlying error (if any) is returned. + migration_thread_handle: Option>, } /// Just a wrapper for the data that goes into @@ -776,14 +855,14 @@ impl Vmm { .name("vmm_signal_handler".to_string()) .spawn(move || { if !signal_handler_seccomp_filter.is_empty() && let Err(e) = apply_filter(&signal_handler_seccomp_filter) - .map_err(Error::ApplySeccompFilter) - { - error!("Error applying seccomp filter: {e:?}"); - exit_evt.write(1).ok(); - return; - } + .map_err(Error::ApplySeccompFilter) + { + error!("Error applying seccomp filter: {e:?}"); + exit_evt.write(1).ok(); + return; + } - if landlock_enable{ + if landlock_enable { match Landlock::new() { Ok(landlock) => { let _ = landlock.restrict_self().map_err(Error::ApplyLandlock).map_err(|e| { @@ -801,11 +880,11 @@ impl Vmm { std::panic::catch_unwind(AssertUnwindSafe(|| { Vmm::signal_handler(signals, original_termios_opt.as_ref(), &exit_evt); })) - .map_err(|_| { - error!("vmm signal_handler thread panicked"); - exit_evt.write(1).ok() - }) - .ok(); + .map_err(|_| { + error!("vmm signal_handler thread panicked"); + exit_evt.write(1).ok() + }) + .ok(); }) .map_err(Error::SignalHandlerSpawn)?, ); @@ 
-884,6 +963,7 @@ impl Vmm { console_info: None, no_shutdown, check_migration_evt, + migration_thread_handle: None, }) } @@ -1516,13 +1596,17 @@ impl Vmm { Ok(()) } - /// Performs a migration including all its phases. + /// Performs a live-migration. + /// + /// This function performs necessary after-migration cleanup only in the + /// good case. Callers are responsible for properly handling failed + /// migrations. + #[allow(unused_assignments)] // TODO remove fn send_migration( vm: &mut Vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: &dyn hypervisor::Hypervisor, send_data_migration: &VmSendMigrationData, - initial_vm_state: VmState, ) -> result::Result<(), MigratableError> { // State machine that is updated with more context as we progress. let mut ctx = OngoingMigrationContext::new(); @@ -1654,15 +1738,10 @@ impl Vmm { // When this returns, we know the VM was resumed (if it was running // before the migration) and that the receiving VMM acquired disk // locks again. - let complete_req = if initial_vm_state == VmState::Running { - Request::complete() - } else { - Request::complete_paused() - }; let (_, complete_duration) = measure_ok(|| { migration_transport::send_request_expect_ok( &mut socket, - complete_req, + Request::complete(), MigratableError::MigrateSend(anyhow!("Error completing migration")), ) })?; @@ -1812,7 +1891,31 @@ impl Vmm { /// change (and therefore, its termination). The function checks the result /// of that thread and either shuts down the VMM on success or keeps the VM /// and the VMM running on migration failure. - fn check_migration_result(&mut self) {} + fn check_migration_result(&mut self) { + // At this point, the thread must be finished. + // If we fail here, we have lost anyway. Just panic. + let MigrationThreadOut { vm, migration_res } = self + .migration_thread_handle + .take() + .expect("should have thread") + .join() + .expect("should have joined"); + + // Give VMM back control. 
+ self.vm = Some(vm); + + match migration_res { + Ok(()) => { + // Shutdown the VM after the migration succeeded + if let Err(e) = self.exit_evt.write(1) { + error!("Failed shutting down the VM after migration: {e}"); + } + } + Err(e) => { + error!("Migration failed: {e}"); + } + } + } fn control_loop( &mut self, @@ -2741,6 +2844,9 @@ impl RequestHandler for Vmm { send_data_migration.timeout_strategy ); + // TODO Check if there is already a migration in progress + // will be done in next commit + if !self .vm_config .as_ref() @@ -2755,10 +2861,12 @@ impl RequestHandler for Vmm { ))); } + // Take VM ownership. This also means that API events can no longer + // change the VM (e.g. net device hotplug). let vm = self .vm - .as_mut() - .ok_or_else(|| MigratableError::MigrateSend(anyhow!("VM is not running")))?; + .take() + .ok_or(MigratableError::MigrateSend(anyhow!("VM is not running")))?; let initial_vm_state = vm.get_state(); if initial_vm_state != VmState::Running && initial_vm_state != VmState::Paused { @@ -2767,45 +2875,24 @@ impl RequestHandler for Vmm { ))); } - event!("vm", "migration-started"); - Self::send_migration( + // Start migration thread + let worker = MigrationWorker { vm, + check_migration_evt: self.check_migration_evt.try_clone().unwrap(), + config: send_data_migration, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.as_ref(), - &send_data_migration, - initial_vm_state, - ) - .map_err(|migration_err| { - error!("Migration failed: {migration_err:?}"); - event!("vm", "migration-failed"); - - // Stop logging dirty pages only for non-local migrations - if !send_data_migration.local - && let Err(e) = vm.stop_dirty_log() - { - return e; - } - - // Only resume if the VM was originally running; a VM that was already - // paused before migration should remain paused after failure. 
- if initial_vm_state == VmState::Running - && vm.get_state() == VmState::Paused - && let Err(e) = vm.resume() - { - return e; - } - - migration_err - })?; - - event!("vm", "migration-finished"); + hypervisor: self.hypervisor.clone(), + }; - // Shutdown the VM after the migration succeeded - self.exit_evt.write(1).map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Failed shutting down the VM after migration: {e:?}" - )) - }) + self.migration_thread_handle = Some( + thread::Builder::new() + .name("migration".into()) + .spawn(move || worker.run()) + // For upstreaming, we should simply continue and return an + // error when this fails. For our PoC, this is fine. + .unwrap(), + ); + Ok(()) } } From e3977923b2e47fb6c3d86477729bc41b36b5ab7e Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 30 Oct 2025 12:10:51 +0100 Subject: [PATCH 055/178] vmm: better VM ownership handling in context of live migration The commit prepares to properly handle API events during ongoing live-migrations. The VmInfo call is currently not working when a VM is migrating. This will be addressed in a follow-up as part of migration statistics about ongoing live-migrations.
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 776 +++++++++++++++++++++++++++---------------------- vmm/src/vm.rs | 3 + 2 files changed, 432 insertions(+), 347 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index f3112ec2a2..a9b1b0989a 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -25,7 +25,7 @@ use std::thread::JoinHandle; use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; -use std::{io, result, thread}; +use std::{io, mem, result, thread}; use anyhow::{Context, anyhow}; #[cfg(feature = "dbus_api")] @@ -45,7 +45,6 @@ use serde::ser::{SerializeStruct, Serializer}; use serde::{Deserialize, Serialize}; use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; -use tracer::trace_scoped; use vm_memory::GuestMemoryAtomic; use vm_memory::bitmap::AtomicBitmap; use vm_migration::protocol::*; @@ -699,6 +698,41 @@ pub struct VmmThreadHandle { pub http_api_handle: Option, } +/// Describes the current ownership of a running VM. +#[allow(clippy::large_enum_variant)] +pub enum MaybeVmOwnership { + /// The VMM holds the ownership of the VM. + Vmm(Vm), + /// The VM is temporarily blocked by the current ongoing migration. + Migration, + /// No VM is running. + None, +} + +impl MaybeVmOwnership { + /// Takes the VM and replaces it with [`Self::Migration`]. + /// + /// # Panics + /// This method panics if `self` is not [`Self::Vmm`]. + fn take_vm_for_migration(&mut self) -> Vm { + if !matches!(self, Self::Vmm(_)) { + panic!("should only be called when a migration can start"); + } + + match mem::replace(self, Self::Migration) { + MaybeVmOwnership::Vmm(vm) => vm, + _ => unreachable!(), + } + } + + fn vm_mut(&mut self) -> Option<&mut Vm> { + match self { + MaybeVmOwnership::Vmm(vm) => Some(vm), + _ => None, + } + } +} + /// Output value of [`MigrationWorker`]. 
struct MigrationThreadOut { vm: Vm, @@ -716,12 +750,7 @@ pub struct Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, version: VmmVersionInfo, - /// The currently running [`Vm`] instance, if any. - /// - /// This is `Some` from the boot to the shutdown of a VM. In the special - /// case of an ongoing live-migration, this is temporarily `None` and held - /// by a guard to prevent modifications to the VM. - vm: Option, + vm: MaybeVmOwnership, vm_config: Option>>, seccomp_action: SeccompAction, hypervisor: Arc, @@ -951,7 +980,7 @@ impl Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt, version: vmm_version, - vm: None, + vm: MaybeVmOwnership::None, vm_config: None, seccomp_action, hypervisor, @@ -1128,7 +1157,7 @@ impl Vmm { Command::Complete => { // The unwrap is safe, because the state machine makes sure we called // vm_receive_state before, which creates the VM. - let vm = self.vm.as_mut().unwrap(); + let vm = self.vm.vm_mut().unwrap(); let (_, resume_duration) = measure_ok(|| vm.resume())?; debug!( "Migration (incoming): resume:{}ms", @@ -1368,7 +1397,7 @@ impl Vmm { Ok(vm) })?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); Ok((receive_duration, restore_duration)) } @@ -1815,6 +1844,10 @@ impl Vmm { prefault: bool, memory_restore_mode: MemoryRestoreMode, ) -> std::result::Result<(), VmError> { + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } + let snapshot = recv_vm_state(source_url).map_err(VmError::Restore)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let vm_snapshot = get_vm_snapshot(&snapshot).map_err(VmError::Restore)?; @@ -1863,7 +1896,7 @@ impl Vmm { Some(prefault), Some(memory_restore_mode), )?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); if self .vm_config @@ -1878,11 +1911,8 @@ impl Vmm { } // Now we can restore the rest of the VM. 
- if let Some(ref mut vm) = self.vm { - vm.restore() - } else { - Err(VmError::VmNotCreated) - } + // PANIC: won't panic, we just checked that the VM is there. + self.vm.vm_mut().unwrap().restore() } /// Checks the migration result. @@ -1901,11 +1931,11 @@ impl Vmm { .join() .expect("should have joined"); - // Give VMM back control. - self.vm = Some(vm); - match migration_res { Ok(()) => { + self.vm = MaybeVmOwnership::None; + drop(vm); + // Shutdown the VM after the migration succeeded if let Err(e) = self.exit_evt.write(1) { error!("Failed shutting down the VM after migration: {e}"); @@ -1913,6 +1943,9 @@ impl Vmm { } Err(e) => { error!("Migration failed: {e}"); + + // Give VMM back control. + self.vm = MaybeVmOwnership::Vmm(vm); } } } @@ -1977,7 +2010,7 @@ impl Vmm { } } EpollDispatch::ActivateVirtioDevices => { - if let Some(ref vm) = self.vm { + if let MaybeVmOwnership::Vmm(ref vm) = self.vm { let count = self.activate_evt.read().map_err(Error::EventFdRead)?; info!("Trying to activate pending virtio devices: count = {count}"); vm.activate_virtio_devices() @@ -2002,7 +2035,7 @@ impl Vmm { // Read from the API receiver channel let gdb_request = gdb_receiver.recv().map_err(Error::GdbRequestRecv)?; - let response = if let Some(ref mut vm) = self.vm { + let response = if let MaybeVmOwnership::Vmm(ref mut vm) = self.vm { vm.debug_request(&gdb_request.payload, gdb_request.cpu_id) } else { Err(VmError::VmNotRunning) @@ -2075,108 +2108,110 @@ impl RequestHandler for Vmm { tracer::start(); info!("Booting VM"); event!("vm", "booting"); - let r = { - trace_scoped!("vm_boot"); - // If we don't have a config, we cannot boot a VM. - if self.vm_config.is_none() { - return Err(VmError::VmMissingConfig); - } - // console_info is set to None in vm_shutdown. 
re-populate here if empty - if self.console_info.is_none() { - self.console_info = - Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); - } + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } - // Create a new VM if we don't have one yet. - if self.vm.is_none() { - let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; - let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; - let guest_exit_evt = self - .guest_exit_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - #[cfg(feature = "guest_debug")] - let vm_debug_evt = self - .vm_debug_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - let activate_evt = self - .activate_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - - if let Some(ref vm_config) = self.vm_config { - let vm = Vm::new( - Arc::clone(vm_config), - exit_evt, - reset_evt, - guest_exit_evt, - #[cfg(feature = "guest_debug")] - vm_debug_evt, - &self.seccomp_action, - self.hypervisor.clone(), - activate_evt, - self.console_info.clone(), - self.console_resize_pipe.clone(), - Arc::clone(&self.original_termios_opt), - None, - None, - None, - None, - )?; - - self.vm = Some(vm); - } + // Create a new VM if we don't have one yet. 
+ if matches!(self.vm, MaybeVmOwnership::None) { + let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; + let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + #[cfg(feature = "guest_debug")] + let vm_debug_evt = self + .vm_debug_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + let activate_evt = self + .activate_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + + if let Some(ref vm_config) = self.vm_config { + let vm = Vm::new( + Arc::clone(vm_config), + exit_evt, + reset_evt, + guest_exit_evt, + #[cfg(feature = "guest_debug")] + vm_debug_evt, + &self.seccomp_action, + self.hypervisor.clone(), + activate_evt, + self.console_info.clone(), + self.console_resize_pipe.clone(), + Arc::clone(&self.original_termios_opt), + None, + None, + None, + None, + )?; + + self.vm = MaybeVmOwnership::Vmm(vm); } + } - // Now we can boot the VM. - if let Some(ref mut vm) = self.vm { - vm.boot() - } else { - Err(VmError::VmNotCreated) + // Now we can boot the VM. 
+ match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.boot()?; + event!("vm", "booted"); } - }; - tracer::end(); - if r.is_ok() { - event!("vm", "booted"); + MaybeVmOwnership::None => { + return Err(VmError::VmNotCreated); + } + _ => unreachable!(), } - r + + tracer::end(); + Ok(()) } fn vm_pause(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.pause().map_err(VmError::Pause) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.pause().map_err(VmError::Pause), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_resume(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.resume().map_err(VmError::Resume) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.resume().map_err(VmError::Resume), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_snapshot(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - // Drain console_info so that FDs are not reused - let _ = self.console_info.take(); - vm.snapshot() - .map_err(VmError::Snapshot) - .and_then(|snapshot| { - vm.send(&snapshot, destination_url) - .map_err(VmError::SnapshotSend) - }) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + // Drain console_info so that FDs are not reused + let _ = self.console_info.take(); + vm.snapshot() + .map_err(VmError::Snapshot) + .and_then(|snapshot| { + vm.send(&snapshot, destination_url) + .map_err(VmError::SnapshotSend) + }) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> result::Result<(), VmError> { - if self.vm.is_some() || 
self.vm_config.is_some() { + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => return Err(VmError::VmAlreadyCreated), + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => (), + } + + if self.vm_config.is_some() { return Err(VmError::VmAlreadyCreated); } @@ -2236,21 +2271,25 @@ impl RequestHandler for Vmm { #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] fn vm_coredump(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.coredump(destination_url).map_err(VmError::Coredump) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.coredump(destination_url).map_err(VmError::Coredump) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_shutdown(&mut self) -> result::Result<(), VmError> { - let r = if let Some(ref mut vm) = self.vm.take() { - // Drain console_info so that the FDs are not reused - let _ = self.console_info.take(); - vm.shutdown() - } else { - Err(VmError::VmNotRunning) + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + // Drain console_info so that the FDs are not reused + let _ = self.console_info.take(); + let r = vm.shutdown(); + self.vm = MaybeVmOwnership::None; if r.is_ok() { event!("vm", "shutdown"); @@ -2263,13 +2302,14 @@ impl RequestHandler for Vmm { event!("vm", "rebooting"); // First we stop the current VM - let config = if let Some(mut vm) = self.vm.take() { - let config = vm.get_config(); - vm.shutdown()?; - config - } else { - return Err(VmError::VmNotCreated); + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), 
}; + let config = vm.get_config(); + vm.shutdown()?; + self.vm = MaybeVmOwnership::None; // vm.shutdown() closes all the console devices, so set console_info to None // so that the closed FD #s are not reused. @@ -2324,7 +2364,7 @@ impl RequestHandler for Vmm { // And we boot it vm.boot()?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); event!("vm", "rebooted"); @@ -2332,35 +2372,40 @@ impl RequestHandler for Vmm { } fn vm_info(&self) -> result::Result { - match &self.vm_config { - Some(vm_config) => { - let state = match &self.vm { - Some(vm) => vm.get_state(), - None => VmState::Created, - }; - let config = vm_config.lock().unwrap().clone(); - - let mut memory_actual_size = - config.memory.total_size() - config.memory.hotplugged_size(); - if let Some(vm) = &self.vm { - memory_actual_size = memory_actual_size.saturating_sub(vm.balloon_size()); - memory_actual_size += vm.virtio_mem_plugged_size(); - } - - let device_tree = self - .vm - .as_ref() - .map(|vm| vm.device_tree().lock().unwrap().clone()); + let vm_config = self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + let vm_config = vm_config.lock().unwrap().clone(); + + let state = match &self.vm { + MaybeVmOwnership::Vmm(vm) => vm.get_state(), + // TODO in theory one could live-migrate a non-running VM .. 
+ MaybeVmOwnership::Migration => VmState::Running, + MaybeVmOwnership::None => VmState::Created, + }; - Ok(VmInfoResponse { - config: Box::new(config), - state, - memory_actual_size, - device_tree, - }) + let mut memory_actual_size = + vm_config.memory.total_size() - vm_config.memory.hotplugged_size(); + match &self.vm { + MaybeVmOwnership::Vmm(vm) => { + memory_actual_size = memory_actual_size.saturating_sub(vm.balloon_size()); + memory_actual_size += vm.virtio_mem_plugged_size(); } - None => Err(VmError::VmNotCreated), + MaybeVmOwnership::Migration => {} + MaybeVmOwnership::None => {} } + + let device_tree = match &self.vm { + MaybeVmOwnership::Vmm(vm) => Some(vm.device_tree().lock().unwrap().clone()), + // TODO we need to fix this + MaybeVmOwnership::Migration => None, + MaybeVmOwnership::None => None, + }; + + Ok(VmInfoResponse { + config: Box::new(vm_config), + state, + memory_actual_size, + device_tree, + }) } fn vmm_ping(&self) -> VmmPingResponse { @@ -2382,14 +2427,19 @@ impl RequestHandler for Vmm { return Ok(()); } - // If a VM is booted, we first try to shut it down. - if self.vm.is_some() { - self.vm_shutdown()?; - } - - self.vm_config = None; + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => { + event!("vm", "deleted"); - event!("vm", "deleted"); + // If a VM is booted, we first try to shut it down. 
+ self.vm_shutdown()?; + self.vm_config = None; + } + MaybeVmOwnership::None => { + self.vm_config = None; + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + } Ok(()) } @@ -2412,59 +2462,68 @@ impl RequestHandler for Vmm { todo!("doesn't work currently with our thread-local KVM_RUN approach"); } - if let Some(ref mut vm) = self.vm { - vm.resize(desired_vcpus, desired_ram, desired_balloon) - .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; - Ok(()) - } else { - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - if let Some(desired_vcpus) = desired_vcpus { - config.cpus.boot_vcpus = desired_vcpus; - } - if let Some(desired_ram) = desired_ram { - config.memory.size = desired_ram; + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.resize(desired_vcpus, desired_ram, desired_balloon) + .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; + Ok(()) } - if let Some(desired_balloon) = desired_balloon - && let Some(balloon_config) = &mut config.balloon - { - balloon_config.size = desired_balloon; + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + if let Some(desired_vcpus) = desired_vcpus { + config.cpus.boot_vcpus = desired_vcpus; + } + if let Some(desired_ram) = desired_ram { + config.memory.size = desired_ram; + } + if let Some(desired_balloon) = desired_balloon + && let Some(balloon_config) = &mut config.balloon + { + balloon_config.size = desired_balloon; + } + + Ok(()) } - Ok(()) } } fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - return vm.resize_disk(&id, desired_size); + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.resize_disk(&id, desired_size), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => 
Err(VmError::ResizeDisk), } - - Err(VmError::ResizeDisk) } fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - vm.resize_zone(&id, desired_ram) - .inspect_err(|e| error!("Error when resizing zone: {e:?}"))?; - Ok(()) - } else { - // Update VmConfig by setting the new desired ram. - let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; - - if let Some(zones) = &mut memory_config.zones { - for zone in zones.iter_mut() { - if zone.id == id { - zone.size = desired_ram; - return Ok(()); + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.resize_zone(&id, desired_ram) + .inspect_err(|e| error!("Error when resizing zone: {e:?}"))?; + Ok(()) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by setting the new desired ram. + let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; + + if let Some(zones) = &mut memory_config.zones { + for zone in zones.iter_mut() { + if zone.id == id { + zone.size = desired_ram; + return Ok(()); + } } } - } - error!("Could not find the memory zone {id} for the resize"); - Err(VmError::ResizeZone) + error!("Could not find the memory zone {id} for the resize"); + Err(VmError::ResizeZone) + } } } @@ -2481,18 +2540,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_device(device_cfg).inspect_err(|e| { - error!("Error when adding new device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. 
- let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_device(device_cfg).inspect_err(|e| { + error!("Error when adding new device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.devices, device_cfg); + Ok(None) + } } } @@ -2509,35 +2572,45 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_user_device(device_cfg).inspect_err(|e| { - error!("Error when adding new user device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.user_devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_user_device(device_cfg).inspect_err(|e| { + error!("Error when adding new user device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.user_devices, device_cfg); + Ok(None) + } } } fn vm_remove_device(&mut self, id: String) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.remove_device(&id) - .inspect_err(|e| error!("Error when removing device from the VM: {e:?}"))?; - Ok(()) - } else if let Some(ref config) = self.vm_config { - let mut config = config.lock().unwrap(); - if config.remove_device(&id) { + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.remove_device(&id) + .inspect_err(|e| error!("Error when removing device from the VM: {e:?}"))?; Ok(()) - } else { - Err(VmError::NoDeviceToRemove(id)) } - } else { - Err(VmError::VmNotCreated) + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + if let Some(ref config) = self.vm_config { + let mut config = config.lock().unwrap(); + if config.remove_device(&id) { + Ok(()) + } else { + Err(VmError::NoDeviceToRemove(id)) + } + } else { + Err(VmError::VmNotCreated) + } + } } } @@ -2551,18 +2624,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_disk(disk_cfg).inspect_err(|e| { - error!("Error when adding new disk to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.disks, disk_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_disk(disk_cfg).inspect_err(|e| { + error!("Error when adding new disk to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.disks, disk_cfg); + Ok(None) + } } } @@ -2576,52 +2653,32 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_fs(fs_cfg).inspect_err(|e| { - error!("Error when adding new fs to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.fs, fs_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_fs(fs_cfg).inspect_err(|e| { + error!("Error when adding new fs to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.fs, fs_cfg); + Ok(None) + } } } fn vm_add_generic_vhost_user( &mut self, - generic_vhost_user_cfg: GenericVhostUserConfig, + _generic_vhost_user_cfg: GenericVhostUserConfig, ) -> result::Result>, VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - { - // Validate the configuration change in a cloned configuration - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap().clone(); - add_to_config( - &mut config.generic_vhost_user, - generic_vhost_user_cfg.clone(), - ); - config.validate().map_err(VmError::ConfigValidation)?; - } - - if let Some(ref mut vm) = self.vm { - let info = vm - .add_generic_vhost_user(generic_vhost_user_cfg) - .inspect_err(|e| { - error!("Error when adding new generic vhost-user device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.generic_vhost_user, generic_vhost_user_cfg); - Ok(None) - } + unimplemented!("removed in our fork for simplicity"); } fn vm_add_pmem(&mut self, pmem_cfg: PmemConfig) -> result::Result>, VmError> { @@ -2634,18 +2691,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_pmem(pmem_cfg).inspect_err(|e| { - error!("Error when adding new pmem device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. 
- let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.pmem, pmem_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_pmem(pmem_cfg).inspect_err(|e| { + error!("Error when adding new pmem device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.pmem, pmem_cfg); + Ok(None) + } } } @@ -2659,18 +2720,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_net(net_cfg).inspect_err(|e| { - error!("Error when adding new network device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.net, net_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_net(net_cfg).inspect_err(|e| { + error!("Error when adding new network device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.net, net_cfg); + Ok(None) + } } } @@ -2684,18 +2749,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vdpa(vdpa_cfg).inspect_err(|e| { - error!("Error when adding new vDPA device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.vdpa, vdpa_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vdpa(vdpa_cfg).inspect_err(|e| { + error!("Error when adding new vDPA device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.vdpa, vdpa_cfg); + Ok(None) + } } } @@ -2714,47 +2783,53 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vsock(vsock_cfg).inspect_err(|e| { - error!("Error when adding new vsock device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. 
- let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - config.vsock = Some(vsock_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vsock(vsock_cfg).inspect_err(|e| { + error!("Error when adding new vsock device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + config.vsock = Some(vsock_cfg); + Ok(None) + } } } fn vm_counters(&mut self) -> result::Result>, VmError> { - if let Some(ref mut vm) = self.vm { - let info = vm.counters().inspect_err(|e| { - error!("Error when getting counters from the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.counters().inspect_err(|e| { + error!("Error when getting counters from the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_power_button(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.power_button() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.power_button(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_nmi(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.nmi() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.nmi(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => 
Err(VmError::VmNotRunning), } } @@ -2816,7 +2891,7 @@ impl RequestHandler for Vmm { if let ReceiveMigrationState::Aborted = state { event!("vm", "migration-receive-failed"); - self.vm = None; + self.vm = MaybeVmOwnership::None; self.vm_config = None; } else { event!("vm", "migration-receive-finished"); @@ -2834,6 +2909,18 @@ impl RequestHandler for Vmm { .context("Invalid send migration configuration") .map_err(MigratableError::MigrateSend)?; + match self.vm { + MaybeVmOwnership::Vmm(_) => (), + MaybeVmOwnership::Migration => { + return Err(MigratableError::MigrateSend(anyhow!( + "There is already an ongoing migration" + ))); + } + MaybeVmOwnership::None => { + return Err(MigratableError::MigrateSend(anyhow!("VM is not running"))); + } + } + info!( "Sending migration: destination_url={},local={},tls={},downtime={}ms,timeout={}s,timeout_strategy={:?}", send_data_migration.destination_url, @@ -2844,9 +2931,6 @@ impl RequestHandler for Vmm { send_data_migration.timeout_strategy ); - // TODO Check if there is already a migration in progress - // will be done in next commit - if !self .vm_config .as_ref() @@ -2863,10 +2947,7 @@ impl RequestHandler for Vmm { // Take VM ownership. This also means that API events can no longer // change the VM (e.g. net device hotplug). 
- let vm = self - .vm - .take() - .ok_or(MigratableError::MigrateSend(anyhow!("VM is not running")))?; + let vm = self.vm.take_vm_for_migration(); let initial_vm_state = vm.get_state(); if initial_vm_state != VmState::Running && initial_vm_state != VmState::Paused { @@ -3218,6 +3299,7 @@ mod unit_tests { ); } + #[ignore] // skipped in our fork for simplicity #[test] fn test_vmm_vm_cold_add_generic_vhost_user() { let mut vmm = create_dummy_vmm(); diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index b408a8fb99..f28c9cddb5 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -199,6 +199,9 @@ pub enum Error { #[error("VM is not running")] VmNotRunning, + #[error("VM is currently migrating and can't be modified")] + VmMigrating, + #[error("Cannot clone EventFd")] EventFdClone(#[source] io::Error), From f7303f34a9e6edec9e1f642cb43cb3a03e328fbc Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 30 Oct 2025 12:23:25 +0100 Subject: [PATCH 056/178] vmm: api: temporarily make VmSendMigration call blocking again Once we have a mechanism to query the progress of an ongoing live-migration, we can remove this workaround. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/api/http/http_endpoint.rs | 55 +++++++++++++++++++++++++++++-- vmm/src/lib.rs | 26 ++++++++++++++- vmm/src/seccomp_filters.rs | 3 ++ 3 files changed, 81 insertions(+), 3 deletions(-) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 980b3e067a..abea2ba51a 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -35,11 +35,23 @@ //! 
[special HTTP library]: https://github.com/firecracker-microvm/micro-http use std::fs::File; -use std::sync::mpsc::Sender; +use std::sync::mpsc::{Receiver, Sender, SyncSender}; +use std::sync::{LazyLock, Mutex}; +use log::info; use micro_http::{Body, Method, Request, Response, StatusCode, Version}; use vmm_sys_util::eventfd::EventFd; +/// Helper to make the VmSendMigration call blocking as long as a migration is ongoing. +#[allow(clippy::type_complexity)] +pub static ONGOING_LIVEMIGRATION: LazyLock<( + SyncSender>, + Mutex>>, +)> = LazyLock::new(|| { + let (sender, receiver) = std::sync::mpsc::sync_channel(0); + (sender, Mutex::new(receiver)) +}); + #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::http::http_endpoint::fds_helper::{attach_fds_to_cfg, attach_fds_to_cfgs}; @@ -429,7 +441,6 @@ vm_action_put_handler_body!(VmRemoveDevice); vm_action_put_handler_body!(VmResizeDisk); vm_action_put_handler_body!(VmResizeZone); vm_action_put_handler_body!(VmSnapshot); -vm_action_put_handler_body!(VmSendMigration); #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] vm_action_put_handler_body!(VmCoredump); @@ -486,6 +497,46 @@ impl PutHandler for VmReceiveMigration { impl GetHandler for VmReceiveMigration {} +// Special Handling for VmSendMigration: Blocking Until the Migration Finished +// +// The handler waits for the migration result sent via ONGOING_LIVEMIGRATION. 
+impl PutHandler for VmSendMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let res = self + .send( + api_notifier, + api_sender, + serde_json::from_slice(body.raw())?, + ) + .map_err(HttpError::ApiError)?; + + info!("live migration started"); + + let (_, receiver) = &*ONGOING_LIVEMIGRATION; + + info!("waiting for live migration result"); + let mig_res = receiver.lock().unwrap().recv().unwrap(); + info!("received live migration result"); + + // We forward the migration error here to the API caller + mig_res + .map(|_| res) + .map_err(|e| HttpError::ApiError(ApiError::VmSendMigration(e))) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmSendMigration {} + impl PutHandler for VmResize { fn handle_request( &'static self, diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index a9b1b0989a..06e068b914 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -56,6 +56,7 @@ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; +use crate::api::http::http_endpoint::ONGOING_LIVEMIGRATION; use crate::api::{ ApiRequest, ApiResponse, RequestHandler, TimeoutStrategy, VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, @@ -1936,6 +1937,13 @@ impl Vmm { self.vm = MaybeVmOwnership::None; drop(vm); + { + info!("Sending Receiver in HTTP thread that migration succeeded"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Ok(())).unwrap(); + } + // Shutdown the VM after the migration succeeded if let Err(e) = self.exit_evt.write(1) { error!("Failed shutting down the VM after migration: {e}"); @@ -1946,6 +1954,14 @@ impl Vmm { // Give VMM back control. 
self.vm = MaybeVmOwnership::Vmm(vm); + + { + info!("Sending Receiver in HTTP thread that migration failed"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Err(e)).unwrap(); + } + // we don't fail the VMM here, it just continues running its VM } } } @@ -2489,10 +2505,18 @@ impl RequestHandler for Vmm { } fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> result::Result<(), VmError> { + info!("request to resize disk: id={id}"); self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; match self.vm { - MaybeVmOwnership::Vmm(ref mut vm) => vm.resize_disk(&id, desired_size), + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize_disk(&id, desired_size) { + error!("Error when resizing disk: {e:?}"); + Err(e) + } else { + Ok(()) + } + } MaybeVmOwnership::Migration => Err(VmError::VmMigrating), MaybeVmOwnership::None => Err(VmError::ResizeDisk), } diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index fb5aaedf9f..bd85b0776e 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -976,6 +976,9 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_sendto, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), + (libc::SYS_rt_sigprocmask, vec![]), + (libc::SYS_getcwd, vec![]), + (libc::SYS_clock_nanosleep, vec![]), ]) } From 3b3eda17bf0eb3de1103baa64ebaf8ed4043ad78 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Feb 2026 17:38:26 +0100 Subject: [PATCH 057/178] vmm: migration: fix missing resume() VM after failed live migration If a failure happens fairly late in the migration, the VM will remain unusable. This commit uses the generic migration result check code path to resume() the VM when the VM was running before as well. I could nicely test various scenarios via `ch-remote`. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 06e068b914..ef07dc421a 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1925,7 +1925,10 @@ impl Vmm { fn check_migration_result(&mut self) { // At this point, the thread must be finished. // If we fail here, we have lost anyway. Just panic. - let MigrationThreadOut { vm, migration_res } = self + let MigrationThreadOut { + mut vm, + migration_res, + } = self .migration_thread_handle .take() .expect("should have thread") @@ -1952,6 +1955,28 @@ impl Vmm { Err(e) => { error!("Migration failed: {e}"); + // If the failure happened very late in the migration path, the VM might already be + // stopped. We resume it to ensure proper operation. + // + // Cloud Hypervisor only supports migration of running VMs, therefore it cannot + // happen that we resume a previously paused VM. + if vm.get_state() == VmState::Paused { + match vm.resume() { + Ok(_) => { + info!("Resumed VM successfully after failed migration"); + + // Ensure full VM performance. The operation is idempotent. + let _ = vm.stop_dirty_log().inspect_err(|e| { + warn!("Failed stopping dirty log after resuming VM: {e} - VM performance might be slower than usual"); + }); + } + Err(e) => { + error!("Failed resuming VM after failed migration: {e}"); + self.exit_evt.write(1).unwrap(); + } + } + } + // Give VMM back control. self.vm = MaybeVmOwnership::Vmm(vm); From 27ba41b55651244078b9ff2fd136141d6738bdeb Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Feb 2026 17:35:32 +0100 Subject: [PATCH 058/178] vmm: migration: limit to running VMs only Cloud Hypervisor only supports migration of running VMs. There are too many implicit assumptions in the code to fix them easily. Further, with our current knowledge, this restriction is perfectly feasible. 
This check makes this failure case more explicit instead of producing deeply nested errors. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index ef07dc421a..81f36ec31f 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2994,6 +2994,14 @@ impl RequestHandler for Vmm { ))); } + // Cloud Hypervisor only supports the migration of running VMs. + let current_state = self.vm.vm_mut().as_ref().unwrap().get_state(); + if current_state != VmState::Running { + return Err(MigratableError::MigrateSend(anyhow!(format!( + "Only running VMs can be migrated! state={current_state:?}" + )))); + } + // Take VM ownership. This also means that API events can no longer // change the VM (e.g. net device hotplug). let vm = self.vm.take_vm_for_migration(); From 04dd58fe1c67be484791b79011a8ce2497ebd20f Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 13 Jan 2026 11:26:55 +0100 Subject: [PATCH 059/178] vm-migration: prepare progress types for new API endpoint This is the first commit in a series of commits to introduce a new API endpoint in Cloud Hypervisor to report progress and live insights about an ongoing live migration. Having live and frequently refreshing statistics/metrics about an ongoing live migration is especially interesting for debugging and monitoring, such as checking the actual network throughput. With the proposed changes, for the first time, we will be able to see how live migrations behave and create benchmarking infrastructure around them. The ch driver in libvirt will use this information to populate its `virsh domjobinfo` information. We will add a new API endpoint to query information for ongoing live migrations. The new endpoint will also serve to query information about any previously failed or canceled migrations. 
The SendMigration call will no longer be blocking (wait until the migration is done) but instead just dispatch the migration. This streamlines the behavior with QEMU and simplifies management software. When one queries the endpoint, a frequently refreshed snapshot of the migration statistics and progress will be returned. The data will not be assembled on the fly. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/lib.rs | 1 + vm-migration/src/progress.rs | 564 +++++++++++++++++++++++++++++++++++ 2 files changed, 565 insertions(+) create mode 100644 vm-migration/src/progress.rs diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 0613818c6a..c5c0ce3c3f 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -15,6 +15,7 @@ use crate::protocol::MemoryRangeTable; mod bitpos_iterator; mod context; +pub mod progress; pub mod protocol; pub mod tls; diff --git a/vm-migration/src/progress.rs b/vm-migration/src/progress.rs new file mode 100644 index 0000000000..8a5083068d --- /dev/null +++ b/vm-migration/src/progress.rs @@ -0,0 +1,564 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! Module for reporting status and progress of live migrations. +//! +//! The main export is [`MigrationProgress`]. +//! +//! # Motivation +//! +//! Monitoring a live-migration is important for debugging of cloud deployments, +//! for cloud monitoring in general, and for network optimization, such as +//! verifying the throughput for the migration is as high as expected. +//! +//! It also helps to analyze the downtime of VMs and see how much pressure a +//! guest is putting on its memory (by writing), which is slowing down +//! migrations. 
+ +use std::error::Error; +use std::fmt; +use std::fmt::Display; +use std::num::NonZeroU32; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub enum TransportationMode { + Local, + Tcp { connections: NonZeroU32, tls: bool }, +} + +/// Carries information about the transmission of the VM's memory. +#[derive( + Clone, + Copy, + Debug, + Default, + PartialOrd, + Ord, + PartialEq, + Eq, + Hash, + serde::Serialize, + serde::Deserialize, +)] +pub struct MemoryTransmissionInfo { + /// The memory iteration (only in precopy mode). + pub memory_iteration: u64, + /// Memory bytes per second. + pub memory_transmission_bps: u64, + /// The total size of the VM's memory in bytes. + pub memory_bytes_total: u64, + /// The total number of transmitted bytes. + pub memory_bytes_transmitted: u64, + /// The number of remaining bytes for this iteration. + pub memory_bytes_remaining_iteration: u64, + /// The number of transmitted 4k pages. + pub memory_pages_4k_transmitted: u64, + /// The number of remaining 4k pages for this iteration. + pub memory_pages_4k_remaining_iteration: u64, + /// The number of constant pages for which we could take a shortcut. + /// Pages where all bits are either zero or one. + pub memory_pages_constant_count: u64, + /// Current memory dirty rate in pages per second (pps). + pub memory_dirty_rate_pps: u64, +} + +/// The different phases of an ongoing ([`MigrationState::Ongoing`]) migration +/// (good case). +/// +/// The states correspond to the [live-migration protocol]. +/// +/// [live-migration protocol]: super::protocol +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub enum MigrationStateOngoingPhase { + /// The migration starts. Handshake and transfer of VM config. + Starting, + /// Transfer of memory FDs. + /// + /// Only used for local migrations. 
+ MemoryFds, + /// Transfer of VM memory in precopy mode. + /// + /// Not used for local migrations. + MemoryPrecopy, + // TODO eventually add MemoryPostcopy here + /// The VM migration is completing. This means the last chunks of memory + /// are transmitted as well as the final VM state (vCPUs, devices). + Completing, +} + +impl Display for MigrationStateOngoingPhase { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Starting => write!(f, "starting"), + Self::MemoryFds => write!(f, "memory FDs"), + Self::MemoryPrecopy => write!(f, "memory (precopy)"), + Self::Completing => write!(f, "completing"), + } + } +} + +/// The different states of a migration, covering steady progress and failure. +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub enum MigrationState { + /// The migration has been cancelled. + Cancelled {}, + /// The migration has failed. + Failed { + /// Stringified error. + error_msg: String, + /// Debug-stringified error. + error_msg_debug: String, + // TODO this is very tricky because I need clone() + // error: Box, + }, + /// The migration has finished successfully. + Finished {}, + /// The migration is ongoing. + Ongoing { + phase: MigrationStateOngoingPhase, + /// Percent in range `0..=100`. + vcpu_throttle_percent: u8, + }, +} + +impl Display for MigrationState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + MigrationState::Cancelled { .. } => write!(f, "{}", self.state_name()), + MigrationState::Failed { error_msg, .. } => { + write!(f, "{}: {error_msg}", self.state_name()) + } + MigrationState::Finished { .. 
} => write!(f, "{}", self.state_name()), + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => write!( + f, + "{}: phase={phase}, vcpu_throttle={vcpu_throttle_percent}", + self.state_name() + ), + } + } +} + +impl MigrationState { + fn state_name(&self) -> &'static str { + match self { + MigrationState::Cancelled { .. } => "cancelled", + MigrationState::Failed { .. } => "failed", + MigrationState::Finished { .. } => "finished", + MigrationState::Ongoing { .. } => "ongoing", + } + } +} + +/// Returns the current UNIX timestamp in ms. +fn current_unix_timestamp_ms() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("should be valid duration") + .as_millis() as u64 +} + +/// Holds a snapshot of progress and status information for an ongoing live +/// migration, or the last snapshot of a canceled or aborted migration. +/// +/// This type carries insightful information for every step of the +/// [live-migration protocol] in a way that makes it easy for API users to +/// parse the data with ease while retaining all important information. +/// +/// [live-migration protocol]: super::protocol +#[derive( + Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, +)] +pub struct MigrationProgress { + /// UNIX timestamp of the start of the live-migration process in ms. + pub timestamp_begin_ms: u64, + /// UNIX timestamp of the current snapshot in ms. + pub timestamp_snapshot_ms: u64, + /// Relative timestamp since the beginning of the migration in ms. + pub timestamp_snapshot_relative_ms: u64, + /// Configured target downtime. + pub downtime_configured_ms: u64, + /// Currently estimated (computed) downtime given the remaining + /// transmissions and the bandwidth. + /// + /// If this is `0`, the downtime could not yet be calculated. + pub downtime_estimated_ms: u64, + /// Requested transportation mode. + pub transportation_mode: TransportationMode, + /// Snapshot of the current phase. 
+ pub state: MigrationState, + /// Latest [`MemoryTransmissionInfo`] info, if any. + /// + /// The most interesting phase is when current state is + /// [`MigrationState::Ongoing`] and [`MigrationStateOngoingPhase::MemoryPrecopy`] + /// as this value will be updated frequently. + pub memory_transmission_info: MemoryTransmissionInfo, +} + +impl MigrationProgress { + /// Creates new progress in a valid init state. + /// + /// This progress must be updated using any of: + /// - [`Self::update`] + /// - [`Self::mark_as_finished`] + /// - [`Self::mark_as_failed`] + /// - [`Self::mark_as_cancelled`] + pub fn new(transportation_mode: TransportationMode, target_downtime: Duration) -> Self { + let timestamp = current_unix_timestamp_ms(); + Self { + timestamp_begin_ms: timestamp, + timestamp_snapshot_ms: timestamp, + timestamp_snapshot_relative_ms: 0, + downtime_configured_ms: target_downtime.as_millis() as u64, + downtime_estimated_ms: 0, + transportation_mode, + state: MigrationState::Ongoing { + phase: MigrationStateOngoingPhase::Starting, + vcpu_throttle_percent: 0, + }, + memory_transmission_info: MemoryTransmissionInfo::default(), + } + } + + /// Updates the state of an ongoing migration. + /// + /// Only updates new values that are provided via `Some`. + /// + /// # Arguments + /// + /// - `new_phase`: The current [`MigrationStateOngoingPhase`]. + /// - `new_memory_transmission_info`: If `Some`, the current [`MemoryTransmissionInfo`]. + /// - `new_cpu_throttle_percent`: If `Some`, the current value of the vCPU throttle percentage. + /// Must be in range `0..=100`. + /// - `new_estimated_downtime`: If `Some`, the latest expected (calculated) downtime. 
+ pub fn update( + &mut self, + new_phase: MigrationStateOngoingPhase, + new_memory_transmission_info: Option<MemoryTransmissionInfo>, + new_cpu_throttle_percent: Option<u8>, + new_estimated_downtime: Option<Duration>, + ) { + if let Some(percent) = new_cpu_throttle_percent { + assert!(percent <= 100); + } + + if let Some(downtime) = new_estimated_downtime { + self.downtime_estimated_ms = u64::try_from(downtime.as_millis()).unwrap(); + } else { + // This is better than showing `0` and it is likely close to the final actual downtime. + self.downtime_estimated_ms = self.downtime_configured_ms; + } + + match &self.state { + MigrationState::Ongoing { + phase: _old_phase, + vcpu_throttle_percent: old_vcpu_throttle_percent, + } => { + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = + self.timestamp_snapshot_ms.saturating_sub(self.timestamp_begin_ms); + + self.memory_transmission_info = + new_memory_transmission_info.unwrap_or(self.memory_transmission_info); + self.state = MigrationState::Ongoing { + phase: new_phase, + vcpu_throttle_percent: new_cpu_throttle_percent + .unwrap_or(*old_vcpu_throttle_percent), + }; + } + illegal => { + // panic is fine as we have a logic error here, nothing that was caused by a user. + panic!( + "illegal state transition: {} -> ongoing", + illegal.state_name(), + ); + } + } + } + + /// Sets the underlying state to [`MigrationState::Cancelled`] and + /// updates all corresponding metadata. + /// + /// After this state change, the object is supposed to be handled as immutable. + /// + /// # Panics + /// + /// If the current state is not [`MigrationState::Ongoing`], this function panics. + pub fn mark_as_cancelled(&mut self) { + if !matches!(self.state, MigrationState::Ongoing { .. 
}) { + panic!( + "illegal state transition: {} -> cancelled", + self.state.state_name() + ); + } + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = self.timestamp_snapshot_ms.saturating_sub(self.timestamp_begin_ms); + self.state = MigrationState::Cancelled {}; + } + + /// Sets the underlying state to [`MigrationState::Failed`] and + /// updates all corresponding metadata. + /// + /// After this state change, the object is supposed to be handled as immutable. + /// + /// # Panics + /// + /// If the current state is not [`MigrationState::Ongoing`], this function panics. + pub fn mark_as_failed(&mut self, error: &dyn Error) { + if !matches!(self.state, MigrationState::Ongoing { .. }) { + panic!( + "illegal state transition: {} -> failed", + self.state.state_name() + ); + } + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = self.timestamp_snapshot_ms.saturating_sub(self.timestamp_begin_ms); + self.state = MigrationState::Failed { + error_msg: error.to_string(), + error_msg_debug: format!("{error:?}"), + }; + } + + /// Sets the underlying state to [`MigrationState::Finished`] and + /// updates all corresponding metadata. + /// + /// After this state change, the object is supposed to be handled as immutable. + /// + /// # Panics + /// + /// If the current state is not [`MigrationState::Ongoing`], this function panics. + pub fn mark_as_finished(&mut self) { + if !matches!(self.state, MigrationState::Ongoing { .. 
}) { + panic!( + "illegal state transition: {} -> finished", + self.state.state_name() + ); + } + self.timestamp_snapshot_ms = current_unix_timestamp_ms(); + self.timestamp_snapshot_relative_ms = self.timestamp_snapshot_ms.saturating_sub(self.timestamp_begin_ms); + self.state = MigrationState::Finished {}; + } +} + +#[cfg(test)] +mod unit_tests { + use std::thread; + + use super::*; + + fn tcp_mode() -> TransportationMode { + TransportationMode::Tcp { + connections: NonZeroU32::new(2).unwrap(), + tls: true, + } + } + + #[test] + fn new_initializes_valid_state() { + let target = Duration::from_millis(150); + let progress = MigrationProgress::new(tcp_mode(), target); + + assert_eq!(progress.timestamp_snapshot_ms, progress.timestamp_begin_ms); + assert_eq!(progress.timestamp_snapshot_relative_ms, 0); + assert_eq!(progress.downtime_configured_ms, 150); + assert_eq!(progress.downtime_estimated_ms, 0); + + match progress.state { + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => { + assert_eq!(phase, MigrationStateOngoingPhase::Starting); + assert_eq!(vcpu_throttle_percent, 0); + } + _ => panic!("expected Ongoing state"), + } + + assert_eq!( + progress.memory_transmission_info, + MemoryTransmissionInfo::default() + ); + } + + #[test] + fn update_changes_phase_and_preserves_previous_values() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(200)); + + let initial_timestamp = progress.timestamp_snapshot_ms; + + thread::sleep(Duration::from_millis(1)); + + progress.update(MigrationStateOngoingPhase::MemoryPrecopy, None, None, None); + + match progress.state { + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => { + assert_eq!(phase, MigrationStateOngoingPhase::MemoryPrecopy); + assert_eq!(vcpu_throttle_percent, 0); // unchanged + } + _ => panic!("expected Ongoing"), + } + + assert!(progress.timestamp_snapshot_ms >= initial_timestamp); + assert!(progress.timestamp_snapshot_relative_ms > 0); + + // If no 
estimated downtime provided, fallback to configured value + assert_eq!( + progress.downtime_estimated_ms, + progress.downtime_configured_ms + ); + } + + #[test] + fn update_replaces_memory_info_and_throttle() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(100)); + + let mem = MemoryTransmissionInfo { + memory_iteration: 3, + memory_transmission_bps: 10_000, + memory_bytes_total: 1_000_000, + memory_bytes_transmitted: 400_000, + memory_bytes_remaining_iteration: 100_000, + memory_pages_4k_transmitted: 100, + memory_pages_4k_remaining_iteration: 25, + memory_pages_constant_count: 10, + memory_dirty_rate_pps: 500, + }; + + progress.update( + MigrationStateOngoingPhase::MemoryPrecopy, + Some(mem), + Some(42), + Some(Duration::from_millis(55)), + ); + + assert_eq!(progress.memory_transmission_info, mem); + assert_eq!(progress.downtime_estimated_ms, 55); + + match progress.state { + MigrationState::Ongoing { + phase, + vcpu_throttle_percent, + } => { + assert_eq!(phase, MigrationStateOngoingPhase::MemoryPrecopy); + assert_eq!(vcpu_throttle_percent, 42); + } + _ => panic!("expected Ongoing"), + } + } + + #[test] + #[should_panic] + fn update_panics_if_not_ongoing() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + progress.mark_as_finished(); + + progress.update(MigrationStateOngoingPhase::Completing, None, None, None); + } + + #[test] + #[should_panic] + fn throttle_above_100_panics() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.update( + MigrationStateOngoingPhase::MemoryPrecopy, + None, + Some(101), + None, + ); + } + + #[test] + fn mark_as_finished_transitions_state() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + thread::sleep(Duration::from_millis(1)); + progress.mark_as_finished(); + + match progress.state { + 
MigrationState::Finished {} => {} + _ => panic!("expected Finished"), + } + + assert!(progress.timestamp_snapshot_relative_ms > 0); + } + + #[test] + #[should_panic] + fn mark_as_finished_twice_panics() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.mark_as_finished(); + progress.mark_as_finished(); + } + + #[test] + fn mark_as_failed_sets_error_strings() { + #[derive(Debug)] + struct TestError; + + impl fmt::Display for TestError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "test error") + } + } + + impl Error for TestError {} + + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.mark_as_failed(&TestError); + + match &progress.state { + MigrationState::Failed { + error_msg, + error_msg_debug, + } => { + assert_eq!(error_msg, "test error"); + assert!(error_msg_debug.contains("TestError")); + } + _ => panic!("expected Failed"), + } + } + + #[test] + fn display_formats_are_stable() { + let mut progress = + MigrationProgress::new(TransportationMode::Local, Duration::from_millis(10)); + + progress.update( + MigrationStateOngoingPhase::MemoryPrecopy, + None, + Some(12), + None, + ); + + let s = format!("{}", progress.state); + assert!(s.contains("ongoing")); + assert!(s.contains("phase=memory (precopy)")); + assert!(s.contains("vcpu_throttle=12")); + + progress.mark_as_cancelled(); + assert_eq!(format!("{}", progress.state), "cancelled"); + } +} From e27612d9a304d72009cb27d0f9e2f8a3356de61e Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 13 Jan 2026 11:27:01 +0100 Subject: [PATCH 060/178] vmm: add migration-progress API endpoint This is part of the commit series to enable live updates about an ongoing live migration. See the first commit for an introduction. We decided to use an Option<> rather than a Result<> as there isn't really an error that can happen when we query this endpoint. 
A previous snapshot may either be there or not. It also doesn't make sense here to check if the current VM is running, as users should always be able to query information about the past (failed or canceled) live migration. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- fuzz/Cargo.lock | 1 + fuzz/fuzz_targets/http_api.rs | 5 ++++ vmm/src/api/mod.rs | 51 +++++++++++++++++++++++++++++++++++ vmm/src/lib.rs | 5 ++++ 4 files changed, 62 insertions(+) diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 5e168d1560..dcc82409e8 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1600,6 +1600,7 @@ version = "0.1.0" dependencies = [ "arch", "libc", + "thiserror", "vm-memory", ] diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 34d24de77a..6e1c15feae 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -11,6 +11,7 @@ use std::thread; use libfuzzer_sys::{fuzz_target, Corpus}; use micro_http::Request; +use vm_migration::progress::MigrationProgress; use vm_migration::MigratableError; use vmm::api::http::*; use vmm::api::{ @@ -303,6 +304,10 @@ impl RequestHandler for StubApiRequestHandler { fn vm_nmi(&mut self) -> Result<(), VmError> { Ok(()) } + + fn vm_migration_progress(&mut self) -> Option { + None + } } fn http_receiver_stub(exit_evt: EventFd, api_evt: EventFd, api_receiver: Receiver) { diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index f994d73515..02edb6ebc9 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -46,6 +46,7 @@ use option_parser::{OptionParser, OptionParserError, Toggle}; use serde::{Deserialize, Serialize}; use thiserror::Error; use vm_migration::MigratableError; +use vm_migration::progress::MigrationProgress; use vmm_sys_util::eventfd::EventFd; #[cfg(feature = "dbus_api")] @@ -211,6 +212,10 @@ pub enum ApiError { /// Error triggering NMI #[error("Error triggering NMI")] VmNmi(#[source] VmError), + + /// Error fetching the migration progress + 
#[error("Error fetching the migration progress")] + VmMigrationProgress(#[source] VmError), } pub type ApiResult = Result; @@ -677,6 +682,9 @@ pub enum ApiResponsePayload { /// Virtual machine information VmInfo(VmInfoResponse), + /// The progress of a possibly ongoing live migration. + VmMigrationProgress(Box>), + /// Vmm ping response VmmPing(VmmPingResponse), @@ -767,6 +775,10 @@ pub trait RequestHandler { ) -> Result<(), MigratableError>; fn vm_nmi(&mut self) -> Result<(), VmError>; + + /// Returns the progress of the currently active migration or any previous + /// failed or canceled migration. + fn vm_migration_progress(&mut self) -> Option; } /// It would be nice if we could pass around an object like this: @@ -1937,6 +1949,45 @@ impl ApiAction for VmNmi { } } +pub struct VmMigrationProgress; + +impl ApiAction for VmMigrationProgress { + type RequestBody = (); + type ResponseBody = Box>; + + fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmMigrationProgress"); + + let snapshot = Ok(vmm.vm_migration_progress()); + let response = snapshot + .map(Box::new) + .map(ApiResponsePayload::VmMigrationProgress) + .map_err(ApiError::VmMigrationProgress); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + let info = get_response(self, api_evt, api_sender, data)?; + + match info { + ApiResponsePayload::VmMigrationProgress(info) => Ok(info), + _ => Err(ApiError::ResponsePayloadType), + } + } +} + #[cfg(test)] mod unit_tests { use super::*; diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 81f36ec31f..e24b8d757f 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -47,6 +47,7 @@ use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use vm_memory::GuestMemoryAtomic; use vm_memory::bitmap::AtomicBitmap; +use 
vm_migration::progress::MigrationProgress; use vm_migration::protocol::*; use vm_migration::{ MemoryMigrationContext, Migratable, MigratableError, OngoingMigrationContext, Pausable, @@ -3032,6 +3033,10 @@ impl RequestHandler for Vmm { ); Ok(()) } + + fn vm_migration_progress(&mut self) -> Option { + None + } } const CPU_MANAGER_SNAPSHOT_ID: &str = "cpu-manager"; From 2dd9618cfaa88f12cfd7cf6a6f52e070c18f4ea9 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 12 Jan 2026 17:39:49 +0100 Subject: [PATCH 061/178] vmm: add migration-progress HTTP endpoint This is part of the commit series to enable live updates about an ongoing live migration. See the first commit for an introduction. In this commit, we add the HTTP endpoint to export ongoing VM live-migration progress. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/api/http/http_endpoint.rs | 32 ++++++++++++++++++++++++++++--- vmm/src/api/http/mod.rs | 11 ++++++++--- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index abea2ba51a..93801168fc 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -59,9 +59,9 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, - VmConfig, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, - VmReceiveMigrationData, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, - VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmConfig, VmCounters, VmDelete, VmMigrationProgress, VmNmi, VmPause, VmPowerButton, VmReboot, + VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, VmResizeDisk, + VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use 
crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -710,6 +710,32 @@ impl EndpointHandler for VmmShutdown { } } +impl EndpointHandler for VmMigrationProgress { + fn handle_request( + &self, + req: &Request, + api_notifier: EventFd, + api_sender: Sender, + ) -> Response { + match req.method() { + Method::Get => match crate::api::VmMigrationProgress + .send(api_notifier, api_sender, ()) + .map_err(HttpError::ApiError) + { + Ok(info) => { + let mut response = Response::new(Version::Http11, StatusCode::OK); + let info_serialized = serde_json::to_string(&info).unwrap(); + + response.set_body(Body::new(info_serialized)); + response + } + Err(e) => error_response(e, StatusCode::InternalServerError), + }, + _ => error_response(HttpError::BadRequest, StatusCode::BadRequest), + } + } +} + #[cfg(test)] mod external_fds_tests { use super::*; diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index f7cea4fafa..1f7a1e7cf7 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -29,9 +29,10 @@ use self::http_endpoint::{VmActionHandler, VmCreate, VmInfo, VmmPing, VmmShutdow use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, - VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, - VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, + VmMigrationProgress, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, + VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, + VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -281,6 +282,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.shutdown"), 
Box::new(VmActionHandler::new(&VmShutdown)), ); + r.routes.insert( + endpoint!("/vm.migration-progress"), + Box::new(VmMigrationProgress {}), + ); r.routes.insert( endpoint!("/vm.snapshot"), Box::new(VmActionHandler::new(&VmSnapshot)), From fe5387d44fa578d3bcd99d25f098dcdfb8c597f8 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 22 Jan 2026 13:13:33 +0100 Subject: [PATCH 062/178] vmm: actually populate migration progress This is part of the commit series to enable live updates about an ongoing live migration. See the first commit for an introduction. This commit actually brings all the functionality together. The first version has the limitation that we populate the latest snapshot once per memory iteration, although this is the most interesting part by far. In a follow-up, we can make this more fine-grained. We guarantee that as soon as SendMigration returns, migration progress can be fetched as the underlying data source is populated. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/context.rs | 4 +- vmm/src/lib.rs | 138 +++++++++++++++++++++++++++++++----- 2 files changed, 123 insertions(+), 19 deletions(-) diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs index 21801c0290..8e4d28c4f0 100644 --- a/vm-migration/src/context.rs +++ b/vm-migration/src/context.rs @@ -225,13 +225,13 @@ pub struct MemoryMigrationContext { /// Current iteration: 0 initial total transmission, >0 delta transmission. pub iteration: usize, /// Total bytes sent across all iterations. - total_sent_bytes: u64, + pub total_sent_bytes: u64, /// Total bytes to send in the current iteration. pub current_iteration_total_bytes: u64, /// The currently measured bandwidth. /// /// This is updated (at least) after each completed iteration. - bandwidth_bytes_per_second: f64, + pub bandwidth_bytes_per_second: f64, /// Calculated downtime in milliseconds regarding the current bandwidth and /// the remaining memory. 
/// diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index e24b8d757f..10fba149f4 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -31,6 +31,7 @@ use anyhow::{Context, anyhow}; #[cfg(feature = "dbus_api")] use api::dbus::{DBusApiOptions, DBusApiShutdownChannels}; use api::http::HttpApiHandle; +use arch::PAGE_SIZE; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY; use console_devices::{ConsoleInfo, pre_create_console_devices}; @@ -47,7 +48,10 @@ use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use vm_memory::GuestMemoryAtomic; use vm_memory::bitmap::AtomicBitmap; -use vm_migration::progress::MigrationProgress; +use vm_migration::progress::{ + MemoryTransmissionInfo, MigrationProgress, MigrationState, MigrationStateOngoingPhase, + TransportationMode, +}; use vm_migration::protocol::*; use vm_migration::{ MemoryMigrationContext, Migratable, MigratableError, OngoingMigrationContext, Pausable, @@ -298,6 +302,9 @@ impl From for EpollDispatch { } } +// TODO make this a member of Vmm? 
+static MIGRATION_PROGRESS_SNAPSHOT: Mutex<Option<MigrationProgress>> = Mutex::new(None); + pub struct EpollContext { epoll_file: File, } @@ -1426,6 +1433,36 @@ impl Vmm { is_converged: impl Fn(&MemoryMigrationContext) -> result::Result, mem_send: &mut SendAdditionalConnections, ) -> result::Result { + let update_migration_progress = |s: &mut MemoryMigrationContext, vm: &Vm| { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .update( + MigrationStateOngoingPhase::MemoryPrecopy, + Some(MemoryTransmissionInfo { + memory_iteration: s.iteration as u64, + memory_transmission_bps: s.bandwidth_bytes_per_second as u64, + memory_bytes_total: s.current_iteration_total_bytes, /* TODO: iteration size, not total VM memory */ + memory_bytes_transmitted: s.total_sent_bytes, + memory_pages_4k_transmitted: s.total_sent_bytes.div_ceil(PAGE_SIZE as u64), + memory_pages_4k_remaining_iteration: s + .current_iteration_total_bytes + .div_ceil(PAGE_SIZE as u64), + memory_bytes_remaining_iteration: s.current_iteration_total_bytes, + memory_dirty_rate_pps: { + let pages = s.current_iteration_total_bytes.div_ceil(PAGE_SIZE as u64); + s.iteration_duration + .filter(|d| !d.is_zero()) + .map(|d| (pages as f64 / d.as_secs_f64()).ceil()) + .map_or(0, |dirty_rate| dirty_rate as u64) + }, + memory_pages_constant_count: 0, /* TODO */ + }), + Some(vm.throttle_percent()), + s.estimated_downtime, + ); + }; + loop { // todo: check if auto-converge is enabled at all? if Self::can_increase_autoconverge_step(ctx) @@ -1450,11 +1487,16 @@ impl Vmm { }; ctx.update_metrics_before_transfer(iteration_begin, &iteration_table); + // Update before we might exit the loop. + update_migration_progress(ctx, vm); if is_converged(ctx)? { info!("Precopy converged: {ctx}"); break Ok(iteration_table); } + // Update with new metrics before transmission. 
+ update_migration_progress(ctx, vm); + // Send the current dirty pages let transfer_begin = Instant::now(); mem_send.send_memory(iteration_table, socket)?; @@ -1687,6 +1729,11 @@ impl Vmm { if send_data_migration.local { match &mut socket { SocketStream::Unix(unix_socket) => { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .update(MigrationStateOngoingPhase::MemoryFds, None, None, None); + // Proceed with sending memory file descriptors over UNIX socket vm.send_memory_fds(unix_socket)?; } @@ -1747,6 +1794,14 @@ impl Vmm { mem_send.cleanup()?; } + // Update migration progress snapshot + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .update(MigrationStateOngoingPhase::Completing, None, None, None); + } + // We release the locks early to enable locking them on the destination host. // The VM is already stopped. vm.release_disk_locks() @@ -1794,6 +1849,14 @@ impl Vmm { vm.stop_dirty_log()?; } + // Update migration progress snapshot + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_finished(); + } + // Let every Migratable object know about the migration being complete vm.complete_migration() } @@ -1981,6 +2044,14 @@ impl Vmm { // Give VMM back control. self.vm = MaybeVmOwnership::Vmm(vm); + // Update migration progress snapshot + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_failed(&e); + } + { info!("Sending Receiver in HTTP thread that migration failed"); let (sender, _) = &*ONGOING_LIVEMIGRATION; @@ -3014,28 +3085,61 @@ impl RequestHandler for Vmm { ))); } + // Update migration progress snapshot early: + // We guarantee that migration statistics can be fetched as soon as SendMigration returns. 
+ // + // If the migration fails, the state will later be updated accordingly. + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + if lock + .as_ref() + .map(|p| &p.state) + .is_some_and(|snapshot| matches!(snapshot, MigrationState::Ongoing { .. })) + { + // If this panic triggers, we made a programming error in our state handling. + panic!("migration already ongoing"); + } + let transportation_mode = if send_data_migration.local { + TransportationMode::Local + } else { + TransportationMode::Tcp { + connections: send_data_migration.connections, + tls: send_data_migration.tls_dir.is_some(), + } + }; + lock.replace(MigrationProgress::new( + transportation_mode, + send_data_migration.downtime(), + )); + } + // Start migration thread - let worker = MigrationWorker { - vm, - check_migration_evt: self.check_migration_evt.try_clone().unwrap(), - config: send_data_migration, - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - hypervisor: self.hypervisor.clone(), - }; + { + let worker = MigrationWorker { + vm, + check_migration_evt: self.check_migration_evt.try_clone().unwrap(), + config: send_data_migration, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: self.hypervisor.clone(), + }; - self.migration_thread_handle = Some( - thread::Builder::new() - .name("migration".into()) - .spawn(move || worker.run()) - // For upstreaming, we should simply continue and return an - // error when this fails. For our PoC, this is fine. - .unwrap(), - ); + self.migration_thread_handle = Some( + thread::Builder::new() + .name("migration".into()) + .spawn(move || worker.run()) + // For upstreaming, we should simply continue and return an + // error when this fails. For our PoC, this is fine. + .unwrap(), + ); + } Ok(()) } fn vm_migration_progress(&mut self) -> Option { - None + // We explicitly do not check here for `is VM running?` to always + // enable querying the state of the last failed migration. 
+ let lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.clone() } } From d9a820606a888f1fd8768ab8314ea6eccdf1103c Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Feb 2026 09:44:46 +0100 Subject: [PATCH 063/178] ch-remote: add `migration-progress` command On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/src/bin/ch-remote.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index 3fd399a77f..44573a652d 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -317,6 +317,8 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu Some("shutdown") => { simple_api_command(socket, "PUT", "shutdown", None).map_err(Error::HttpApiClient) } + Some("migration-progress") => simple_api_command(socket, "GET", "migration-progress", None) + .map_err(Error::HttpApiClient), Some("nmi") => simple_api_command(socket, "PUT", "nmi", None).map_err(Error::HttpApiClient), Some("resize") => { let resize = resize_config( @@ -1059,6 +1061,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg(Arg::new("path").index(1).default_value("-")), Command::new("delete").about("Delete a VM"), Command::new("info").about("Info on the VM"), + Command::new("migration-progress"), Command::new("nmi").about("Trigger NMI"), Command::new("pause").about("Pause the VM"), Command::new("ping").about("Ping the VMM to check for API server availability"), From cf187423374a7c6a911463f1d8fa9743ee37ceb0 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Feb 2026 09:44:49 +0100 Subject: [PATCH 064/178] vmm: migration: switch to non-blocking SendMigration call Time has proven that the previous design was not optimal. Now, the SendMigration call is not blocking for the duration of the migration. Instead, it just triggers the migration. 
Using the new MigrationProgress endpoint, management software can query the state of the migration and also find information for failed migrations. A new `keep_alive` parameter for SendMigration will keep the VMM alive and usable after the migration to ensure management software can fetch the final state. The management software is then supposed to send a ShutdownVmm command. With this, we are finally able to query the migration progress API endpoint during an ongoing live migration. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/api/http/http_endpoint.rs | 42 ++++++++----------------------- vmm/src/api/mod.rs | 15 +++++++++-- vmm/src/lib.rs | 39 +++++++++++----------------- 3 files changed, 38 insertions(+), 58 deletions(-) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 93801168fc..07c2941256 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -35,23 +35,12 @@ //! [special HTTP library]: https://github.com/firecracker-microvm/micro-http use std::fs::File; -use std::sync::mpsc::{Receiver, Sender, SyncSender}; -use std::sync::{LazyLock, Mutex}; +use std::sync::mpsc::Sender; use log::info; use micro_http::{Body, Method, Request, Response, StatusCode, Version}; use vmm_sys_util::eventfd::EventFd; -/// Helper to make the VmSendMigration call blocking as long as a migration is ongoing. 
-#[allow(clippy::type_complexity)] -pub static ONGOING_LIVEMIGRATION: LazyLock<( - SyncSender>, - Mutex>>, -)> = LazyLock::new(|| { - let (sender, receiver) = std::sync::mpsc::sync_channel(0); - (sender, Mutex::new(receiver)) -}); - #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::http::http_endpoint::fds_helper::{attach_fds_to_cfg, attach_fds_to_cfgs}; @@ -509,26 +498,15 @@ impl PutHandler for VmSendMigration { _files: Vec, ) -> std::result::Result, HttpError> { if let Some(body) = body { - let res = self - .send( - api_notifier, - api_sender, - serde_json::from_slice(body.raw())?, - ) - .map_err(HttpError::ApiError)?; - - info!("live migration started"); - - let (_, receiver) = &*ONGOING_LIVEMIGRATION; - - info!("waiting for live migration result"); - let mig_res = receiver.lock().unwrap().recv().unwrap(); - info!("received live migration result"); - - // We forward the migration error here to the guest - mig_res - .map(|_| res) - .map_err(|e| HttpError::ApiError(ApiError::VmSendMigration(e))) + self.send( + api_notifier, + api_sender, + serde_json::from_slice(body.raw())?, + ) + .inspect(|_| { + info!("live migration started (in background)"); + }) + .map_err(HttpError::ApiError) } else { Err(HttpError::BadRequest) } diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 02edb6ebc9..3a1c8a4eae 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -494,6 +494,8 @@ pub struct VmSendMigrationData { /// Path to the directory containing the TLS root CA certificate (ca-cert.pem), the TLS client certificate (client-cert.pem), and TLS client key (client-key.pem). #[serde(default)] pub tls_dir: Option, + /// Keep the VMM alive. 
+ pub keep_alive: bool, } impl VmSendMigrationData { @@ -530,7 +532,8 @@ impl VmSendMigrationData { .add("timeout_s") .add("timeout_strategy") .add("connections") - .add("tls_dir"); + .add("tls_dir") + .add("keep_alive"); parser .parse(migration) .map_err(VmSendMigrationConfigError::ParseError)?; @@ -586,6 +589,11 @@ impl VmSendMigrationData { .convert::("tls_dir") .map_err(VmSendMigrationConfigError::ParseError)? .map(|path| PathBuf::from(&path)); + let keep_alive = parser + .convert::("keep_alive") + .map_err(VmSendMigrationConfigError::ParseError)? + .unwrap_or(Toggle(false)) + .0; let data = Self { destination_url, @@ -595,6 +603,7 @@ impl VmSendMigrationData { timeout_strategy, connections, tls_dir, + keep_alive, }; data.validate()?; @@ -2180,13 +2189,14 @@ mod unit_tests { timeout_strategy: Default::default(), connections: VmSendMigrationData::default_connections(), tls_dir: None, + keep_alive: false, } ); // Happy path, fully specified let tls_dir = std::env::temp_dir(); let data = - VmSendMigrationData::parse(&format!("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4,tls_dir={}", tls_dir.display())) + VmSendMigrationData::parse(&format!("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4,tls_dir={},keep_alive=true", tls_dir.display())) .unwrap(); assert_eq!( data, @@ -2198,6 +2208,7 @@ mod unit_tests { timeout_strategy: TimeoutStrategy::Ignore, connections: NonZeroU32::new(4).unwrap(), tls_dir: Some(tls_dir), + keep_alive: true } ); } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 10fba149f4..901c2772d9 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -61,7 +61,6 @@ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; -use crate::api::http::http_endpoint::ONGOING_LIVEMIGRATION; use crate::api::{ ApiRequest, ApiResponse, RequestHandler, TimeoutStrategy, VmInfoResponse, 
VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, @@ -696,6 +695,7 @@ impl MigrationWorker { MigrationThreadOut { vm: self.vm, migration_res: res, + migration_cfg: self.config, } } } @@ -746,6 +746,7 @@ impl MaybeVmOwnership { struct MigrationThreadOut { vm: Vm, migration_res: result::Result<(), MigratableError>, + migration_cfg: VmSendMigrationData, } pub struct Vmm { @@ -1849,14 +1850,6 @@ impl Vmm { vm.stop_dirty_log()?; } - // Update migration progress snapshot - { - let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); - lock.as_mut() - .expect("live migration should be ongoing") - .mark_as_finished(); - } - // Let every Migratable object know about the migration being complete vm.complete_migration() } @@ -1992,6 +1985,7 @@ impl Vmm { let MigrationThreadOut { mut vm, migration_res, + migration_cfg, } = self .migration_thread_handle .take() @@ -2005,15 +1999,20 @@ impl Vmm { drop(vm); { - info!("Sending Receiver in HTTP thread that migration succeeded"); - let (sender, _) = &*ONGOING_LIVEMIGRATION; - // unblock API call; propagate migration result - sender.send(Ok(())).unwrap(); + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_finished(); } - // Shutdown the VM after the migration succeeded - if let Err(e) = self.exit_evt.write(1) { - error!("Failed shutting down the VM after migration: {e}"); + if migration_cfg.keep_alive { + // API users can still query live-migration statistics + info!("Keeping VMM alive as requested"); + } else { + // Shutdown the VM after the migration succeeded + if let Err(e) = self.exit_evt.write(1) { + error!("Failed shutting down the VM after migration: {e}"); + } } } Err(e) => { @@ -2051,14 +2050,6 @@ impl Vmm { .expect("live migration should be ongoing") .mark_as_failed(&e); } - - { - info!("Sending Receiver in HTTP thread that migration failed"); - let (sender, _) = &*ONGOING_LIVEMIGRATION; - // unblock API call; propagate 
migration result - sender.send(Err(e)).unwrap(); - } - // we don't fail the VMM here, it just continues running its VM } } } From 8bbea46a0cd9028479a62098863054865c08d0aa Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Feb 2026 09:43:17 +0100 Subject: [PATCH 065/178] ch-remote: wait for migration to finish by querying migration progress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We preserve the old behavior in ch-remote: SendMigration is blocking. A new `--dispatch` flag however ensures that one can just dispatch the migration without waiting for it to finish (or fail). On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- Cargo.lock | 1 + cloud-hypervisor/Cargo.toml | 1 + cloud-hypervisor/src/bin/ch-remote.rs | 83 +++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 86107bc64c..82d42a7023 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -484,6 +484,7 @@ dependencies = [ "tpm", "tracer", "vm-memory", + "vm-migration", "vmm", "vmm-sys-util", "wait-timeout", diff --git a/cloud-hypervisor/Cargo.toml b/cloud-hypervisor/Cargo.toml index 39bcd3ff47..be61df65f6 100644 --- a/cloud-hypervisor/Cargo.toml +++ b/cloud-hypervisor/Cargo.toml @@ -30,6 +30,7 @@ thiserror = { workspace = true } tpm = { path = "../tpm" } tracer = { path = "../tracer" } vm-memory = { workspace = true } +vm-migration = { path = "../vm-migration" } vmm = { path = "../vmm" } vmm-sys-util = { workspace = true } zbus = { version = "5.15.0", optional = true } diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index 44573a652d..7d7a75abd6 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -11,17 +11,20 @@ use std::io::Read; use std::marker::PhantomData; use std::os::unix::net::UnixStream; use std::process; +use std::thread::sleep; +use std::time::Duration; use api_client::{ -
Error as ApiClientError, simple_api_command, simple_api_command_with_fds, - simple_api_full_command, + Error as ApiClientError, StatusCode, simple_api_command, simple_api_command_with_fds, + simple_api_full_command, simple_api_full_command_and_response, }; #[cfg(feature = "dbus_api")] use clap::ArgAction; use clap::{Arg, ArgMatches, Command}; -use log::error; +use log::{error, info}; use option_parser::{ByteSized, ByteSizedParseError}; use thiserror::Error; +use vm_migration::progress::{MigrationProgress, MigrationState}; use vmm::config::RestoreConfig; use vmm::vm_config::{ DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, @@ -521,6 +524,14 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .map_err(Error::HttpApiClient) } Some("send-migration") => { + let just_dispatch = matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("dispatch") + .cloned() + .unwrap_or(false); + let wait_for_migration = !just_dispatch; + let send_migration_data = send_migration_data( matches .subcommand_matches("send-migration") @@ -529,7 +540,65 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .unwrap(), )?; simple_api_command(socket, "PUT", "send-migration", Some(&send_migration_data)) - .map_err(Error::HttpApiClient) + .map_err(Error::HttpApiClient)?; + + if !wait_for_migration { + return Ok(()); + } + loop { + let response = simple_api_full_command_and_response( + socket, + "GET", + "vm.migration-progress", + None, + ) + .map_err(Error::HttpApiClient)? 
+ // should have response + .ok_or(Error::HttpApiClient(ApiClientError::ServerResponse( + StatusCode::Ok, + None, + )))?; + + // This is guaranteed by the SendMigration call + assert_ne!( + response, "null", + "migration progress should be there immediately when the migration was dispatched" + ); + + let progress = serde_json::from_slice::(response.as_bytes()) + .map_err(|e| { + error!("failed to parse response as MigrationProgress: {e}"); + Error::HttpApiClient(ApiClientError::ServerResponse( + StatusCode::Ok, + Some(response), + )) + })?; + + match progress.state { + MigrationState::Cancelled { .. } => { + info!("Migration was cancelled"); + break; + } + MigrationState::Failed { + error_msg, + error_msg_debug, + } => { + error!("Migration failed! {error_msg}\n{error_msg_debug}"); + break; + } + MigrationState::Finished { .. } => { + info!("Migration finished successfully. Shutting down Cloud Hypervisor"); + simple_api_full_command(socket, "PUT", "vmm.shutdown", None) + .map_err(Error::HttpApiClient)?; + break; + } + MigrationState::Ongoing { .. } => { + sleep(Duration::from_millis(50)); + continue; + } + } + } + Ok(()) } Some("receive-migration") => { let receive_migration_data = receive_migration_data( @@ -1135,6 +1204,12 @@ fn get_cli_commands_sorted() -> Box<[Command]> { Command::new("resume").about("Resume the VM"), Command::new("send-migration") .about("Initiate a VM migration") + .arg( + Arg::new("dispatch") + .long("dispatch") + .help("just dispatch the migration without waiting for it to finish") + .num_args(0), + ) .arg( Arg::new("send_migration_config") .index(1) From 98405450b273d3d239eb56556f88be53ba70f7a2 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 19 Feb 2026 11:26:42 +0100 Subject: [PATCH 066/178] vmm: api: less verbose log These events happen fairly often now and are very spammy in the log. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/api/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 3a1c8a4eae..363dd8737f 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -40,7 +40,7 @@ use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use std::time::Duration; -use log::info; +use log::{debug, info}; use micro_http::Body; use option_parser::{OptionParser, OptionParserError, Toggle}; use serde::{Deserialize, Serialize}; @@ -1371,7 +1371,7 @@ impl ApiAction for VmInfo { fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { - info!("API request event: VmInfo"); + debug!("API request event: VmInfo"); let response = vmm .vm_info() @@ -1966,7 +1966,7 @@ impl ApiAction for VmMigrationProgress { fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { - info!("API request event: VmMigrationProgress"); + debug!("API request event: VmMigrationProgress"); let snapshot = Ok(vmm.vm_migration_progress()); let response = snapshot From 26778e94dae7a8e8415ee3caf9531a1d98fa4753 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 19 Feb 2026 09:58:36 +0100 Subject: [PATCH 067/178] devices: stop reset loops when vCPUs are paused Legacy reset/poweroff paths waited only for vcpus_kill_signalled before leaving their spin loops. During migration, vCPUs are pause-signalled, not kill-signalled. This could stall reset/poweroff handling and block migration completion. Also break these waits on vcpus_pause_signalled and wire the new flag through device construction paths. 
Updated device paths: - i8042: guest reboot/reset write path wait loop - CMOS: reset register write (0x0f) wait loop - ACPI shutdown device: - reboot/reset write path wait loop - poweroff/shutdown write path wait loop On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- devices/src/acpi.rs | 11 +++++++++-- devices/src/legacy/cmos.rs | 10 +++++++++- devices/src/legacy/i8042.rs | 12 ++++++++++-- fuzz/fuzz_targets/cmos.rs | 1 + vmm/src/cpu.rs | 5 +++++ vmm/src/device_manager.rs | 15 +++++++++++++++ 6 files changed, 49 insertions(+), 5 deletions(-) diff --git a/devices/src/acpi.rs b/devices/src/acpi.rs index a9c86aa18e..49f166655d 100644 --- a/devices/src/acpi.rs +++ b/devices/src/acpi.rs @@ -24,6 +24,7 @@ pub struct AcpiShutdownDevice { guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, } impl AcpiShutdownDevice { @@ -32,11 +33,13 @@ impl AcpiShutdownDevice { guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, ) -> AcpiShutdownDevice { AcpiShutdownDevice { guest_exit_evt, reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, } } } @@ -56,7 +59,9 @@ impl BusDevice for AcpiShutdownDevice { } // Spin until we are sure the reset_evt has been handled and that when // we return from the KVM_RUN we will exit rather than re-enter the guest. - while !self.vcpus_kill_signalled.load(Ordering::SeqCst) { + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the VMM thread thread::sleep(std::time::Duration::from_millis(1)); @@ -73,7 +78,9 @@ impl BusDevice for AcpiShutdownDevice { } // Spin until we are sure the reset_evt has been handled and that when // we return from the KVM_RUN we will exit rather than re-enter the guest. 
- while !self.vcpus_kill_signalled.load(Ordering::SeqCst) { + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the VMM thread thread::sleep(std::time::Duration::from_millis(1)); diff --git a/devices/src/legacy/cmos.rs b/devices/src/legacy/cmos.rs index 238f2184d6..c5185622f4 100644 --- a/devices/src/legacy/cmos.rs +++ b/devices/src/legacy/cmos.rs @@ -28,6 +28,7 @@ pub struct Cmos { data: [u8; DATA_LEN], reset_evt: EventFd, vcpus_kill_signalled: Option>, + vcpus_pause_signalled: Option>, } impl Cmos { @@ -39,6 +40,7 @@ impl Cmos { mem_above_4g: u64, reset_evt: EventFd, vcpus_kill_signalled: Option>, + vcpus_pause_signalled: Option>, ) -> Cmos { let mut data = [0u8; DATA_LEN]; @@ -61,6 +63,7 @@ impl Cmos { data, reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, } } } @@ -79,9 +82,14 @@ impl BusDevice for Cmos { info!("CMOS reset"); self.reset_evt.write(1).unwrap(); if let Some(vcpus_kill_signalled) = self.vcpus_kill_signalled.take() { + let pause_signalled = self.vcpus_pause_signalled.clone(); // Spin until we are sure the reset_evt has been handled and that when // we return from the KVM_RUN we will exit rather than re-enter the guest. 
- while !vcpus_kill_signalled.load(Ordering::SeqCst) { + while !vcpus_kill_signalled.load(Ordering::SeqCst) + && !pause_signalled + .as_ref() + .is_some_and(|p| p.load(Ordering::SeqCst)) + { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the VMM thread thread::sleep(std::time::Duration::from_millis(1)); diff --git a/devices/src/legacy/i8042.rs b/devices/src/legacy/i8042.rs index 0e014ab8bf..7639f819e0 100644 --- a/devices/src/legacy/i8042.rs +++ b/devices/src/legacy/i8042.rs @@ -16,14 +16,20 @@ use vmm_sys_util::eventfd::EventFd; pub struct I8042Device { reset_evt: EventFd, vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, } impl I8042Device { /// Constructs a i8042 device that will signal the given event when the guest requests it. - pub fn new(reset_evt: EventFd, vcpus_kill_signalled: Arc) -> I8042Device { + pub fn new( + reset_evt: EventFd, + vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, + ) -> I8042Device { I8042Device { reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, } } } @@ -50,7 +56,9 @@ impl BusDevice for I8042Device { } // Spin until we are sure the reset_evt has been handled and that when // we return from the KVM_RUN we will exit rather than re-enter the guest. 
- while !self.vcpus_kill_signalled.load(Ordering::SeqCst) { + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the VMM thread thread::sleep(std::time::Duration::from_millis(1)); diff --git a/fuzz/fuzz_targets/cmos.rs b/fuzz/fuzz_targets/cmos.rs index 9ae5b59da5..baf7ca8fdd 100644 --- a/fuzz/fuzz_targets/cmos.rs +++ b/fuzz/fuzz_targets/cmos.rs @@ -26,6 +26,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { u64::from_le_bytes(above_4g), EventFd::new(EFD_NONBLOCK).unwrap(), None, + None, ); let mut i = 16; diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index f42d3adfb3..9873e2abcc 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -2310,6 +2310,11 @@ impl CpuManager { &self.vcpus_kill_signalled } + pub(crate) fn vcpus_pause_signalled(&self) -> &Arc { + &self.vcpus_pause_signalled + } + + #[cfg(feature = "igvm")] #[cfg(all(feature = "igvm", feature = "mshv"))] pub(crate) fn get_cpuid_leaf( &self, diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index ff71462571..b1a132ede4 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1918,10 +1918,17 @@ impl DeviceManager { .unwrap() .vcpus_kill_signalled() .clone(); + let vcpus_pause_signalled = self + .cpu_manager + .lock() + .unwrap() + .vcpus_pause_signalled() + .clone(); let shutdown_device = Arc::new(Mutex::new(devices::AcpiShutdownDevice::new( guest_exit_evt, reset_evt, vcpus_kill_signalled, + vcpus_pause_signalled, ))); self.bus_devices @@ -2026,10 +2033,17 @@ impl DeviceManager { .unwrap() .vcpus_kill_signalled() .clone(); + let vcpus_pause_signalled = self + .cpu_manager + .lock() + .unwrap() + .vcpus_pause_signalled() + .clone(); // Add a shutdown device (i8042) let i8042 = Arc::new(Mutex::new(devices::legacy::I8042Device::new( reset_evt.try_clone().unwrap(), vcpus_kill_signalled.clone(), + vcpus_pause_signalled.clone(), ))); 
self.bus_devices @@ -2058,6 +2072,7 @@ impl DeviceManager { mem_above_4g, reset_evt, Some(vcpus_kill_signalled), + Some(vcpus_pause_signalled.clone()), ))); self.bus_devices From 07426a58c1598252b0f6d797c97fac43ccdb208a Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 19 Feb 2026 14:25:17 +0100 Subject: [PATCH 068/178] devices: cmos: simplify pause/kill wait flags Align CMOS reset wait logic with i8042/ACPI by using direct pause/kill flags instead of optional wrappers and Option checks. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- devices/src/legacy/cmos.rs | 29 ++++++++++++----------------- fuzz/fuzz_targets/cmos.rs | 7 +++++-- vmm/src/device_manager.rs | 4 ++-- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/devices/src/legacy/cmos.rs b/devices/src/legacy/cmos.rs index c5185622f4..8f4b44941b 100644 --- a/devices/src/legacy/cmos.rs +++ b/devices/src/legacy/cmos.rs @@ -27,8 +27,8 @@ pub struct Cmos { index: u8, data: [u8; DATA_LEN], reset_evt: EventFd, - vcpus_kill_signalled: Option>, - vcpus_pause_signalled: Option>, + vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, } impl Cmos { @@ -39,8 +39,8 @@ impl Cmos { mem_below_4g: u64, mem_above_4g: u64, reset_evt: EventFd, - vcpus_kill_signalled: Option>, - vcpus_pause_signalled: Option>, + vcpus_kill_signalled: Arc, + vcpus_pause_signalled: Arc, ) -> Cmos { let mut data = [0u8; DATA_LEN]; @@ -81,19 +81,14 @@ impl BusDevice for Cmos { if self.index == 0x8f && data[0] == 0 { info!("CMOS reset"); self.reset_evt.write(1).unwrap(); - if let Some(vcpus_kill_signalled) = self.vcpus_kill_signalled.take() { - let pause_signalled = self.vcpus_pause_signalled.clone(); - // Spin until we are sure the reset_evt has been handled and that when - // we return from the KVM_RUN we will exit rather than re-enter the guest. 
- while !vcpus_kill_signalled.load(Ordering::SeqCst) - && !pause_signalled - .as_ref() - .is_some_and(|p| p.load(Ordering::SeqCst)) - { - // This is more effective than thread::yield_now() at - // avoiding a priority inversion with the VMM thread - thread::sleep(std::time::Duration::from_millis(1)); - } + // Spin until we are sure the reset_evt has been handled and that when + // we return from the KVM_RUN we will exit rather than re-enter the guest. + while !self.vcpus_kill_signalled.load(Ordering::SeqCst) + && !self.vcpus_pause_signalled.load(Ordering::SeqCst) + { + // This is more effective than thread::yield_now() at + // avoiding a priority inversion with the VMM thread + thread::sleep(std::time::Duration::from_millis(1)); } } else { self.data[(self.index & INDEX_MASK) as usize] = data[0]; diff --git a/fuzz/fuzz_targets/cmos.rs b/fuzz/fuzz_targets/cmos.rs index baf7ca8fdd..c925295790 100644 --- a/fuzz/fuzz_targets/cmos.rs +++ b/fuzz/fuzz_targets/cmos.rs @@ -3,6 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #![no_main] +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + use devices::legacy::Cmos; use libc::EFD_NONBLOCK; use libfuzzer_sys::{fuzz_target, Corpus}; @@ -25,8 +28,8 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { u64::from_le_bytes(below_4g), u64::from_le_bytes(above_4g), EventFd::new(EFD_NONBLOCK).unwrap(), - None, - None, + Arc::new(AtomicBool::new(false)), + Arc::new(AtomicBool::new(false)), ); let mut i = 16; diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index b1a132ede4..13f27e666f 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2071,8 +2071,8 @@ impl DeviceManager { mem_below_4g, mem_above_4g, reset_evt, - Some(vcpus_kill_signalled), - Some(vcpus_pause_signalled.clone()), + vcpus_kill_signalled, + vcpus_pause_signalled.clone(), ))); self.bus_devices From 668a6374e4ce68b4cf7a0349b763c5078d572520 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Feb 2026 16:42:36 +0100 Subject: [PATCH 
069/178] vmm: set timeouts on sockets used for live migrations That way, when the connection between sender and receiver dies, both sides will time out and notice that something is wrong. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 53 +++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index d9e3e20077..1605a5e818 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -37,6 +37,19 @@ use crate::{GuestMemoryMmap, VmMigrationConfig}; /// receiver side. pub(crate) const MAX_MIGRATION_CONNECTIONS: u32 = 128; +const RECEIVE_MIGRATION_SOCKET_TIMEOUT: Duration = Duration::from_secs(10); +const SEND_MIGRATION_SOCKET_TIMEOUT: Duration = Duration::from_secs(5); + +fn set_migration_socket_timeouts(socket: &TcpStream, timeout: Duration) -> anyhow::Result<()> { + socket + .set_read_timeout(Some(timeout)) + .context("Error setting read timeout on TCP socket")?; + socket + .set_write_timeout(Some(timeout)) + .context("Error setting write timeout on TCP socket")?; + Ok(()) +} + /// Transport-agnostic listener used to receive connections. #[derive(Debug)] pub(crate) enum ReceiveListener { @@ -49,25 +62,35 @@ impl ReceiveListener { /// Block until a connection is accepted. 
pub(crate) fn accept(&mut self) -> Result { match self { - ReceiveListener::Tcp(listener) => listener - .accept() - .map(|(socket, _)| SocketStream::Tcp(socket)) - .context("Failed to accept TCP migration connection") - .map_err(MigratableError::MigrateReceive), + ReceiveListener::Tcp(listener) => { + let (socket, _) = listener + .accept() + .context("Failed to accept TCP migration connection") + .map_err(MigratableError::MigrateReceive)?; + set_migration_socket_timeouts(&socket, RECEIVE_MIGRATION_SOCKET_TIMEOUT) + .map_err(MigratableError::MigrateReceive)?; + + Ok(SocketStream::Tcp(socket)) + } ReceiveListener::Unix(listener) => listener .accept() .map(|(socket, _)| SocketStream::Unix(socket)) .context("Failed to accept Unix migration connection") .map_err(MigratableError::MigrateReceive), - ReceiveListener::Tls(listener, config) => listener - .accept() - .map(|(socket, _)| TlsStream::new_server(socket, config)) - .context("Failed to accept TCP connection") - .map_err(MigratableError::MigrateReceive)? 
- .map(Box::new) - .map(SocketStream::Tls) - .context("Failed to accept TLS migration connection") - .map_err(MigratableError::MigrateReceive), + ReceiveListener::Tls(listener, config) => { + let (socket, _) = listener + .accept() + .context("Failed to accept TCP connection") + .map_err(MigratableError::MigrateReceive)?; + set_migration_socket_timeouts(&socket, RECEIVE_MIGRATION_SOCKET_TIMEOUT) + .map_err(MigratableError::MigrateReceive)?; + + TlsStream::new_server(socket, config) + .map(Box::new) + .map(SocketStream::Tls) + .context("Failed to accept TLS migration connection") + .map_err(MigratableError::MigrateReceive) + } } } @@ -821,6 +844,8 @@ pub(crate) fn send_migration_socket( let socket = TcpStream::connect(address).map_err(|e| { MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) })?; + set_migration_socket_timeouts(&socket, SEND_MIGRATION_SOCKET_TIMEOUT) + .map_err(MigratableError::MigrateSend)?; if let Some(tls_dir) = tls_dir { let server_name = tcp_address_to_server_name(address) From 8be5c81041dc2c0f286691ed5b414e9a9a25de3f Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 11 Feb 2026 16:20:27 +0100 Subject: [PATCH 070/178] vmm: make aborted migrations return errors On the receiver side, a live migration with status "aborted" does not return an error. Thus, management software will think that the live migration was successful. This is not expected behaviour. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 901c2772d9..7239458e12 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3005,10 +3005,11 @@ impl RequestHandler for Vmm { event!("vm", "migration-receive-failed"); self.vm = MaybeVmOwnership::None; self.vm_config = None; - } else { - event!("vm", "migration-receive-finished"); + return Err(MigratableError::CompleteMigration(anyhow!( + "Migration was aborted" + ))); } - + event!("vm", "migration-receive-finished"); Ok(()) } From 599d53a31ac70d55e194e499609f9cad920142bd Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Feb 2026 17:16:12 +0100 Subject: [PATCH 071/178] vm-migration: send periodic keep alive messages during live migration After introducing timeouts on sockets used for live migrations, live migrations with multiple TCP connections stopped working. This happens because the receiver side of the main connection does not get any messages while the VM's memory is being sent. Thus, we now send periodic keep alive messages when using multiple TCP connections.
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/keep_alive_stream.rs | 282 ++++++++++++++++++++++++++ vm-migration/src/lib.rs | 2 +- vm-migration/src/protocol.rs | 11 +- vmm/src/lib.rs | 20 +- vmm/src/migration_transport.rs | 27 +++ 5 files changed, 336 insertions(+), 6 deletions(-) create mode 100644 vm-migration/src/keep_alive_stream.rs diff --git a/vm-migration/src/keep_alive_stream.rs b/vm-migration/src/keep_alive_stream.rs new file mode 100644 index 0000000000..821af88436 --- /dev/null +++ b/vm-migration/src/keep_alive_stream.rs @@ -0,0 +1,282 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{self, Read, Write}; +use std::sync::mpsc::{Receiver, RecvTimeoutError, SyncSender, sync_channel}; +use std::thread::JoinHandle; +use std::time::Duration; +use std::{result, thread}; + +use vm_memory::bitmap::BitmapSlice; +use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; + +use crate::protocol::Request; + +/// The `KeepAliveStream` is a stream that is intended to be used for the main +/// connection of live migrations. If the `KeepAliveStream` does not read or +/// write often enough, it will send keep alive messages on the given stream. +/// The `KeepAliveStream` should not be used to send or receive memory, because +/// the `read_volatile()` and `write_volatile()` functions will be very slow. +/// +/// The `KeepAliveStream` is designed to be compatible with the `SocketStream` +/// enum, and thus it should be really easy to use it. +/// +/// The `KeepAliveStream` consists of a thread (the `KeepAliveWorker`) that owns +/// the given stream, and channels to send messages to said thread, and receive +/// answers from it. +// The messages that will be sent to the `KeepAliveWorker`. +#[derive(Debug)] +enum KeepAliveStreamMessage { + // Read `len` bytes from `stream`. + Read(usize /* len */), + // Write `buf` to `stream`. 
+ Write(Vec /* buf */), + // Flush `stream`. + Flush, + // Stop listening for messages, i.e. stop the worker. + Disconnect, +} + +// The answer we will get from the `KeepAliveWorker`. +#[derive(Debug)] +enum KeepAliveStreamAnswer { + // Result of reading from `stream`. + Read(io::Result<(Vec, usize)>), + // Result of writing to `stream`. + Write(io::Result), + // Result of flushing `stream`. + Flush(io::Result<()>), +} + +// The [`KeepAliveStream`] should only be used by the sender, not the receiver. +// Thus it doesn't have to implement `AsFd`. +struct KeepAliveWorker { + stream: S, +} + +impl KeepAliveWorker +where + S: Read + Write, +{ + pub fn new(stream: S) -> Self { + Self { stream } + } + + pub fn read(&mut self, len: usize) -> io::Result<(Vec, usize)> { + let mut buf: Vec = vec![0u8; len]; + let n = Read::read(&mut self.stream, &mut buf)?; + Ok((buf, n)) + } + + pub fn write(&mut self, buf: &[u8]) -> io::Result { + Write::write(&mut self.stream, buf) + } + + pub fn flush(&mut self) -> io::Result<()> { + Write::flush(&mut self.stream) + } +} + +pub struct KeepAliveStream { + /// The `KeepAliveWorker`. + thread: Option>, + + /// Used to send messages to the worker. + message_tx: SyncSender, + /// Used to receive answers from the worker. + answer_rx: Receiver, +} + +impl KeepAliveStream { + pub fn new( + stream: T, + timeout: Duration, + ) -> result::Result { + // We want to block on send and on recv if nobody listens. Thus we set the bound to 0. + let (message_tx, message_rx) = sync_channel::(0); + let (answer_tx, answer_rx) = sync_channel::(0); + + let thread = thread::Builder::new() + .name("keep_alive_sender_thread".to_string()) + .spawn(move || { + let mut worker = KeepAliveWorker::new(stream); + loop { + // The idea is to always send a keep alive message when this times out. 
+ match message_rx.recv_timeout(timeout) { + Ok(message) => match message { + KeepAliveStreamMessage::Read(payload) => { + if answer_tx + .send(KeepAliveStreamAnswer::Read(worker.read(payload))) + .is_err() + { + // We simply break the loop and thus stop the thread if anything bad happens. + // The main thread will notice next time it tries to send a message to the thread. + break; + } + } + KeepAliveStreamMessage::Write(payload) => { + if answer_tx + .send(KeepAliveStreamAnswer::Write(worker.write(&payload))) + .is_err() + { + break; + } + } + KeepAliveStreamMessage::Flush => { + if answer_tx + .send(KeepAliveStreamAnswer::Flush(worker.flush())) + .is_err() + { + break; + } + } + KeepAliveStreamMessage::Disconnect => break, + }, + Err(RecvTimeoutError::Timeout) => { + let keep_alive = Request::keep_alive(); + let _ = keep_alive.write_to(&mut worker.stream); + } + Err(RecvTimeoutError::Disconnected) => break, + } + } + })?; + + Ok(Self { + thread: Some(thread), + message_tx, + answer_rx, + }) + } +} + +impl Drop for KeepAliveStream { + fn drop(&mut self) { + let _ = self.message_tx.send(KeepAliveStreamMessage::Disconnect); + if let Some(handle) = self.thread.take() { + let _ = handle.join(); + } + } +} + +impl Read for KeepAliveStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.message_tx + .send(KeepAliveStreamMessage::Read(buf.len())) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + })?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Read(result)) => match result { + Ok((recv_buf, len)) => { + buf[..len].copy_from_slice(&recv_buf[..len]); + Ok(len) + } + Err(e) => Err(e), + }, + Ok(a) => Err(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!" 
+ ))), + Err(e) => Err(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + ))), + } + } +} + +impl Write for KeepAliveStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.message_tx + .send(KeepAliveStreamMessage::Write(Vec::from(buf))) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + })?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Write(result)) => result, + Ok(a) => Err(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!", + ))), + Err(e) => Err(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + ))), + } + } + + fn flush(&mut self) -> io::Result<()> { + self.message_tx + .send(KeepAliveStreamMessage::Flush) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + })?; + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Flush(result)) => result, + Ok(a) => Err(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!", + ))), + Err(e) => Err(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + ))), + } + } +} + +impl ReadVolatile for KeepAliveStream { + fn read_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> result::Result { + self.message_tx + .send(KeepAliveStreamMessage::Read(buf.len())) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + }) + .map_err(VolatileMemoryError::IOError)?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Read(result)) => match result { + Ok((recv_buf, len)) => { + buf.copy_from(&recv_buf[..len]); + Ok(len) + } + Err(e) => Err(VolatileMemoryError::IOError(e)), + }, + Ok(a) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Received unexpected answer: {a:?}. 
This is most likely a bug!", + )))), + Err(e) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + )))), + } + } +} + +impl WriteVolatile for KeepAliveStream { + fn write_volatile( + &mut self, + buf: &VolatileSlice, + ) -> result::Result { + let mut send_buf = vec![0u8; buf.len()]; + buf.copy_to(&mut send_buf); + self.message_tx + .send(KeepAliveStreamMessage::Write(send_buf)) + .map_err(|e| { + io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) + }) + .map_err(VolatileMemoryError::IOError)?; + + match self.answer_rx.recv() { + Ok(KeepAliveStreamAnswer::Write(result)) => { + result.map_err(VolatileMemoryError::IOError) + } + Ok(a) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Received unexpected answer: {a:?}. This is most likely a bug!", + )))), + Err(e) => Err(VolatileMemoryError::IOError(io::Error::other(format!( + "Unable to receive answer from KeepAliveWorker: {e}" + )))), + } + } +} diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index c5c0ce3c3f..38e42b29c3 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -15,6 +15,7 @@ use crate::protocol::MemoryRangeTable; mod bitpos_iterator; mod context; +pub mod keep_alive_stream; pub mod progress; pub mod protocol; pub mod tls; @@ -50,7 +51,6 @@ pub enum UffdError { #[error("Handler failed after startup")] HandlerFailed(#[source] std::io::Error), } - #[derive(Error, Debug)] pub enum MigratableError { #[error("Failed to pause migratable component")] diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 5fffbada87..01e0fba853 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -123,7 +123,12 @@ pub enum Command { MemoryFd = 7, /// Finalizes the migration without resuming the VM on the destination. /// Sent when the source VM was paused at migration time. 
- CompletePaused = 8, + CompletePaused = 9, + // We introduced this with discriminant eight but in the meantime, + // upstream introduced a new command with discriminant 8. For + // migration-compatibility we stick to this temporarily, until we have + // a solution for the discriminant collision. + KeepAlive = 8, } #[repr(C)] @@ -180,6 +185,10 @@ impl Request { Self::new(Command::Abandon, 0) } + pub fn keep_alive() -> Self { + Self::new(Command::KeepAlive, 0) + } + pub fn command(&self) -> Command { self.command } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 7239458e12..62b9c49d8e 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1686,10 +1686,18 @@ impl Vmm { let mut ctx = OngoingMigrationContext::new(); // Set up the socket connection - let mut socket = migration_transport::send_migration_socket( - &send_data_migration.destination_url, - send_data_migration.tls_dir.as_deref(), - )?; + let mut socket = if send_data_migration.connections.get() > 1 && !send_data_migration.local + { + migration_transport::send_migration_socket_with_keep_alive( + &send_data_migration.destination_url, + send_data_migration.tls_dir.as_deref(), + )? + } else { + migration_transport::send_migration_socket( + &send_data_migration.destination_url, + send_data_migration.tls_dir.as_deref(), + )? 
+ }; // Start the migration migration_transport::send_request_expect_ok( @@ -2978,6 +2986,10 @@ impl RequestHandler for Vmm { let req = Request::read_from(&mut socket)?; trace!("Command {:?} received", req.command()); + if req.command() == Command::KeepAlive { + continue; + } + let (response, new_state) = match self.vm_receive_migration_step( &mut socket, &listener, diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 1605a5e818..388b676c7a 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -25,6 +25,7 @@ use vm_memory::{ Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile, }; +use vm_migration::keep_alive_stream::KeepAliveStream; use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; use vm_migration::tls::{TlsServerConfig, TlsStream}; use vm_migration::{MigratableError, Snapshot}; @@ -148,6 +149,7 @@ pub(crate) enum SocketStream { Unix(UnixStream), Tcp(TcpStream), Tls(Box), + KeepAlive(KeepAliveStream), } impl Read for SocketStream { @@ -156,6 +158,7 @@ impl Read for SocketStream { SocketStream::Unix(stream) => stream.read(buf), SocketStream::Tcp(stream) => stream.read(buf), SocketStream::Tls(stream) => stream.read(buf), + SocketStream::KeepAlive(stream) => stream.read(buf), } } } @@ -166,6 +169,7 @@ impl Write for SocketStream { SocketStream::Unix(stream) => stream.write(buf), SocketStream::Tcp(stream) => stream.write(buf), SocketStream::Tls(stream) => stream.write(buf), + SocketStream::KeepAlive(stream) => stream.write(buf), } } @@ -174,6 +178,7 @@ impl Write for SocketStream { SocketStream::Unix(stream) => stream.flush(), SocketStream::Tcp(stream) => stream.flush(), SocketStream::Tls(stream) => stream.flush(), + SocketStream::KeepAlive(stream) => stream.flush(), } } } @@ -184,6 +189,9 @@ impl AsFd for SocketStream { SocketStream::Unix(s) => s.as_fd(), SocketStream::Tcp(s) => s.as_fd(), SocketStream::Tls(s) 
=> s.as_fd(), + SocketStream::KeepAlive(_) => { + unreachable!("KeepAliveStream is only used by the migration sender") + } } } } @@ -197,6 +205,7 @@ impl ReadVolatile for SocketStream { SocketStream::Unix(s) => s.read_volatile(buf), SocketStream::Tcp(s) => s.read_volatile(buf), SocketStream::Tls(s) => s.read_volatile(buf), + SocketStream::KeepAlive(s) => s.read_volatile(buf), } } } @@ -210,6 +219,7 @@ impl WriteVolatile for SocketStream { SocketStream::Unix(s) => s.write_volatile(buf), SocketStream::Tcp(s) => s.write_volatile(buf), SocketStream::Tls(s) => s.write_volatile(buf), + SocketStream::KeepAlive(s) => s.write_volatile(buf), } } } @@ -871,6 +881,23 @@ pub(crate) fn send_migration_socket( } } +/// Connect to the main migration endpoint and keep the connection active while +/// memory is transferred over additional streams. +pub(crate) fn send_migration_socket_with_keep_alive( + destination_url: &str, + tls_dir: Option<&Path>, +) -> Result { + match send_migration_socket(destination_url, tls_dir)? { + socket @ (SocketStream::Tcp(_) | SocketStream::Tls(_)) => { + KeepAliveStream::new(socket, SEND_MIGRATION_SOCKET_TIMEOUT) + .map(SocketStream::KeepAlive) + .context("Error creating keep-alive migration stream") + .map_err(MigratableError::MigrateSend) + } + socket => Ok(socket), + } +} + /// Bind a migration listener for the receiver side. pub(crate) fn receive_migration_listener( receiver_url: &str, From a3f00f1a941ce756b1ac01a53038ce6ade3fa90f Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 23 Feb 2026 15:54:42 +0100 Subject: [PATCH 072/178] vmm: return error if the migration sender never connects The receiver of a migration would listen forever if the sender never connects. This can happen if the sender fails very early during the live migration. With this change the receiver only waits a few seconds and then returns an error. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 388b676c7a..56da4fd953 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -30,6 +30,7 @@ use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; use vm_migration::tls::{TlsServerConfig, TlsStream}; use vm_migration::{MigratableError, Snapshot}; use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::timerfd::TimerFd; use crate::sync_utils::Gate; use crate::{GuestMemoryMmap, VmMigrationConfig}; @@ -64,8 +65,7 @@ impl ReceiveListener { pub(crate) fn accept(&mut self) -> Result { match self { ReceiveListener::Tcp(listener) => { - let (socket, _) = listener - .accept() + let (socket, _) = accept_with_timeout(listener, RECEIVE_MIGRATION_SOCKET_TIMEOUT) .context("Failed to accept TCP migration connection") .map_err(MigratableError::MigrateReceive)?; set_migration_socket_timeouts(&socket, RECEIVE_MIGRATION_SOCKET_TIMEOUT) @@ -79,8 +79,7 @@ impl ReceiveListener { .context("Failed to accept Unix migration connection") .map_err(MigratableError::MigrateReceive), ReceiveListener::Tls(listener, config) => { - let (socket, _) = listener - .accept() + let (socket, _) = accept_with_timeout(listener, RECEIVE_MIGRATION_SOCKET_TIMEOUT) .context("Failed to accept TCP connection") .map_err(MigratableError::MigrateReceive)?; set_migration_socket_timeouts(&socket, RECEIVE_MIGRATION_SOCKET_TIMEOUT) @@ -134,6 +133,26 @@ impl ReceiveListener { } } +/// Same as [`TcpListener::accept`], but returns an error if `timeout` expires. 
+fn accept_with_timeout( + listener: &TcpListener, + timeout: Duration, +) -> Result<(TcpStream, std::net::SocketAddr), io::Error> { + let mut timer_fd = TimerFd::new()?; + timer_fd + .reset(timeout, None) + .map_err(|e| io::Error::from_raw_os_error(e.errno()))?; + + wait_for_readable(listener, &timer_fd)? + .then(|| listener.accept()) + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::TimedOut, + "Timed out waiting for sender to connect.", + ) + })? +} + impl AsFd for ReceiveListener { fn as_fd(&self) -> BorrowedFd<'_> { match self { From eebf20fe85487bd5837c7e45422a1ba74119b33b Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 26 Feb 2026 13:15:23 +0100 Subject: [PATCH 073/178] vmm: add context to MigrateSend-Error This commit adds more context to MigrateSend-Errors, since they were often not helpful. I did that by 1) no longer converting errors to strings, 2) using anyhow::Context to add context to errors, and 3) just adding more text to some error messages. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 7 +++---- vmm/src/memory_manager.rs | 30 ++++++++++++++++++------------ vmm/src/vm.rs | 36 +++++++++++++++++++++--------------- 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 62b9c49d8e..63046abe47 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1703,7 +1703,7 @@ impl Vmm { migration_transport::send_request_expect_ok( &mut socket, Request::start(), - MigratableError::MigrateSend(anyhow!("Error starting migration")), + MigratableError::MigrateSend(anyhow!("Error starting migration (got bad response)")), )?; // Send config @@ -1730,9 +1730,8 @@ impl Vmm { amx, }, ) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error generating common cpuid': {e:?}")) - })? + .context("Error generating common cpuid") + .map_err(MigratableError::MigrateSend)?
}; if send_data_migration.local { diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 41dbfc436c..dc3e8de3f3 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -19,7 +19,7 @@ use std::sync::{Arc, Barrier, Mutex}; use std::{ffi, result, thread}; use acpi_tables::{Aml, aml}; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; use arch::RegionType; #[cfg(target_arch = "x86_64")] use devices::ioapic; @@ -3162,7 +3162,8 @@ impl Transportable for MemoryManager { .write(true) .create_new(true) .open(&memory_file_path) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error creating snapshot file for memory") + .map_err(MigratableError::MigrateSend)?; let total_len: u64 = self .snapshot_memory_ranges @@ -3224,7 +3225,8 @@ impl Transportable for MemoryManager { &mut memory_file, (range.length - offset) as usize, ) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error writing guest memory to snapshot file") + .map_err(MigratableError::MigrateSend)?; offset += bytes_written as u64; if offset == range.length { break; @@ -3246,9 +3248,10 @@ impl Migratable for MemoryManager { // Just before we do a bulk copy we want to start/clear the dirty log so that // pages touched during our bulk copy are tracked. 
fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { - self.vm.start_dirty_log().map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {e}")) - })?; + self.vm + .start_dirty_log() + .context("Error starting VM dirty log") + .map_err(MigratableError::MigrateSend)?; for r in self.guest_memory.memory().iter() { (**r).bitmap().reset(); @@ -3258,9 +3261,10 @@ impl Migratable for MemoryManager { } fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { - self.vm.stop_dirty_log().map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {e}")) - })?; + self.vm + .stop_dirty_log() + .context("Error stopping VM dirty log") + .map_err(MigratableError::MigrateSend)?; Ok(()) } @@ -3270,9 +3274,11 @@ impl Migratable for MemoryManager { fn dirty_log(&mut self) -> std::result::Result { let mut table = MemoryRangeTable::default(); for r in &self.guest_ram_mappings { - let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {e}")) - })?; + let vm_dirty_bitmap = self + .vm + .get_dirty_log(r.slot, r.gpa, r.size) + .context("Error getting VM dirty log") + .map_err(MigratableError::MigrateSend)?; let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa)) { Some(region) => { diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index f28c9cddb5..3886d216b8 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -24,7 +24,7 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; use std::{cmp, result, str, thread}; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use arch::PciSpaceInfo; #[cfg(target_arch = "x86_64")] @@ -3072,18 +3072,18 @@ impl Vm { { Request::memory_fd(std::mem::size_of_val(&slot) as u64) .write_to(socket) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {e}")) - })?; 
+ .context("Error sending memory fd request") + .map_err(MigratableError::MigrateSend)?; socket .send_with_fd(&slot.to_le_bytes()[..], fd) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error sending memory fd: {e}")) - })?; + .context("Error sending memory fd") + .map_err(MigratableError::MigrateSend)?; Response::read_from(socket)?.ok_or_abandon( socket, - MigratableError::MigrateSend(anyhow!("Error during memory fd migration")), + MigratableError::MigrateSend(anyhow!( + "Error during memory fd migration (got bad response)" + )), )?; } @@ -3435,15 +3435,18 @@ impl Transportable for Vm { .write(true) .create_new(true) .open(snapshot_config_path) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error creating config snapshot file") + .map_err(MigratableError::MigrateSend)?; // Serialize and write the snapshot config let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error serializing VM config") + .map_err(MigratableError::MigrateSend)?; snapshot_config_file .write(vm_config.as_bytes()) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error writing serialized VM config") + .map_err(MigratableError::MigrateSend)?; let mut snapshot_state_path = url_to_path(destination_url)?; snapshot_state_path.push(SNAPSHOT_STATE_FILE); @@ -3454,15 +3457,18 @@ impl Transportable for Vm { .write(true) .create_new(true) .open(snapshot_state_path) - .map_err(|e| MigratableError::MigrateSend(e.into()))?; + .context("Error creating state snapshot file") + .map_err(MigratableError::MigrateSend)?; // Serialize and write the snapshot state - let vm_state = - serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; + let vm_state = serde_json::to_vec(snapshot) + .context("Error serializing state snapshot") + .map_err(MigratableError::MigrateSend)?; snapshot_state_file .write(&vm_state) - .map_err(|e| 
MigratableError::MigrateSend(e.into()))?; + .context("Error writing serialized state snapshot") + .map_err(MigratableError::MigrateSend)?; // Tell the memory manager to also send/write its own snapshot. if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { From fc81e2ac407d72e9b22969201767e099d4f7812c Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 26 Feb 2026 16:52:42 +0100 Subject: [PATCH 074/178] vmm: return error if prefaulting fails Prefaulting pages was done on a best-effort basis before, meaning that errors were ignored. This could lead to errors during runtime, especially when used with hugepages, because there was no guarantee that enough pages are available. With this change errors during prefaulting will be reported. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/memory_manager.rs | 62 +++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index dc3e8de3f3..edbb918dd3 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -439,6 +439,10 @@ pub enum Error { /// Memory size is misaligned with default page size or its hugepage size #[error("Memory size is misaligned with default page size or its hugepage size")] MisalignedMemorySize, + + /// Failed to prefault memory + #[error("Failed to prefault memory")] + PrefaultMemory(#[source] io::Error), } impl From for Error { @@ -1991,29 +1995,49 @@ impl MemoryManager { let remainder = num_pages % num_threads; let barrier = Arc::new(Barrier::new(num_threads)); - thread::scope(|s| { + thread::scope(|s| -> Result<(), Error> { let r = ®ion; + let mut handles = Vec::new(); for i in 0..num_threads { let barrier = Arc::clone(&barrier); - s.spawn(move || { - // Wait until all threads have been spawned to avoid contention - // over mmap_sem between thread stack allocation and page faulting. 
- barrier.wait(); - let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; - let offset = - page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); - // SAFETY: FFI call with correct arguments - let ret = unsafe { - let addr = r.as_ptr().add(offset); - libc::madvise(addr.cast(), pages * page_size, libc::MADV_POPULATE_WRITE) - }; - if ret != 0 { - let e = io::Error::last_os_error(); - warn!("Failed to prefault pages: {e}"); - } - }); + let h: thread::ScopedJoinHandle<'_, Result<(), io::Error>> = + s.spawn(move || { + // Wait until all threads have been spawned to avoid contention + // over mmap_sem between thread stack allocation and page faulting. + barrier.wait(); + let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; + let offset = + page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); + // SAFETY: FFI call with correct arguments + let ret = unsafe { + let addr = r.as_ptr().add(offset); + libc::madvise( + addr.cast(), + pages * page_size, + libc::MADV_POPULATE_WRITE, + ) + }; + if ret != 0 { + let e = io::Error::last_os_error(); + warn!("Failed to prefault pages: {e}"); + return Err(e); + } + Ok(()) + }); + handles.push(h); } - }); + + for handle in handles { + handle + .join() + .map_err(|_| { + Error::PrefaultMemory(io::Error::other("Prefault thread died")) + })? + .map_err(Error::PrefaultMemory)?; + } + + Ok(()) + })?; } info!( From 0041e57035dba7e1d0b93bf72af99a2e0ad38db8 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 2 Mar 2026 10:21:34 +0100 Subject: [PATCH 075/178] vmm: migration receiver: report error when migration is aborted This commit fixes the error reporting of the receiving side of a migration when the migration is aborted. Example: when a migration is aborted because the migration receiver doesn't have enough hugepages, the API response just reports that the migration was aborted, and the log tells us that the VMM couldn't prefault enough pages. 
With this commit, the API response contains this information. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 63046abe47..e270b2b923 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2981,7 +2981,7 @@ impl RequestHandler for Vmm { let mut state = ReceiveMigrationState::Established; - while !state.finished() { + let res: result::Result = loop { let req = Request::read_from(&mut socket)?; trace!("Command {:?} received", req.command()); @@ -2989,36 +2989,45 @@ impl RequestHandler for Vmm { continue; } - let (response, new_state) = match self.vm_receive_migration_step( + let (response, new_state, mut maybe_error) = match self.vm_receive_migration_step( &mut socket, &listener, state, &req, &receive_data_migration, ) { - Ok(next_state) => (Response::ok(), next_state), + Ok(next_state) => (Response::ok(), next_state, None), Err(err) => { warn!( "Migration aborted as migration command {:?} failed: {}", req.command(), err ); - (Response::error(), ReceiveMigrationState::Aborted) + (Response::error(), ReceiveMigrationState::Aborted, Some(err)) } }; state = new_state; assert_eq!(response.length(), 0); response.write_to(&mut socket)?; - } - if let ReceiveMigrationState::Aborted = state { + if maybe_error.is_some() { + break Err(maybe_error.take().unwrap()); + } else if state.finished() { + break Ok(state); + } + }; + + if matches!(res, Err(_) | Ok(ReceiveMigrationState::Aborted)) { event!("vm", "migration-receive-failed"); self.vm = MaybeVmOwnership::None; self.vm_config = None; - return Err(MigratableError::CompleteMigration(anyhow!( - "Migration was aborted" - ))); + return match res { + Ok(_) => Err(MigratableError::CompleteMigration(anyhow!( + "Migration was aborted by sender" + ))), + Err(e) => Err(MigratableError::CompleteMigration(e.into())), + }; } event!("vm", "migration-receive-finished");
Ok(()) From dc00fe1ebb9f024d5d929df048808725c51f7726 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Fri, 27 Feb 2026 09:47:02 +0100 Subject: [PATCH 076/178] vmm: add context to MigrateReceive-Error Like the previous commit, this commit adds context to errors. Some errors weren't easily convertible into anyhow::Error. I did not touch these places, as I want to keep the changes small and uncontroversial. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 94 ++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 42 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index e270b2b923..a9bab074ea 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1012,9 +1012,10 @@ impl Vmm { ) -> std::result::Result<(u32, File), MigratableError> { if let SocketStream::Unix(unix_socket) = socket { let mut buf = [0u8; 4]; - let (_, file) = unix_socket.recv_with_fd(&mut buf).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error receiving slot from socket: {e}")) - })?; + let (_, file) = unix_socket + .recv_with_fd(&mut buf) + .context("Error receiving slot from socket") + .map_err(MigratableError::MigrateReceive)?; file.ok_or_else(|| MigratableError::MigrateReceive(anyhow!("Failed to receive socket"))) .map(|file| (u32::from_le_bytes(buf), file)) @@ -1211,10 +1212,9 @@ impl Vmm { .read_exact(&mut data) .map_err(MigratableError::MigrateSocket)?; - let vm_migration_config: VmMigrationConfig = - serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising config: {e}")) - })?; + let vm_migration_config: VmMigrationConfig = serde_json::from_slice(&data) + .context("Error deserialising config") + .map_err(MigratableError::MigrateReceive)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] self.vm_check_cpuid_compatibility( @@ -1266,9 +1266,11 @@ impl Vmm { } } - self.console_info = Some(pre_create_console_devices(self).map_err(|e| { - 
MigratableError::MigrateReceive(anyhow!("Error creating console devices: {e:?}")) - })?); + self.console_info = Some( + pre_create_console_devices(self) + .context("Error creating console devices") + .map_err(MigratableError::MigrateReceive)?, + ); if self .vm_config @@ -1279,9 +1281,9 @@ impl Vmm { .landlock_enable { let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - apply_landlock(&mut config).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error applying landlock: {e:?}")) - })?; + apply_landlock(&mut config) + .context("Error applying landlock") + .map_err(MigratableError::MigrateReceive)?; } let vm = Vm::create_hypervisor_vm( @@ -1314,11 +1316,8 @@ impl Vmm { Some(&vm_migration_config.memory_manager_data), existing_memory_files, ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error creating MemoryManager from snapshot: {e:?}" - )) - })?; + .context("Error creating MemoryManager from snapshot") + .map_err(MigratableError::MigrateReceive)?; Ok(memory_manager) } @@ -1347,27 +1346,35 @@ impl Vmm { socket .read_exact(&mut data) .map_err(MigratableError::MigrateSocket)?; - serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising snapshot: {e}")) - }) + serde_json::from_slice(&data) + .context("Error deserialising snapshot") + .map_err(MigratableError::MigrateReceive) })?; - let exit_evt = self.exit_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning exit EventFd: {e}")) - })?; - let reset_evt = self.reset_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning reset EventFd: {e}")) - })?; + let exit_evt = self + .exit_evt + .try_clone() + .context("Error cloning exit EventFd") + .map_err(MigratableError::MigrateReceive)?; + let reset_evt = self + .reset_evt + .try_clone() + .context("Error cloning reset EventFd") + .map_err(MigratableError::MigrateReceive)?; let guest_exit_evt = 
self.guest_exit_evt.try_clone().map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error cloning guest exit EventFd: {e}")) })?; #[cfg(feature = "guest_debug")] - let debug_evt = self.vm_debug_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning debug EventFd: {e}")) - })?; - let activate_evt = self.activate_evt.try_clone().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error cloning activate EventFd: {e}")) - })?; + let debug_evt = self + .vm_debug_evt + .try_clone() + .context("Error clonung debug EventFd") + .map_err(MigratableError::MigrateReceive)?; + let activate_evt = self + .activate_evt + .try_clone() + .context("Error cloning activate EventFd") + .map_err(MigratableError::MigrateReceive)?; let (vm, restore_duration) = measure_ok(|| { #[cfg(not(target_arch = "riscv64"))] @@ -1891,15 +1898,12 @@ impl Vmm { amx: vm_config.cpus.features.amx, }, ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {e:?}")) - })? + .context("Error generating common cpuid") + .map_err(MigratableError::MigrateReceive)? 
}; - arch::CpuidFeatureEntry::check_cpuid_compatibility(src_vm_cpuid, dest_cpuid).map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error checking cpu feature compatibility': {e:?}" - )) - }) + arch::CpuidFeatureEntry::check_cpuid_compatibility(src_vm_cpuid, dest_cpuid) + .context("Error checking cpu feature compatibility") + .map_err(MigratableError::MigrateReceive) } fn vm_restore( @@ -2975,7 +2979,13 @@ impl RequestHandler for Vmm { receive_data_migration.tls_dir.as_deref(), )?; // Accept the connection and get the socket - let mut socket = listener.accept()?; + let mut socket = listener + .accept() + .context("Failed to accept migration connection") + .map_err(|e| { + warn!("{e}"); + MigratableError::MigrateReceive(e) + })?; event!("vm", "migration-receive-started"); From fbb64c8a46bb8899d9b8ab90088c0563afb736cc Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 17 Feb 2026 14:14:27 +0100 Subject: [PATCH 077/178] vmm: add post-migration event to VmSnapshot During live migration, VM ownership is moved away from the VMM thread. To preserve guest-triggered reboot and shutdown lifecycle intent across that ownership handover, we need a small lifecycle marker to travel with the migrated VM state. This change introduces `PostMigrationLifecycleEvent` and stores it in `VmSnapshot` with `#[serde(default)]` for backward compatibility. `Vm::snapshot()` now serializes the marker, and VM construction from a snapshot restores it. No control-loop behavior is changed in this commit. This is only the data model/plumbing needed by follow-up commits. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/vm.rs | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 3886d216b8..9ac2360e6a 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -100,11 +100,9 @@ use crate::landlock::LandlockError; use crate::memory_manager::{ Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, }; -#[cfg(target_arch = "x86_64")] -use crate::migration::get_vm_snapshot; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::migration::url_to_file; -use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; +use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, get_vm_snapshot, url_to_path}; #[cfg(all( feature = "kvm", feature = "sev_snp", @@ -545,6 +543,13 @@ pub struct Vm { stop_on_boot: bool, load_payload_handle: Option>>, vcpu_throttler: ThrottleThreadHandle, + post_migration_lifecycle_event: Option, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum PostMigrationLifecycleEvent { + VmReboot, + VmmShutdown, } impl Vm { @@ -703,6 +708,15 @@ impl Vm { } else { VmState::Created }; + let post_migration_lifecycle_event = snapshot + .as_ref() + .map(|snapshot| { + get_vm_snapshot(snapshot) + .map(|vm_snapshot| vm_snapshot.post_migration_lifecycle_event) + .map_err(Error::Restore) + }) + .transpose()? + .flatten(); // TODO we could also spawn the thread when a migration with auto-converge starts. // Probably this is the better design. 
@@ -728,6 +742,7 @@ impl Vm { stop_on_boot, load_payload_handle, vcpu_throttler, + post_migration_lifecycle_event, }) } @@ -1369,6 +1384,17 @@ impl Vm { self.vcpu_throttler.shutdown(); } + pub fn set_post_migration_lifecycle_event( + &mut self, + event: Option, + ) { + self.post_migration_lifecycle_event = event; + } + + pub fn post_migration_lifecycle_event(&self) -> Option { + self.post_migration_lifecycle_event + } + #[allow(clippy::too_many_arguments)] pub fn new( vm_config: Arc>, @@ -3338,6 +3364,8 @@ impl Pausable for Vm { #[derive(Serialize, Deserialize)] pub struct VmSnapshot { + #[serde(default)] + pub post_migration_lifecycle_event: Option, #[cfg(target_arch = "x86_64")] pub clock: Option, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -3391,6 +3419,7 @@ impl Snapshottable for Vm { }; let vm_snapshot_state = VmSnapshot { + post_migration_lifecycle_event: self.post_migration_lifecycle_event(), #[cfg(target_arch = "x86_64")] clock: self.saved_clock, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] From 23414bb4b063f4b1e329556937b31e7b57fe9f61 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 17 Feb 2026 14:42:39 +0100 Subject: [PATCH 078/178] vmm: postpone reset/exit during migration While a live migration is running, the migration worker owns the VM and the VMM control loop cannot execute vm_reboot()/vmm_shutdown() directly. Guest-triggered reset/exit events in that window currently hit VmMigrating and fail. This change makes the control loop consume reset/exit as before, but when ownership is `MaybeVmOwnership::Migration` it postpones a post-migration lifecycle intent instead of calling lifecycle handlers directly. The postponed state is first-event-wins and is cleared when a new send migration starts, preventing stale lifecycle intent from leaking between migrations. This commit only introduces source-side postponing behavior and does not yet apply or replay the postponed event. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/lib.rs | 88 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index a9bab074ea..c1bc5ebf88 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -72,14 +72,12 @@ use crate::coredump::GuestDebuggable; use crate::cpu::IS_IN_SHUTDOWN; use crate::landlock::Landlock; use crate::memory_manager::MemoryManager; -#[cfg(all(feature = "kvm", target_arch = "x86_64"))] -use crate::migration::get_vm_snapshot; -use crate::migration::{recv_vm_config, recv_vm_state}; +use crate::migration::{get_vm_snapshot, recv_vm_config, recv_vm_state}; use crate::migration_transport::{ ReceiveAdditionalConnections, ReceiveListener, SendAdditionalConnections, SocketStream, }; use crate::seccomp_filters::{Thread, get_seccomp_filter}; -use crate::vm::{Error as VmError, Vm, VmState}; +use crate::vm::{Error as VmError, PostMigrationLifecycleEvent, Vm, VmState}; use crate::vm_config::{ DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, MemoryZoneConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, @@ -643,6 +641,8 @@ struct MigrationWorker { vm: Vm, check_migration_evt: EventFd, config: VmSendMigrationData, + // Shared with main VMM thread + postponed_lifecycle_event: Arc>>, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: Arc, } @@ -669,6 +669,7 @@ impl MigrationWorker { #[cfg(all(feature = "kvm", target_arch = "x86_64"))] self.hypervisor.as_ref(), &self.config, + self.postponed_lifecycle_event.as_ref(), ) .inspect(|_| event!("vm", "migration-finished")) .inspect_err(|_| { @@ -772,6 +773,8 @@ pub struct Vmm { console_info: Option, no_shutdown: bool, check_migration_evt: EventFd, + postponed_lifecycle_event: Arc>>, + received_postponed_lifecycle_event: Option, /// Handle to the [`MigrationWorker`] thread. /// /// The handle will return the [`Vm`] back in any case. 
Further, the underlying error (if any) is returned. @@ -1002,10 +1005,25 @@ impl Vmm { console_info: None, no_shutdown, check_migration_evt, + postponed_lifecycle_event: Arc::new(Mutex::new(None)), + received_postponed_lifecycle_event: None, migration_thread_handle: None, }) } + fn postpone_lifecycle_event_during_migration(&self, event: PostMigrationLifecycleEvent) { + let mut postponed_event = self.postponed_lifecycle_event.lock().unwrap(); + if postponed_event.is_none() { + *postponed_event = Some(event); + info!("Postponed post-migration lifecycle event: {event:?}"); + } + } + + fn clear_postponed_lifecycle_event(&self) { + let mut postponed_event = self.postponed_lifecycle_event.lock().unwrap(); + *postponed_event = None; + } + /// Try to receive a file descriptor from a socket. Returns the slot number and the file descriptor. fn vm_receive_memory_fd( socket: &mut SocketStream, @@ -1169,11 +1187,35 @@ impl Vmm { // The unwrap is safe, because the state machine makes sure we called // vm_receive_state before, which creates the VM. let vm = self.vm.vm_mut().unwrap(); - let (_, resume_duration) = measure_ok(|| vm.resume())?; - debug!( - "Migration (incoming): resume:{}ms", - resume_duration.as_millis() - ); + + // We are on the control-loop thread handling an API request, so + // there is no concurrent access from other VMM or migration + // threads. The VM is in the Paused state, which permits both + // the Running transition (resume) and the Shutdown transition (reboot / exit) + // triggered via the eventfds below. 
+ match self.received_postponed_lifecycle_event { + None => { + let (_, resume_duration) = measure_ok(|| vm.resume())?; + debug!( + "Migration (incoming): resume:{}ms", + resume_duration.as_millis() + ); + } + Some(PostMigrationLifecycleEvent::VmReboot) => { + self.reset_evt + .write(1) + .context("Failed writing reset eventfd after migration") + .map_err(MigratableError::MigrateReceive)?; + } + Some(PostMigrationLifecycleEvent::VmmShutdown) => { + self.exit_evt + .write(1) + .context("Failed writing exit eventfd after migration") + .map_err(MigratableError::MigrateReceive)?; + } + } + self.received_postponed_lifecycle_event = None; + // This logs the downtime without the final memory delta, so // it does not reflect the actual downtime. While we could // pass along the timestamp from when the VM was paused, @@ -1351,6 +1393,11 @@ impl Vmm { .map_err(MigratableError::MigrateReceive) })?; + let vm_snapshot = get_vm_snapshot(&snapshot) + .context("Failed extracting VM snapshot data") + .map_err(MigratableError::MigrateReceive)?; + self.received_postponed_lifecycle_event = vm_snapshot.post_migration_lifecycle_event; + let exit_evt = self .exit_evt .try_clone() @@ -1688,6 +1735,7 @@ impl Vmm { #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: &dyn hypervisor::Hypervisor, send_data_migration: &VmSendMigrationData, + postponed_lifecycle_event: &Mutex>, ) -> result::Result<(), MigratableError> { // State machine that is updated with more context as we progress. let mut ctx = OngoingMigrationContext::new(); @@ -1831,6 +1879,7 @@ impl Vmm { } // Capture snapshot and send it + vm.set_post_migration_lifecycle_event(*postponed_lifecycle_event.lock().unwrap()); let (vm_snapshot, snapshot_duration) = measure_ok(|| vm.snapshot())?; let (_, send_snapshot_duration) = measure_ok(|| migration_transport::send_state(&mut socket, &vm_snapshot))?; @@ -2104,6 +2153,13 @@ impl Vmm { info!("VM exit event"); // Consume the event. 
self.exit_evt.read().map_err(Error::EventFdRead)?; + // Workaround for guest-induced shutdown during a live-migration. + if matches!(self.vm, MaybeVmOwnership::Migration) { + self.postpone_lifecycle_event_during_migration( + PostMigrationLifecycleEvent::VmmShutdown, + ); + continue; + } self.vmm_shutdown().map_err(Error::VmmShutdown)?; break 'outer; @@ -2112,6 +2168,13 @@ impl Vmm { info!("VM reset event"); // Consume the event. self.reset_evt.read().map_err(Error::EventFdRead)?; + // Workaround for guest-induced shutdown during a live-migration. + if matches!(self.vm, MaybeVmOwnership::Migration) { + self.postpone_lifecycle_event_during_migration( + PostMigrationLifecycleEvent::VmReboot, + ); + continue; + } self.vm_reboot().map_err(Error::VmReboot)?; } EpollDispatch::GuestExit => { @@ -2965,6 +3028,9 @@ impl RequestHandler for Vmm { .context("Invalid receive migration configuration") .map_err(MigratableError::MigrateReceive)?; + // Prevent stale lifecycle intent from a previous failed receive attempt. + self.received_postponed_lifecycle_event = None; + info!( "Receiving migration: receiver_url={},tls={},net_fds={:?}, tcp_url={:?}, zones={:?}", receive_data_migration.receiver_url, @@ -3074,6 +3140,9 @@ impl RequestHandler for Vmm { send_data_migration.timeout_strategy ); + // New migration attempt: clear postponed lifecycle from any previous run. 
+ self.clear_postponed_lifecycle_event(); + if !self .vm_config .as_ref() @@ -3141,6 +3210,7 @@ impl RequestHandler for Vmm { vm, check_migration_evt: self.check_migration_evt.try_clone().unwrap(), config: send_data_migration, + postponed_lifecycle_event: self.postponed_lifecycle_event.clone(), #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: self.hypervisor.clone(), }; From fe3709fd46ffed12d620b4a25281de3bd6411947 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 17 Feb 2026 15:00:59 +0100 Subject: [PATCH 079/178] vmm: migration: switch downtime on postponed event When a lifecycle event like reset or shutdown is postponed during pre-copy, switch to downtime at the next iteration boundary. This keeps the current iteration send intact and then transitions into the existing graceful downtime path (`stop_vcpu_throttling()`, `pause()`, final transfer, snapshot). To keep behavior deterministic on source migration failure, replay the postponed lifecycle event locally after ownership is returned: - VmReboot -> reset_evt - VmmShutdown -> exit_evt Postponed state is cleared on both success and failure paths to avoid stale state across migrations. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/lib.rs | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index c1bc5ebf88..5457b6c523 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1019,6 +1019,10 @@ impl Vmm { } } + fn current_postponed_lifecycle_event(&self) -> Option { + *self.postponed_lifecycle_event.lock().unwrap() + } + fn clear_postponed_lifecycle_event(&self) { let mut postponed_event = self.postponed_lifecycle_event.lock().unwrap(); *postponed_event = None; @@ -1487,6 +1491,7 @@ impl Vmm { ctx: &mut MemoryMigrationContext, is_converged: impl Fn(&MemoryMigrationContext) -> result::Result, mem_send: &mut SendAdditionalConnections, + postponed_lifecycle_event: &Mutex>, ) -> result::Result { let update_migration_progress = |s: &mut MemoryMigrationContext, vm: &Vm| { let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); @@ -1572,6 +1577,16 @@ impl Vmm { // Increment iteration last: This way we ensure that the logging // above matches the actual iteration. ctx.iteration += 1; + + let event = *postponed_lifecycle_event.lock().unwrap(); + if let Some(event) = event { + info!( + "Lifecycle event postponed during migration ({event:?}), switching to downtime phase early" + ); + // The current iteration has already been sent, therefore no extra range + // needs to be carried into the final transfer batch. 
+ break Ok(MemoryRangeTable::default()); + } } } @@ -1681,6 +1696,7 @@ impl Vmm { send_data_migration: &VmSendMigrationData, mem_send: &mut SendAdditionalConnections, ctx: &mut OngoingMigrationContext, + postponed_lifecycle_event: &Mutex>, ) -> result::Result<(), MigratableError> { let mut mem_ctx = MemoryMigrationContext::new(); @@ -1692,6 +1708,7 @@ impl Vmm { // We bind send_data_migration to the callback |ctx| Self::is_precopy_converged(ctx, send_data_migration), mem_send, + postponed_lifecycle_event, )?; let downtime_begin = Instant::now(); // End throttle thread @@ -1845,6 +1862,7 @@ impl Vmm { send_data_migration, &mut mem_send, &mut ctx, + postponed_lifecycle_event, ) .inspect_err(|_| { // Calling cleanup multiple times is fine, thus here we just make sure @@ -2102,7 +2120,24 @@ impl Vmm { // Give VMM back control. self.vm = MaybeVmOwnership::Vmm(vm); - + if let Some(event) = self.current_postponed_lifecycle_event() { + match event { + PostMigrationLifecycleEvent::VmReboot => { + self.reset_evt + .write(1) + .context("Failed replaying reset event after failed migration") + .inspect_err(|write_err| error!("{write_err}")) + .ok(); + } + PostMigrationLifecycleEvent::VmmShutdown => { + self.exit_evt + .write(1) + .context("Failed replaying shutdown event after failed migration") + .inspect_err(|write_err| error!("{write_err}")) + .ok(); + } + } + } // Update migration progress snapshot { let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); @@ -2112,6 +2147,7 @@ impl Vmm { } } } + self.clear_postponed_lifecycle_event(); } fn control_loop( From 838ea2e71b500a2d7cc20c2dd59c6abbb79ce178 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 17 Feb 2026 16:17:48 +0100 Subject: [PATCH 080/178] vmm: streamline migration failure cleanup All cleanups are already performed in one central place, Vmm::check_migration_res(). 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 57 +++++++++++++++----------------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 5457b6c523..cd7ded8f02 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -648,46 +648,23 @@ struct MigrationWorker { } impl MigrationWorker { - /// Performs any final cleanup after failed live migrations. - /// - /// Helper for [`Self::migrate`]. - fn migrate_error_cleanup(&mut self) -> result::Result<(), MigratableError> { - // Stop logging dirty pages only for non-local migrations - if !self.config.local { - self.vm.stop_dirty_log()?; - } - - Ok(()) - } - - /// Migrate and cleanup. - fn migrate(&mut self) -> result::Result<(), MigratableError> { - debug!("start sending migration"); + /// Perform the migration and communicate with the [`Vmm`] thread. + fn run(mut self) -> MigrationThreadOut { + debug!("migration thread is starting"); event!("vm", "migration-started"); - Vmm::send_migration( + + let res = Vmm::send_migration( &mut self.vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.as_ref(), + self.hypervisor.as_ref(), &self.config, self.postponed_lifecycle_event.as_ref(), ) - .inspect(|_| event!("vm", "migration-finished")) - .inspect_err(|_| { - event!("vm", "migration-failed"); - let e = self.migrate_error_cleanup(); - if let Err(e) = e { - error!("Failed to clean up after a failed live migration. VM might keep running but in an odd or possibly slowed-down state: {e}"); - } - })?; - - Ok(()) - } - - /// Perform the migration and communicate with the [`Vmm`] thread. 
- fn run(mut self) -> MigrationThreadOut { - debug!("migration thread is starting"); - - let res = self.migrate().inspect_err(|e| error!("migrate error: {e}")); + .inspect(|_| event!("vm", "migration-finished")) + .inspect_err(|e| { + event!("vm", "migration-failed"); + error!("migrate error: {e}"); + }); // Notify VMM thread to get migration result by joining this thread. self.check_migration_evt.write(1).unwrap(); @@ -2095,7 +2072,7 @@ impl Vmm { } Err(e) => { error!("Migration failed: {e}"); - + // We don't fail the VMM here, it just continues running its VM. // If the failure happened very late in the migration path, the VM might already be // stopped. We resume it to ensure proper operation. // @@ -2105,11 +2082,6 @@ impl Vmm { match vm.resume() { Ok(_) => { info!("Resumed VM successfully after failed migration"); - - // Ensure full VM performance. The operation is idempotent. - let _ = vm.stop_dirty_log().inspect_err(|e| { - warn!("Failed stopping dirty log after resuming VM: {e} - VM performance might be slower than usual"); - }); } Err(e) => { error!("Failed resuming VM after failed migration: {e}"); @@ -2118,6 +2090,11 @@ impl Vmm { } } + // Ensure full VM performance. The operation is idempotent. + let _ = vm.stop_dirty_log().inspect_err(|e| { + warn!("Failed stopping dirty log after resuming VM: {e} - VM performance might be slower than usual"); + }); + // Give VMM back control. 
self.vm = MaybeVmOwnership::Vmm(vm); if let Some(event) = self.current_postponed_lifecycle_event() { From 1688e55c3acdb219a8109745230e09ede827d53c Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 19 Feb 2026 15:37:21 +0100 Subject: [PATCH 081/178] vmm: api: add VmCancelMigration action On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- fuzz/fuzz_targets/http_api.rs | 4 ++++ vm-migration/src/lib.rs | 3 +++ vmm/src/api/mod.rs | 44 +++++++++++++++++++++++++++++++++++ vmm/src/lib.rs | 13 +++++++++++ 4 files changed, 64 insertions(+) diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 6e1c15feae..6a7844b97d 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -308,6 +308,10 @@ impl RequestHandler for StubApiRequestHandler { fn vm_migration_progress(&mut self) -> Option { None } + + fn vm_cancel_migration(&mut self) -> Result<(), MigratableError> { + Ok(()) + } } fn http_receiver_stub(exit_evt: EventFd, api_evt: EventFd, api_receiver: Receiver) { diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 38e42b29c3..952eba6a3e 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -86,6 +86,9 @@ pub enum MigratableError { #[error("Failed to retrieve dirty ranges for migratable component")] DirtyLog(#[source] anyhow::Error), + #[error("Failed to cancel migration")] + CancelMigration(#[source] anyhow::Error), + #[error("Failed to start migration for migratable component")] StartMigration(#[source] anyhow::Error), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 363dd8737f..184bede0e3 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -205,6 +205,10 @@ pub enum ApiError { #[error("Error starting migration sender")] VmSendMigration(#[source] MigratableError), + /// Error cancelling migration + #[error("Error cancelling migration")] + VmCancelMigration(#[source] MigratableError), + /// Error triggering power button #[error("Error 
triggering power button")] VmPowerButton(#[source] VmError), @@ -778,11 +782,18 @@ pub trait RequestHandler { receive_data_migration: VmReceiveMigrationData, ) -> Result<(), MigratableError>; + /// Dispatches the migration. fn vm_send_migration( &mut self, send_data_migration: VmSendMigrationData, ) -> Result<(), MigratableError>; + /// Triggers a migration cancellation. + /// + /// The cancellation is not guaranteed to succeed, as the migration may have + /// succeeded already. + fn vm_cancel_migration(&mut self) -> Result<(), MigratableError>; + fn vm_nmi(&mut self) -> Result<(), VmError>; /// Returns the progress of the currently active migration or any previous @@ -1533,6 +1544,39 @@ impl ApiAction for VmReceiveMigration { } } +pub struct VmCancelMigration; + +impl ApiAction for VmCancelMigration { + type RequestBody = (); + type ResponseBody = Option; + + fn request(&self, data: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmCancelMigration {data:?}"); + + let response = vmm + .vm_cancel_migration() + .map_err(ApiError::VmCancelMigration) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmRemoveDevice; impl ApiAction for VmRemoveDevice { diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index cd7ded8f02..4a85bcfe46 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3240,6 +3240,19 @@ impl RequestHandler for Vmm { Ok(()) } + fn vm_cancel_migration(&mut self) -> result::Result<(), MigratableError> { + match self.vm { + MaybeVmOwnership::Migration => (), + _ => { + return Err(MigratableError::CancelMigration(anyhow!( + "There is no ongoing migration" + ))); + } + } + + todo!() + } + fn vm_migration_progress(&mut self) -> Option { 
// We explicitly do not check here for `is VM running?` to always // enable querying the state of the last failed migration. From 9fa49fd84607f3edadc7e062376401ee974eb545 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 19 Feb 2026 15:39:31 +0100 Subject: [PATCH 082/178] vmm: http api: add VmCancelMigration action On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/api/http/http_endpoint.rs | 7 ++++--- vmm/src/api/http/mod.rs | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 07c2941256..ad58424319 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -48,9 +48,9 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, - VmConfig, VmCounters, VmDelete, VmMigrationProgress, VmNmi, VmPause, VmPowerButton, VmReboot, - VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmCancelMigration, VmConfig, VmCounters, VmDelete, VmMigrationProgress, VmNmi, VmPause, + VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, + VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -417,6 +417,7 @@ vm_action_put_handler!(VmPause); vm_action_put_handler!(VmResume); vm_action_put_handler!(VmPowerButton); vm_action_put_handler!(VmNmi); +vm_action_put_handler!(VmCancelMigration); vm_action_put_handler_body!(VmAddDevice); vm_action_put_handler_body!(AddDisk); diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 1f7a1e7cf7..3ed9a4c2b4 100644 
--- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -29,8 +29,8 @@ use self::http_endpoint::{VmActionHandler, VmCreate, VmInfo, VmmPing, VmmShutdow use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, - VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, - VmMigrationProgress, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, + VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmCounters, + VmDelete, VmMigrationProgress, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; @@ -278,6 +278,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.send-migration"), Box::new(VmActionHandler::new(&VmSendMigration)), ); + r.routes.insert( + endpoint!("/vm.cancel-migration"), + Box::new(VmActionHandler::new(&VmCancelMigration)), + ); r.routes.insert( endpoint!("/vm.shutdown"), Box::new(VmActionHandler::new(&VmShutdown)), From 2a2f4e618d8ea1c3f13c62ff6f10b26b7e9aad0b Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 19 Feb 2026 15:41:54 +0100 Subject: [PATCH 083/178] vmm: migration: add handle wrapper for MigrationWorker This will allow us to implement the cancel mechanism in the next commit. Further, we remove the runtime panic if the migration thread can't be spawned. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 127 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 103 insertions(+), 24 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 4a85bcfe46..62c64b44ae 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -634,6 +634,34 @@ impl VmmVersionInfo { } } +/// Handle for the [`MigrationWorker`] thread. 
+struct MigrationWorkerHandle { + // Option to take the inner handle + handle: Option>, +} + +impl MigrationWorkerHandle { + /// Joins the thread and returns the result. + fn join(mut self) -> MigrationThreadOut { + self.handle + .take() + .expect("should have thread") + .join() + .expect("should join migration thread gracefully") + } +} + +impl Drop for MigrationWorkerHandle { + fn drop(&mut self) { + if let Some(handle) = self.handle.take() { + warn!("Migration thread wasn't cleaned up explicitly via join()"); + handle + .join() + .expect("should join migration thread gracefully"); + } + } +} + /// Abstraction for the thread controlling and performing the live migration. /// /// The migration thread also takes ownership of the [`Vm`] from the [`Vmm`]. @@ -676,6 +704,57 @@ impl MigrationWorker { migration_cfg: self.config, } } + + #[expect(clippy::result_large_err)] + fn spawn( + vm: Vm, + check_migration_evt: EventFd, + config: VmSendMigrationData, + postponed_lifecycle_event: Arc>>, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: Arc< + dyn hypervisor::Hypervisor, + >, + ) -> result::Result { + let worker = MigrationWorker { + vm, + check_migration_evt, + config, + postponed_lifecycle_event, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor, + }; + + // Cumbersome but we need this to take a value from the worker when + // thread spawning failed. Ownership of the worker is either by the + // thread or this function. + let worker = Arc::new(Mutex::new(Some(worker))); + let thread_worker = worker.clone(); + + let inner_handle = thread::Builder::new() + .name("migration".into()) + .spawn(move || { + thread_worker + .lock() + .unwrap() + .take() + .expect("migration worker should only be taken once") + .run() + }) + .context("should spawn migration thread") + .map_err(|e| { + // Get the VM back from the worker. 
+ let worker = worker + .lock() + .unwrap() + .take() + .expect("migration worker should remain available on spawn failure"); + (worker.vm, MigratableError::MigrateSend(e)) + })?; + + Ok(MigrationWorkerHandle { + handle: Some(inner_handle), + }) + } } pub struct VmmThreadHandle { @@ -753,9 +832,7 @@ pub struct Vmm { postponed_lifecycle_event: Arc>>, received_postponed_lifecycle_event: Option, /// Handle to the [`MigrationWorker`] thread. - /// - /// The handle will return the [`Vm`] back in any case. Further, the underlying error (if any) is returned. - migration_thread_handle: Option>, + migration_thread_handle: Option, } /// Just a wrapper for the data that goes into @@ -2045,8 +2122,7 @@ impl Vmm { .migration_thread_handle .take() .expect("should have thread") - .join() - .expect("should have joined"); + .join(); match migration_res { Ok(()) => { @@ -3217,26 +3293,29 @@ impl RequestHandler for Vmm { )); } - // Start migration thread - { - let worker = MigrationWorker { - vm, - check_migration_evt: self.check_migration_evt.try_clone().unwrap(), - config: send_data_migration, - postponed_lifecycle_event: self.postponed_lifecycle_event.clone(), - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - hypervisor: self.hypervisor.clone(), - }; + // When spawning the thread fails, the VM keeps running normally. + let migration_worker = MigrationWorker::spawn( + vm, + self.check_migration_evt.try_clone().unwrap(), + send_data_migration, + self.postponed_lifecycle_event.clone(), + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.clone(), + ) + .map_err(|(vm, e)| { + self.vm = MaybeVmOwnership::Vmm(vm); + + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_failed(&e); + + e + })?; + let old = self.migration_thread_handle.replace(migration_worker); + // If this fails, we messed up the thread lifecycle management. 
+ debug_assert!(old.is_none()); - self.migration_thread_handle = Some( - thread::Builder::new() - .name("migration".into()) - .spawn(move || worker.run()) - // For upstreaming, we should simply continue and return an - // error when this fails. For our PoC, this is fine. - .unwrap(), - ); - } Ok(()) } From c70dd779055e04be9b8fc9999abf49faf97adb48 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 27 Feb 2026 16:23:11 +0100 Subject: [PATCH 084/178] vmm: add try_resume_vm() helper By extracting this functionality from the Err() match arm into a helper function, we can reuse it in the next commit. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 99 ++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 51 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 62c64b44ae..603cfa86c0 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2115,7 +2115,7 @@ impl Vmm { // At this point, the thread must be finished. // If we fail here, we have lost anyway. Just panic. let MigrationThreadOut { - mut vm, + vm, migration_res, migration_cfg, } = self @@ -2124,6 +2124,52 @@ impl Vmm { .expect("should have thread") .join(); + let mut try_resume_vm = |mut vm: Vm| { + // If the failure happened very late in the migration path, the VM might already be + // stopped. We resume it to ensure proper operation. + // + // Cloud Hypervisor only supports migration of running VMs, therefore it cannot + // happen that we resume a previously paused VM. + if vm.get_state() == VmState::Paused { + match vm.resume() { + Ok(_) => { + info!("Resumed VM successfully after failed migration"); + } + Err(e) => { + error!("Failed resuming VM after failed migration: {e}"); + self.exit_evt.write(1).unwrap(); + } + } + } + + // Ensure full VM performance. The operation is idempotent. 
+ let _ = vm.stop_dirty_log().inspect_err(|e| { + warn!("Failed stopping dirty log after resuming VM: {e} - VM performance might be slower than usual"); + }); + + // Give VMM back control. + self.vm = MaybeVmOwnership::Vmm(vm); + + if let Some(event) = self.current_postponed_lifecycle_event() { + match event { + PostMigrationLifecycleEvent::VmReboot => { + self.reset_evt + .write(1) + .context("Failed replaying reset event after failed migration") + .inspect_err(|write_err| error!("{write_err}")) + .ok(); + } + PostMigrationLifecycleEvent::VmmShutdown => { + self.exit_evt + .write(1) + .context("Failed replaying shutdown event after failed migration") + .inspect_err(|write_err| error!("{write_err}")) + .ok(); + } + } + } + }; + match migration_res { Ok(()) => { self.vm = MaybeVmOwnership::None; @@ -2148,56 +2194,7 @@ impl Vmm { } Err(e) => { error!("Migration failed: {e}"); - // We don't fail the VMM here, it just continues running its VM. - // If the failure happened very late in the migration path, the VM might already be - // stopped. We resume it to ensure proper operation. - // - // Cloud Hypervisor only supports migration of running VMs, therefore it cannot - // happen that we resume a previously paused VM. - if vm.get_state() == VmState::Paused { - match vm.resume() { - Ok(_) => { - info!("Resumed VM successfully after failed migration"); - } - Err(e) => { - error!("Failed resuming VM after failed migration: {e}"); - self.exit_evt.write(1).unwrap(); - } - } - } - - // Ensure full VM performance. The operation is idempotent. - let _ = vm.stop_dirty_log().inspect_err(|e| { - warn!("Failed stopping dirty log after resuming VM: {e} - VM performance might be slower than usual"); - }); - - // Give VMM back control. 
- self.vm = MaybeVmOwnership::Vmm(vm); - if let Some(event) = self.current_postponed_lifecycle_event() { - match event { - PostMigrationLifecycleEvent::VmReboot => { - self.reset_evt - .write(1) - .context("Failed replaying reset event after failed migration") - .inspect_err(|write_err| error!("{write_err}")) - .ok(); - } - PostMigrationLifecycleEvent::VmmShutdown => { - self.exit_evt - .write(1) - .context("Failed replaying shutdown event after failed migration") - .inspect_err(|write_err| error!("{write_err}")) - .ok(); - } - } - } - // Update migration progress snapshot - { - let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); - lock.as_mut() - .expect("live migration should be ongoing") - .mark_as_failed(&e); - } + try_resume_vm(vm); } } self.clear_postponed_lifecycle_event(); From 806b51b96a0121136605a1edf13efc857f309cb0 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 2 Mar 2026 14:37:13 +0100 Subject: [PATCH 085/178] vmm: migration: actually support cancellation Introduce the minimal functionality required to support canceling a live migration. This establishes the basic mechanism, while subsequent commits will reduce the latency of cancellation so that migrations stop more promptly. Management software can and should wait for the migration to be actually canceled via the vm.migration-progress endpoint. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/lib.rs | 3 ++ vmm/src/lib.rs | 61 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 952eba6a3e..55ef543fcb 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -95,6 +95,9 @@ pub enum MigratableError { #[error("Failed to complete migration for migratable component")] CompleteMigration(#[source] anyhow::Error), + #[error("Failed to continue the migration as it was cancelled")] + Cancelled, + #[error("Failed to release a disk lock")] UnlockError(#[source] anyhow::Error), diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 603cfa86c0..6e1ab7401f 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -19,6 +19,7 @@ use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::panic::AssertUnwindSafe; #[cfg(feature = "guest_debug")] use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; use std::sync::{Arc, Mutex}; use std::thread::JoinHandle; @@ -638,9 +639,20 @@ impl VmmVersionInfo { struct MigrationWorkerHandle { // Option to take the inner handle handle: Option>, + cancel: Arc, } impl MigrationWorkerHandle { + /// Cancels the migration. + /// + /// Note that timing issues in the very last phase of the migration allow a + /// tiny window in which the migration succeeds before it can be canceled. + fn trigger_cancellation(&self) { + info!("Will cancel ongoing live-migration"); + self.cancel.store(true, Ordering::Release); + // we just dispatch here and do not block for the migration thread + } + /// Joins the thread and returns the result. 
fn join(mut self) -> MigrationThreadOut { self.handle @@ -673,6 +685,7 @@ struct MigrationWorker { postponed_lifecycle_event: Arc>>, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: Arc, + cancel: Arc, } impl MigrationWorker { @@ -687,6 +700,7 @@ impl MigrationWorker { self.hypervisor.as_ref(), &self.config, self.postponed_lifecycle_event.as_ref(), + self.cancel.clone(), ) .inspect(|_| event!("vm", "migration-finished")) .inspect_err(|e| { @@ -715,6 +729,7 @@ impl MigrationWorker { dyn hypervisor::Hypervisor, >, ) -> result::Result { + let cancel = Arc::new(AtomicBool::new(false)); let worker = MigrationWorker { vm, check_migration_evt, @@ -722,6 +737,7 @@ impl MigrationWorker { postponed_lifecycle_event, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor, + cancel: cancel.clone(), }; // Cumbersome but we need this to take a value from the worker when @@ -753,6 +769,7 @@ impl MigrationWorker { Ok(MigrationWorkerHandle { handle: Some(inner_handle), + cancel, }) } } @@ -1807,9 +1824,19 @@ impl Vmm { hypervisor: &dyn hypervisor::Hypervisor, send_data_migration: &VmSendMigrationData, postponed_lifecycle_event: &Mutex>, + cancel: Arc, ) -> result::Result<(), MigratableError> { // State machine that is updated with more context as we progress. let mut ctx = OngoingMigrationContext::new(); + let return_if_cancelled_cb = move |socket: &mut SocketStream| { + if cancel.load(Ordering::Acquire) { + info!("Cancelling migration now"); + Request::abandon().write_to(socket)?; + Err(MigratableError::Cancelled) + } else { + Ok(()) + } + }; // Set up the socket connection let mut socket = if send_data_migration.connections.get() > 1 && !send_data_migration.local @@ -1929,6 +1956,10 @@ impl Vmm { mem_send.cleanup()?; } + // Very last cancellation check. After this, we release the disk locks and we can't cancel + // anymore. 
+ return_if_cancelled_cb(&mut socket)?; + // Update migration progress snapshot { let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); @@ -2192,9 +2223,30 @@ impl Vmm { } } } + Err(MigratableError::Cancelled) => { + error!("Migration cancelled"); + event!("vm", "migration-cancelled"); + try_resume_vm(vm); + + // Update migration progress snapshot + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_cancelled(); + } + } Err(e) => { error!("Migration failed: {e}"); try_resume_vm(vm); + + // Update migration progress snapshot + { + let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); + lock.as_mut() + .expect("live migration should be ongoing") + .mark_as_failed(&e); + } } } self.clear_postponed_lifecycle_event(); @@ -3326,7 +3378,14 @@ impl RequestHandler for Vmm { } } - todo!() + let handle = self + .migration_thread_handle + .as_ref() + .expect("should have handle"); + // We just dispatch the cancellation. + handle.trigger_cancellation(); + + Ok(()) } fn vm_migration_progress(&mut self) -> Option { From a9253633d47bf2b03f7f4917a4a1de421073aa95 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 2 Mar 2026 14:51:22 +0100 Subject: [PATCH 086/178] vmm: migration: early cancellation (add more checks) This adds multiple points in the migration path where the migration can be canceled early. The pre-copy phase is not addressed here and will follow in the next commit! 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 18 ++++++++++++++++-- vmm/src/migration_transport.rs | 9 ++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 6e1ab7401f..96a95b2f5d 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1563,6 +1563,7 @@ impl Vmm { is_converged: impl Fn(&MemoryMigrationContext) -> result::Result, mem_send: &mut SendAdditionalConnections, postponed_lifecycle_event: &Mutex>, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> result::Result<(), MigratableError>, ) -> result::Result { let update_migration_progress = |s: &mut MemoryMigrationContext, vm: &Vm| { let mut lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); @@ -1595,6 +1596,8 @@ impl Vmm { }; loop { + return_if_cancelled_cb(socket)?; + // todo: check if auto-converge is enabled at all? if Self::can_increase_autoconverge_step(ctx) && vm.throttle_percent() < AUTO_CONVERGE_MAX @@ -1630,7 +1633,7 @@ impl Vmm { // Send the current dirty pages let transfer_begin = Instant::now(); - mem_send.send_memory(iteration_table, socket)?; + mem_send.send_memory(iteration_table, socket, return_if_cancelled_cb)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); @@ -1768,6 +1771,7 @@ impl Vmm { mem_send: &mut SendAdditionalConnections, ctx: &mut OngoingMigrationContext, postponed_lifecycle_event: &Mutex>, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> result::Result<(), MigratableError>, ) -> result::Result<(), MigratableError> { let mut mem_ctx = MemoryMigrationContext::new(); @@ -1780,6 +1784,7 @@ impl Vmm { |ctx| Self::is_precopy_converged(ctx, send_data_migration), mem_send, postponed_lifecycle_event, + return_if_cancelled_cb, )?; let downtime_begin = Instant::now(); // End throttle thread @@ -1799,7 +1804,7 @@ impl Vmm { mem_ctx.update_metrics_before_transfer(iteration_begin, &final_table); let 
transfer_begin = Instant::now(); - mem_send.send_memory(final_table, socket)?; + mem_send.send_memory(final_table, socket, return_if_cancelled_cb)?; let transfer_duration = transfer_begin.elapsed(); mem_ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); mem_ctx.iteration += 1; @@ -1859,6 +1864,8 @@ impl Vmm { MigratableError::MigrateSend(anyhow!("Error starting migration (got bad response)")), )?; + return_if_cancelled_cb(&mut socket)?; + // Send config let vm_config = vm.get_config(); #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -1887,6 +1894,8 @@ impl Vmm { .map_err(MigratableError::MigrateSend)? }; + return_if_cancelled_cb(&mut socket)?; + if send_data_migration.local { match &mut socket { SocketStream::Unix(unix_socket) => { @@ -1906,6 +1915,8 @@ impl Vmm { } } + return_if_cancelled_cb(&mut socket)?; + let vm_migration_config = VmMigrationConfig { vm_config, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -1914,6 +1925,8 @@ impl Vmm { }; migration_transport::send_config(&mut socket, &vm_migration_config)?; + return_if_cancelled_cb(&mut socket)?; + // Let every Migratable object know about the migration being started. vm.start_migration()?; @@ -1944,6 +1957,7 @@ impl Vmm { &mut mem_send, &mut ctx, postponed_lifecycle_event, + &return_if_cancelled_cb, ) .inspect_err(|_| { // Calling cleanup multiple times is fine, thus here we just make sure diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 56da4fd953..c6d8b09a68 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -691,6 +691,7 @@ impl SendAdditionalConnections { &mut self, table: MemoryRangeTable, socket: &mut SocketStream, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> Result<(), MigratableError>, ) -> Result { if table.regions().is_empty() { return Ok(false); @@ -698,13 +699,19 @@ impl SendAdditionalConnections { // If we use only one connection, we send the memory directly. 
if self.threads.is_empty() { - send_memory_ranges(&self.guest_memory, &table, socket)?; + for chunk in table.partition(Self::CHUNK_SIZE) { + return_if_cancelled_cb(socket) + .inspect_err(|_| info!("cancelling migration during memory iteration"))?; + send_memory_ranges(&self.guest_memory, &chunk, socket)?; + } return Ok(true); } // The chunk size is chosen to be big enough so that even very fast links need some // milliseconds to send it. for chunk in table.partition(Self::CHUNK_SIZE) { + return_if_cancelled_cb(socket) + .inspect_err(|_| info!("cancelling migration during memory iteration"))?; self.send_chunk(chunk)?; } From a2abcf55f4b05242311c9b1bb01cd49283ef1b39 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 5 Mar 2026 09:26:37 +0100 Subject: [PATCH 087/178] vmm: migration cancellation: integrate into TCP threads This commit reduces the delay in stopping memory transmission during the pre-copy phase when a migration is canceled. The cancellation handling is implemented in SendAdditionalConnections, which coordinates all memory transmission threads. In the cloud-hypervisor log, we can now even see that the cancellation happens fairly quickly when in pre-copy phase with multiple connections: ``` cloud-hypervisor: 11.263371s: INFO:vmm/src/api/mod.rs:1147 -- API request event: VmCancelMigration () cloud-hypervisor: 11.263401s: INFO:vmm/src/lib.rs:805 -- Will cancel ongoing live-migration cloud-hypervisor: 11.263416s: INFO:vmm/src/lib.rs:2662 -- Cancelling migration now cloud-hypervisor: 11.263568s: INFO:vmm/src/lib.rs:1667 -- Sending disconnect message to channels cloud-hypervisor: 11.263594s: INFO:vmm/src/lib.rs:1677 -- Waiting for threads to finish cloud-hypervisor: 11.302994s: INFO:vmm/src/lib.rs:1531 -- Sent 128 MiB via additional connection. cloud-hypervisor: 11.303037s: INFO:vmm/src/lib.rs:1531 -- Sent 64 MiB via additional connection. cloud-hypervisor: 11.303062s: INFO:vmm/src/lib.rs:1531 -- Sent 64 MiB via additional connection. 
cloud-hypervisor: 11.303066s: INFO:vmm/src/lib.rs:1531 -- Sent 64 MiB via additional connection. cloud-hypervisor: 11.303354s: INFO:vmm/src/lib.rs:1681 -- Threads finished cloud-hypervisor: 11.303672s: ERROR:vmm/src/lib.rs:858 -- migrate error: Failed to continue the migration as it was cancelled ``` On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/migration_transport.rs | 55 ++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index c6d8b09a68..073ff8ba98 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -12,7 +12,9 @@ use std::os::unix::net::{UnixListener, UnixStream}; use std::path::{Path, PathBuf}; use std::result::Result; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::mpsc::{Receiver, Sender, SyncSender, TrySendError, channel, sync_channel}; +use std::sync::mpsc::{ + Receiver, Sender, SyncSender, TryRecvError, TrySendError, channel, sync_channel, +}; use std::sync::{Arc, Mutex}; use std::thread; use std::time::Duration; @@ -525,6 +527,9 @@ pub(crate) struct SendAdditionalConnections { /// this using this flag. Only the main thread checks this variable, the worker /// threads will be stopped during cleanup. worker_error: Arc, + /// Externally triggered cancellation. Workers drain queued memory messages + /// after this is set and wait for the disconnect message. + external_cancel: Arc, /// After the main thread sent all memory chunks to the sender threads, it waits /// until one of the workers notifies it. Either because an error occurred, or /// because they arrived at the gate. 
@@ -567,6 +572,7 @@ impl SendAdditionalConnections { let buffer_size = Self::BUFFERED_REQUESTS_PER_THREAD * configured_connections as usize; let (message_tx, message_rx) = sync_channel::(buffer_size); let worker_error = Arc::new(AtomicBool::new(false)); + let external_cancel = Arc::new(AtomicBool::new(false)); let (notify_tx, notify_rx) = channel::(); // If one connection is configured, we don't have to create any additional threads. @@ -577,6 +583,7 @@ impl SendAdditionalConnections { threads, message_tx, worker_error, + external_cancel, notify_rx, }); } @@ -590,6 +597,7 @@ impl SendAdditionalConnections { let guest_memory = guest_memory.clone(); let message_rx = message_rx.clone(); let worker_error = worker_error.clone(); + let external_cancel = external_cancel.clone(); let notify_tx = notify_tx.clone(); let thread = thread::Builder::new() @@ -600,6 +608,7 @@ impl SendAdditionalConnections { &guest_memory, &message_rx, &worker_error, + &external_cancel, ¬ify_tx, ) }) @@ -622,6 +631,7 @@ impl SendAdditionalConnections { threads, message_tx, worker_error, + external_cancel, notify_rx, }) } @@ -631,6 +641,7 @@ impl SendAdditionalConnections { guest_memory: &GuestMemoryAtomic, message_rx: &Mutex>, worker_error: &AtomicBool, + external_cancel: &AtomicBool, notify_tx: &Sender, ) -> Result<(), MigratableError> { info!("Spawned thread to send VM memory."); @@ -655,6 +666,10 @@ impl SendAdditionalConnections { })?; match message { SendMemoryThreadMessage::Memory(table) => { + if external_cancel.load(Ordering::Acquire) { + continue; + } + send_memory_ranges(guest_memory, &table, socket) .inspect_err(|_| { worker_error.store(true, Ordering::Relaxed); @@ -710,12 +725,14 @@ impl SendAdditionalConnections { // The chunk size is chosen to be big enough so that even very fast links need some // milliseconds to send it. 
for chunk in table.partition(Self::CHUNK_SIZE) { - return_if_cancelled_cb(socket) - .inspect_err(|_| info!("cancelling migration during memory iteration"))?; + return_if_cancelled_cb(socket).inspect_err(|_| { + info!("cancelling migration during memory iteration"); + self.external_cancel.store(true, Ordering::Release); + })?; self.send_chunk(chunk)?; } - self.wait_for_pending_data()?; + self.wait_for_pending_data(socket, return_if_cancelled_cb)?; Ok(true) } @@ -750,7 +767,11 @@ impl SendAdditionalConnections { } /// Wait until all data that is in-flight has actually been sent and acknowledged. - fn wait_for_pending_data(&mut self) -> Result<(), MigratableError> { + fn wait_for_pending_data( + &mut self, + socket: &mut SocketStream, + return_if_cancelled_cb: &impl Fn(&mut SocketStream) -> Result<(), MigratableError>, + ) -> Result<(), MigratableError> { let gate = Arc::new(Gate::new()); for _ in 0..self.threads.len() { self.message_tx @@ -764,26 +785,34 @@ impl SendAdditionalConnections { // they arrived at the gate. let mut seen_threads = 0; loop { - match self - .notify_rx - .recv() - .context("Error receiving message from workers") - .map_err(MigratableError::MigrateSend)? - { - SendMemoryThreadNotify::Gate => { + return_if_cancelled_cb(socket).inspect_err(|_| { + gate.open(); + self.external_cancel.store(true, Ordering::Release); + })?; + + thread::sleep(Duration::from_millis(2)); + + match self.notify_rx.try_recv() { + Ok(SendMemoryThreadNotify::Gate) => { seen_threads += 1; if seen_threads == self.threads.len() { gate.open(); return Ok(()); } } - SendMemoryThreadNotify::Error => { + Ok(SendMemoryThreadNotify::Error) => { // If an error occurred in one of the worker threads, we open // the gate to make sure that no thread hangs. After that, we // receive the error from Self::cleanup() and return it. 
gate.open(); return self.cleanup(); } + Err(TryRecvError::Empty) => {} + Err(TryRecvError::Disconnected) => { + return Err(MigratableError::MigrateSend(anyhow!( + "All senders died unexpectedly." + ))); + } } } } From 5283a0c9013044bbf45eb90efc15de2ab35510de Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 19 Feb 2026 15:40:48 +0100 Subject: [PATCH 088/178] ch-remote: add cancel-migration On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/src/bin/ch-remote.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index 7d7a75abd6..3e288f5ca4 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -626,6 +626,8 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu )?; simple_api_command(socket, "PUT", "create", Some(&data)).map_err(Error::HttpApiClient) } + Some("cancel-migration") => simple_api_command(socket, "PUT", "cancel-migration", None) + .map_err(Error::HttpApiClient), _ => unreachable!(), } } @@ -1121,6 +1123,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .about("Add vsock device") .arg(Arg::new("vsock_config").index(1).help(VsockConfig::SYNTAX)), Command::new("boot").about("Boot a created VM"), + Command::new("cancel-migration").about("Cancel any ongoing migration"), Command::new("coredump") .about("Create a coredump from VM") .arg(Arg::new("coredump_config").index(1).help("")), From 0aebb7b3d70f84327d070283ce6d19446b8642e1 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 30 Mar 2026 15:53:27 +0200 Subject: [PATCH 089/178] vmm: migration: properly print error chain on failure TL;DR: Major improvement for developers to see why a migration failed The error model of Cloud Hypervisor leverages std::error::Error and its source() function respectively to build chains of errors. 
This helps to spot the root cause and see how a certain operation failed throughout the stack. Therefore, the Display::fmt() impl of every error only prints what failed "on its level", but never appends ": {0}", i.e., the underlying error's Display::fmt(). ch-remote and cloud-hypervisor can print these error chains nicely when they exit - in the case of a migration, we do not exit however. The solution is to print the error chain there as well to get more meaningful errors. As an example: old: ``` cloud-hypervisor: 62.820480s: INFO:vmm/src/lib.rs:3382 -- VM migration check event cloud-hypervisor: 62.820576s: ERROR:vmm/src/lib.rs:3138 -- Migration failed: Failed to send migratable component snapshot ``` new: ``` cloud-hypervisor: 62.820480s: INFO:vmm/src/lib.rs:3382 -- VM migration check event cloud-hypervisor: 15.311401s: ERROR:vmm/src/lib.rs:3110 -- Migration failed with the following chain of errors: cloud-hypervisor: 15.311412s: ERROR:vmm/src/lib.rs:3118 -- 0: Failed to send migratable component snapshot cloud-hypervisor: 15.311422s: ERROR:vmm/src/lib.rs:3118 -- 1: Error connecting to TCP socket cloud-hypervisor: 15.311442s: ERROR:vmm/src/lib.rs:3118 -- 2: Connection refused (os error 111) cloud-hypervisor: 62.820618s: INFO:event_monitor/src/lib.rs:113 -- Event: source = vm event = migration-failed ``` On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 96a95b2f5d..efa5f62a97 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2150,6 +2150,37 @@ impl Vmm { self.vm.vm_mut().unwrap().restore() } + /// Prints the error chain to `error!()` level, akin to user-facing errors when Cloud Hypervisor + /// or ch-remote fail. + // TODO: For upstreaming, we should unify this with the code-paths used by ch-remote and + // Cloud Hypervisor on failure. 
+ fn log_print_error_chain<'a>(top_error: &'a (dyn std::error::Error + 'static)) { + // Print chain of errors + if top_error.source().is_none() { + error!("Migration failed with the following error:"); + error!(" {top_error}"); + } else { + // In cli_print_error_chain(), we also print the + // ::fmt() as oneliner so that we can see all + // properties. As we use anyhow errors in the migration path, + // Debug::fmt() is not helpful for us as it doesn't print the + // underlying properties (like the default Debug::fmt() impl would + // do). Instead, it would print a trace itself, which is not what + // we want to do here. + + error!("Migration failed with the following chain of errors:"); + std::iter::successors(Some(top_error), |sub_error| { + // Dereference necessary to mitigate rustc compiler bug. + // See + (*sub_error).source() + }) + .enumerate() + .for_each(|(level, error)| { + error!(" {level}: {error}"); + }); + } + } + /// Checks the migration result. /// /// This should be called when the migration thread indicated a state @@ -2251,7 +2282,7 @@ impl Vmm { } } Err(e) => { - error!("Migration failed: {e}"); + Self::log_print_error_chain(&e); try_resume_vm(vm); // Update migration progress snapshot From 0795c73d024b4078fd826b4697f79408c9e815a7 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Mar 2026 16:37:50 +0100 Subject: [PATCH 090/178] vmm: defer guest exit during migration Move the migration workaround from the shared Exit path to GuestExit and rename the postponed shutdown event to `VmShutdown`. With --no-shutdown, guest-triggered shutdown must keep following the guest exit path even when it is delayed until after migration completion. This preserves the distinction between guest shutdown and real VMM exit conditions. The existing fatal exit path stays unchanged. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/lib.rs | 26 +++++++++++++------------- vmm/src/vm.rs | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index efa5f62a97..7411285647 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1282,10 +1282,10 @@ impl Vmm { .context("Failed writing reset eventfd after migration") .map_err(MigratableError::MigrateReceive)?; } - Some(PostMigrationLifecycleEvent::VmmShutdown) => { - self.exit_evt + Some(PostMigrationLifecycleEvent::VmShutdown) => { + self.guest_exit_evt .write(1) - .context("Failed writing exit eventfd after migration") + .context("Failed writing guest exit eventfd after migration") .map_err(MigratableError::MigrateReceive)?; } } @@ -2235,10 +2235,10 @@ impl Vmm { .inspect_err(|write_err| error!("{write_err}")) .ok(); } - PostMigrationLifecycleEvent::VmmShutdown => { - self.exit_evt + PostMigrationLifecycleEvent::VmShutdown => { + self.guest_exit_evt .write(1) - .context("Failed replaying shutdown event after failed migration") + .context("Failed replaying guest exit event after failed migration") .inspect_err(|write_err| error!("{write_err}")) .ok(); } @@ -2336,13 +2336,6 @@ impl Vmm { info!("VM exit event"); // Consume the event. self.exit_evt.read().map_err(Error::EventFdRead)?; - // Workaround for guest-induced shutdown during a live-migration. - if matches!(self.vm, MaybeVmOwnership::Migration) { - self.postpone_lifecycle_event_during_migration( - PostMigrationLifecycleEvent::VmmShutdown, - ); - continue; - } self.vmm_shutdown().map_err(Error::VmmShutdown)?; break 'outer; @@ -2363,6 +2356,13 @@ impl Vmm { EpollDispatch::GuestExit => { info!("VM guest exit event"); self.guest_exit_evt.read().map_err(Error::EventFdRead)?; + // Workaround for guest-induced shutdown during a live-migration. 
+ if matches!(self.vm, MaybeVmOwnership::Migration) { + self.postpone_lifecycle_event_during_migration( + PostMigrationLifecycleEvent::VmShutdown, + ); + continue; + } if self.no_shutdown { self.vm_shutdown().map_err(Error::VmShutdown)?; } else { diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 9ac2360e6a..514f683487 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -549,7 +549,7 @@ pub struct Vm { #[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum PostMigrationLifecycleEvent { VmReboot, - VmmShutdown, + VmShutdown, } impl Vm { From 09c63995470dbce52335b0ee80ceead588bba38c Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 13 Mar 2026 06:38:29 +0100 Subject: [PATCH 091/178] vmm: reduce API event verbosity These are called very frequently by libvirt and spam the log. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/api/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 184bede0e3..7a0a24eb8b 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -40,7 +40,7 @@ use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use std::time::Duration; -use log::{debug, info}; +use log::{info, trace}; use micro_http::Body; use option_parser::{OptionParser, OptionParserError, Toggle}; use serde::{Deserialize, Serialize}; @@ -1277,7 +1277,7 @@ impl ApiAction for VmCounters { fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { - info!("API request event: VmCounters"); + trace!("API request event: VmCounters"); let response = vmm .vm_counters() @@ -1382,7 +1382,7 @@ impl ApiAction for VmInfo { fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { - debug!("API request event: VmInfo"); + trace!("API request event: VmInfo"); let response = vmm .vm_info() @@ -2010,7 +2010,7 @@ impl ApiAction for 
VmMigrationProgress { fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { - debug!("API request event: VmMigrationProgress"); + trace!("API request event: VmMigrationProgress"); let snapshot = Ok(vmm.vm_migration_progress()); let response = snapshot From 8c86589f9f53f9aee91c4772c0a3f55e60d25e27 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 10 Mar 2026 14:57:49 +0100 Subject: [PATCH 092/178] vmm: keep virtio activation alive in migration Live migration can deadlock if the guest triggers a virtio device activation while the migration worker owns the VM. The failure shows up during boot and firmware, where the guest can reset and reinitialize virtio devices while precopy is running. In the failing case, the source log shows a pending virtio activation that never completes: 8.115833s _virtio-pci-net_0: Needs activation; returning barrier 8.115854s vmm/src/vm.rs:464 -- Waiting for barrier 24.875452s Entering downtime phase 24.875481s stopping vcpu throttling thread ... vCPU thread did not respond in 10ms to signal - retrying vCPU thread did not respond in 20ms to signal - retrying ... thread 'throttle-vcpu' (1029) panicked ... Pause(Error signalling vCPUs: Timeout when waiting for signal to be acknowledged) The vCPU blocks on the activation barrier and never reaches the normal pause checkpoint. Later, migration enters downtime and stops the vCPU throttle thread. In the failing case, that thread is still inside a CpuManager::pause() call, which waits for every vCPU to acknowledge the signal. The blocked vCPU never does, so the pause times out. The VMM already receives ActivateVirtioDevices events during migration, but it only drains pending activations when self.vm is in MaybeVmOwnership::Vmm. Once vm_send_migration() moves the Vm into the migration worker, self.vm becomes MaybeVmOwnership::Migration and the event handler no longer has a path to call activate_virtio_devices(). 
Fix this by storing the DeviceManager inside MaybeVmOwnership::Migration. This keeps just enough state on the VMM thread to drain pending virtio activations while the migration worker owns the Vm. The barrier logic stays unchanged. The VMM now releases the same activation barrier during migration that it already released before migration started. This keeps the guest from getting stuck in the activation wait and lets the later pause succeed. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/lib.rs | 128 +++++++++++++++++++++++++++++++------------------ vmm/src/vm.rs | 4 ++ 2 files changed, 85 insertions(+), 47 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 7411285647..1e2dadc32d 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -21,7 +21,7 @@ use std::panic::AssertUnwindSafe; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, Weak}; use std::thread::JoinHandle; use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] @@ -71,6 +71,7 @@ use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; use crate::coredump::GuestDebuggable; #[cfg(feature = "kvm")] use crate::cpu::IS_IN_SHUTDOWN; +use crate::device_manager::DeviceManager; use crate::landlock::Landlock; use crate::memory_manager::MemoryManager; use crate::migration::{get_vm_snapshot, recv_vm_config, recv_vm_state}; @@ -781,13 +782,40 @@ pub struct VmmThreadHandle { pub http_api_handle: Option, } +struct MigrationVmState { + // The migration worker owns the VM during migration, so this should stop + // working once that VM has been dropped. 
+ device_manager: Weak>, +} + +impl MigrationVmState { + fn new(vm: &Vm) -> Self { + Self { + device_manager: Arc::downgrade(vm.device_manager()), + } + } + + fn activate_virtio_devices(&self) -> result::Result<(), VmError> { + self.device_manager + .upgrade() + .expect("device manager should remain alive during migration") + .lock() + .unwrap() + .activate_virtio_devices() + .map_err(VmError::ActivateVirtioDevices) + } +} + /// Describes the current ownership of a running VM. #[allow(clippy::large_enum_variant)] -pub enum MaybeVmOwnership { +enum MaybeVmOwnership { /// The VMM holds the ownership of the VM. Vmm(Vm), /// The VM is temporarily blocked by the current ongoing migration. - Migration, + /// + /// We still keep the device manager reachable so the epoll thread can + /// drain pending virtio activations while the migration worker owns the VM. + Migration(MigrationVmState), /// No VM is running. None, } @@ -798,13 +826,12 @@ impl MaybeVmOwnership { /// # Panics /// This method panics if `self` is not [`Self::Vmm`]. fn take_vm_for_migration(&mut self) -> Vm { - if !matches!(self, Self::Vmm(_)) { - panic!("should only be called when a migration can start"); - } - - match mem::replace(self, Self::Migration) { - MaybeVmOwnership::Vmm(vm) => vm, - _ => unreachable!(), + match mem::replace(self, Self::None) { + Self::Vmm(vm) => { + *self = Self::Migration(MigrationVmState::new(&vm)); + vm + } + _ => panic!("should only be called when a migration can start"), } } @@ -2079,7 +2106,7 @@ impl Vmm { prefault: bool, memory_restore_mode: MemoryRestoreMode, ) -> std::result::Result<(), VmError> { - if matches!(self.vm, MaybeVmOwnership::Migration) { + if matches!(self.vm, MaybeVmOwnership::Migration(_)) { return Err(VmError::VmMigrating); } @@ -2345,7 +2372,7 @@ impl Vmm { // Consume the event. self.reset_evt.read().map_err(Error::EventFdRead)?; // Workaround for guest-induced shutdown during a live-migration. 
- if matches!(self.vm, MaybeVmOwnership::Migration) { + if matches!(self.vm, MaybeVmOwnership::Migration(_)) { self.postpone_lifecycle_event_during_migration( PostMigrationLifecycleEvent::VmReboot, ); @@ -2357,7 +2384,7 @@ impl Vmm { info!("VM guest exit event"); self.guest_exit_evt.read().map_err(Error::EventFdRead)?; // Workaround for guest-induced shutdown during a live-migration. - if matches!(self.vm, MaybeVmOwnership::Migration) { + if matches!(self.vm, MaybeVmOwnership::Migration(_)) { self.postpone_lifecycle_event_during_migration( PostMigrationLifecycleEvent::VmShutdown, ); @@ -2371,11 +2398,18 @@ impl Vmm { } } EpollDispatch::ActivateVirtioDevices => { - if let MaybeVmOwnership::Vmm(ref vm) = self.vm { - let count = self.activate_evt.read().map_err(Error::EventFdRead)?; - info!("Trying to activate pending virtio devices: count = {count}"); - vm.activate_virtio_devices() - .map_err(Error::ActivateVirtioDevices)?; + let count = self.activate_evt.read().map_err(Error::EventFdRead)?; + info!("Trying to activate pending virtio devices: count = {count}"); + match &self.vm { + MaybeVmOwnership::Vmm(vm) => vm + .activate_virtio_devices() + .map_err(Error::ActivateVirtioDevices)?, + MaybeVmOwnership::Migration(state) => { + state + .activate_virtio_devices() + .map_err(Error::ActivateVirtioDevices)?; + } + MaybeVmOwnership::None => {} } } EpollDispatch::Api => { @@ -2470,7 +2504,7 @@ impl RequestHandler for Vmm { info!("Booting VM"); event!("vm", "booting"); - if matches!(self.vm, MaybeVmOwnership::Migration) { + if matches!(self.vm, MaybeVmOwnership::Migration(_)) { return Err(VmError::VmMigrating); } @@ -2535,7 +2569,7 @@ impl RequestHandler for Vmm { fn vm_pause(&mut self) -> result::Result<(), VmError> { match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => vm.pause().map_err(VmError::Pause), - MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, MaybeVmOwnership::None => 
Err(VmError::VmNotRunning)?, } } @@ -2543,7 +2577,7 @@ impl RequestHandler for Vmm { fn vm_resume(&mut self) -> result::Result<(), VmError> { match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => vm.resume().map_err(VmError::Resume), - MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } @@ -2560,7 +2594,7 @@ impl RequestHandler for Vmm { .map_err(VmError::SnapshotSend) }) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } @@ -2568,7 +2602,7 @@ impl RequestHandler for Vmm { fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> result::Result<(), VmError> { match &self.vm { MaybeVmOwnership::Vmm(_vm) => return Err(VmError::VmAlreadyCreated), - MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), MaybeVmOwnership::None => (), } @@ -2636,7 +2670,7 @@ impl RequestHandler for Vmm { MaybeVmOwnership::Vmm(ref mut vm) => { vm.coredump(destination_url).map_err(VmError::Coredump) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } @@ -2644,7 +2678,7 @@ impl RequestHandler for Vmm { fn vm_shutdown(&mut self) -> result::Result<(), VmError> { let vm = match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => vm, - MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; // Drain console_info so that the FDs are not reused @@ -2665,7 +2699,7 @@ impl RequestHandler for Vmm { // First we stop the current VM let vm = match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => vm, - 
MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; let config = vm.get_config(); @@ -2739,7 +2773,7 @@ impl RequestHandler for Vmm { let state = match &self.vm { MaybeVmOwnership::Vmm(vm) => vm.get_state(), // TODO in theory one could live-migrate a non-running VM .. - MaybeVmOwnership::Migration => VmState::Running, + MaybeVmOwnership::Migration(_) => VmState::Running, MaybeVmOwnership::None => VmState::Created, }; @@ -2750,14 +2784,14 @@ impl RequestHandler for Vmm { memory_actual_size = memory_actual_size.saturating_sub(vm.balloon_size()); memory_actual_size += vm.virtio_mem_plugged_size(); } - MaybeVmOwnership::Migration => {} + MaybeVmOwnership::Migration(_) => {} MaybeVmOwnership::None => {} } let device_tree = match &self.vm { MaybeVmOwnership::Vmm(vm) => Some(vm.device_tree().lock().unwrap().clone()), // TODO we need to fix this - MaybeVmOwnership::Migration => None, + MaybeVmOwnership::Migration(_) => None, MaybeVmOwnership::None => None, }; @@ -2799,7 +2833,7 @@ impl RequestHandler for Vmm { MaybeVmOwnership::None => { self.vm_config = None; } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, } Ok(()) @@ -2829,7 +2863,7 @@ impl RequestHandler for Vmm { .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; Ok(()) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); if let Some(desired_vcpus) = desired_vcpus { @@ -2862,7 +2896,7 @@ impl RequestHandler for Vmm { Ok(()) } } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => Err(VmError::ResizeDisk), } } @@ -2876,7 +2910,7 @@ 
impl RequestHandler for Vmm { .inspect_err(|e| error!("Error when resizing zone: {e:?}"))?; Ok(()) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by setting the new desired ram. let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; @@ -2918,7 +2952,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -2950,7 +2984,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -2967,7 +3001,7 @@ impl RequestHandler for Vmm { .inspect_err(|e| error!("Error when removing device from the VM: {e:?}"))?; Ok(()) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { if let Some(ref config) = self.vm_config { let mut config = config.lock().unwrap(); @@ -3002,7 +3036,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. 
let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -3031,7 +3065,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -3069,7 +3103,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -3098,7 +3132,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -3127,7 +3161,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -3161,7 +3195,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => { // Update VmConfig by adding the new device. 
let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); @@ -3181,7 +3215,7 @@ impl RequestHandler for Vmm { .map(Some) .map_err(VmError::SerializeJson) } - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } @@ -3189,7 +3223,7 @@ impl RequestHandler for Vmm { fn vm_power_button(&mut self) -> result::Result<(), VmError> { match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => vm.power_button(), - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } @@ -3197,7 +3231,7 @@ impl RequestHandler for Vmm { fn vm_nmi(&mut self) -> result::Result<(), VmError> { match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => vm.nmi(), - MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } @@ -3303,7 +3337,7 @@ impl RequestHandler for Vmm { match self.vm { MaybeVmOwnership::Vmm(_) => (), - MaybeVmOwnership::Migration => { + MaybeVmOwnership::Migration(_) => { return Err(MigratableError::MigrateSend(anyhow!( "There is already an ongoing migration" ))); @@ -3415,7 +3449,7 @@ impl RequestHandler for Vmm { fn vm_cancel_migration(&mut self) -> result::Result<(), MigratableError> { match self.vm { - MaybeVmOwnership::Migration => (), + MaybeVmOwnership::Migration(_) => (), _ => { return Err(MigratableError::CancelMigration(anyhow!( "There is no ongoing migration" diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 514f683487..fa879810af 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -3146,6 +3146,10 @@ impl Vm { Ok(()) } + pub fn device_manager(&self) -> &Arc> { + &self.device_manager + } + pub fn activate_virtio_devices(&self) -> Result<()> { self.device_manager .lock() From 
0c041142667652389212c7beb927b2ed786c1e48 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 10:06:54 +0100 Subject: [PATCH 093/178] vm-migration: speed up volatile read and write We will use the KeepAliveStream also if the migration uses a single TCP connection from now on. We have seen timeouts if the VM has huge amounts of memory and prefaults the memory during migration. Thus we need the keep alive messages also in this case. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/keep_alive_stream.rs | 117 +++++++++++++++++++------- 1 file changed, 85 insertions(+), 32 deletions(-) diff --git a/vm-migration/src/keep_alive_stream.rs b/vm-migration/src/keep_alive_stream.rs index 821af88436..4b2ab8a33b 100644 --- a/vm-migration/src/keep_alive_stream.rs +++ b/vm-migration/src/keep_alive_stream.rs @@ -29,10 +29,10 @@ use crate::protocol::Request; // The messages that will be sent to the `KeepAliveWorker`. #[derive(Debug)] enum KeepAliveStreamMessage { - // Read `len` bytes from `stream`. - Read(usize /* len */), - // Write `buf` to `stream`. - Write(Vec /* buf */), + // Read `len` bytes into `buf` from `stream`. + Read { len: usize, buf: Vec }, + // Write `buf[..len]` to `stream`. + Write { len: usize, buf: Vec }, // Flush `stream`. Flush, // Stop listening for messages, i.e. stop the worker. @@ -45,7 +45,7 @@ enum KeepAliveStreamAnswer { // Result of reading from `stream`. Read(io::Result<(Vec, usize)>), // Result of writing to `stream`. - Write(io::Result), + Write(io::Result<(Vec, usize)>), // Result of flushing `stream`. 
Flush(io::Result<()>), } @@ -64,14 +64,19 @@ where Self { stream } } - pub fn read(&mut self, len: usize) -> io::Result<(Vec, usize)> { - let mut buf: Vec = vec![0u8; len]; - let n = Read::read(&mut self.stream, &mut buf)?; + pub fn read(&mut self, mut buf: Vec, len: usize) -> io::Result<(Vec, usize)> { + if buf.len() < len { + buf.resize(len, 0); + } + + let n = Read::read(&mut self.stream, &mut buf[..len])?; Ok((buf, n)) } - pub fn write(&mut self, buf: &[u8]) -> io::Result { - Write::write(&mut self.stream, buf) + pub fn write(&mut self, buf: Vec, len: usize) -> io::Result<(Vec, usize)> { + debug_assert!(len <= buf.len()); + let n = Write::write(&mut self.stream, &buf[..len])?; + Ok((buf, n)) } pub fn flush(&mut self) -> io::Result<()> { @@ -87,6 +92,10 @@ pub struct KeepAliveStream { message_tx: SyncSender, /// Used to receive answers from the worker. answer_rx: Receiver, + /// Scratch buffer that gets moved to/from the worker for reads. + read_buf: Vec, + /// Scratch buffer that gets moved to/from the worker for writes. + write_buf: Vec, } impl KeepAliveStream { @@ -106,9 +115,9 @@ impl KeepAliveStream { // The idea is to always send a keep alive message when this times out. match message_rx.recv_timeout(timeout) { Ok(message) => match message { - KeepAliveStreamMessage::Read(payload) => { + KeepAliveStreamMessage::Read { len, buf } => { if answer_tx - .send(KeepAliveStreamAnswer::Read(worker.read(payload))) + .send(KeepAliveStreamAnswer::Read(worker.read(buf, len))) .is_err() { // We simply break the loop and thus stop the thread if anything bad happens. 
@@ -116,9 +125,9 @@ impl KeepAliveStream { break; } } - KeepAliveStreamMessage::Write(payload) => { + KeepAliveStreamMessage::Write { len, buf } => { if answer_tx - .send(KeepAliveStreamAnswer::Write(worker.write(&payload))) + .send(KeepAliveStreamAnswer::Write(worker.write(buf, len))) .is_err() { break; @@ -147,6 +156,8 @@ impl KeepAliveStream { thread: Some(thread), message_tx, answer_rx, + read_buf: Vec::new(), + write_buf: Vec::new(), }) } } @@ -161,17 +172,22 @@ impl Drop for KeepAliveStream { } impl Read for KeepAliveStream { - fn read(&mut self, buf: &mut [u8]) -> io::Result { + fn read(&mut self, out_buf: &mut [u8]) -> io::Result { + let len = out_buf.len(); + // Move the buffer to avoid lifetime or ownership issues. + let read_buf = std::mem::take(&mut self.read_buf); + self.message_tx - .send(KeepAliveStreamMessage::Read(buf.len())) + .send(KeepAliveStreamMessage::Read { len, buf: read_buf }) .map_err(|e| { io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) })?; match self.answer_rx.recv() { Ok(KeepAliveStreamAnswer::Read(result)) => match result { - Ok((recv_buf, len)) => { - buf[..len].copy_from_slice(&recv_buf[..len]); + Ok((buf, len)) => { + self.read_buf = buf; + out_buf[..len].copy_from_slice(&self.read_buf[..len]); Ok(len) } Err(e) => Err(e), @@ -187,15 +203,33 @@ impl Read for KeepAliveStream { } impl Write for KeepAliveStream { - fn write(&mut self, buf: &[u8]) -> io::Result { + fn write(&mut self, in_buf: &[u8]) -> io::Result { + let len = in_buf.len(); + if self.write_buf.len() < len { + self.write_buf.resize(len, 0); + } + + self.write_buf[..len].copy_from_slice(in_buf); + // Move the buffer to avoid lifetime or ownership issues. 
+ let write_buf = std::mem::take(&mut self.write_buf); + self.message_tx - .send(KeepAliveStreamMessage::Write(Vec::from(buf))) + .send(KeepAliveStreamMessage::Write { + len, + buf: write_buf, + }) .map_err(|e| { io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) })?; match self.answer_rx.recv() { - Ok(KeepAliveStreamAnswer::Write(result)) => result, + Ok(KeepAliveStreamAnswer::Write(result)) => match result { + Ok((buf, len)) => { + self.write_buf = buf; + Ok(len) + } + Err(e) => Err(e), + }, Ok(a) => Err(io::Error::other(format!( "Received unexpected answer: {a:?}. This is most likely a bug!", ))), @@ -226,10 +260,14 @@ impl Write for KeepAliveStream { impl ReadVolatile for KeepAliveStream { fn read_volatile( &mut self, - buf: &mut VolatileSlice, + vs: &mut VolatileSlice, ) -> result::Result { + let len = vs.len(); + // Move the buffer to avoid lifetime or ownership issues. + let read_buf = std::mem::take(&mut self.read_buf); + self.message_tx - .send(KeepAliveStreamMessage::Read(buf.len())) + .send(KeepAliveStreamMessage::Read { len, buf: read_buf }) .map_err(|e| { io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) }) @@ -237,8 +275,9 @@ impl ReadVolatile for KeepAliveStream { match self.answer_rx.recv() { Ok(KeepAliveStreamAnswer::Read(result)) => match result { - Ok((recv_buf, len)) => { - buf.copy_from(&recv_buf[..len]); + Ok((buf, len)) => { + self.read_buf = buf; + vs.copy_from(&self.read_buf[..len]); Ok(len) } Err(e) => Err(VolatileMemoryError::IOError(e)), @@ -256,21 +295,35 @@ impl ReadVolatile for KeepAliveStream { impl WriteVolatile for KeepAliveStream { fn write_volatile( &mut self, - buf: &VolatileSlice, + vs: &VolatileSlice, ) -> result::Result { - let mut send_buf = vec![0u8; buf.len()]; - buf.copy_to(&mut send_buf); + let len = vs.len(); + if self.write_buf.len() < len { + self.write_buf.resize(len, 0); + } + + let len = vs.copy_to(&mut self.write_buf[..len]); + // Move the buffer to avoid lifetime 
or ownership issues. + let write_buf = std::mem::take(&mut self.write_buf); + self.message_tx - .send(KeepAliveStreamMessage::Write(send_buf)) + .send(KeepAliveStreamMessage::Write { + len, + buf: write_buf, + }) .map_err(|e| { io::Error::other(format!("Unable to send message to KeepAliveWorker: {e}")) }) .map_err(VolatileMemoryError::IOError)?; match self.answer_rx.recv() { - Ok(KeepAliveStreamAnswer::Write(result)) => { - result.map_err(VolatileMemoryError::IOError) - } + Ok(KeepAliveStreamAnswer::Write(result)) => match result { + Ok((buf, len)) => { + self.write_buf = buf; + Ok(len) + } + Err(e) => Err(VolatileMemoryError::IOError(e)), + }, Ok(a) => Err(VolatileMemoryError::IOError(io::Error::other(format!( "Received unexpected answer: {a:?}. This is most likely a bug!", )))), From dd4f596e9aa5097c74389a17f0ea808886cc48b6 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 10:39:20 +0100 Subject: [PATCH 094/178] vm-migration: Add AsFd for KeepAliveStream We will use the KeepAliveStream also on the receiver side of the live migration, thus it has to implement AsFd. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/keep_alive_stream.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/vm-migration/src/keep_alive_stream.rs b/vm-migration/src/keep_alive_stream.rs index 4b2ab8a33b..53cc020d46 100644 --- a/vm-migration/src/keep_alive_stream.rs +++ b/vm-migration/src/keep_alive_stream.rs @@ -4,6 +4,7 @@ // use std::io::{self, Read, Write}; +use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; use std::sync::mpsc::{Receiver, RecvTimeoutError, SyncSender, sync_channel}; use std::thread::JoinHandle; use std::time::Duration; @@ -17,8 +18,6 @@ use crate::protocol::Request; /// The `KeepAliveStream` is a stream that is intended to be used for the main /// connection of live migrations.
If the `KeepAliveStream` does not read or /// write often enough, it will send keep alive messages on the given stream. -/// The `KeepAliveStream` should not be used to send or receive memory, because -/// the `read_volatile()` and `write_volatile()` functions will be very slow. /// /// The `KeepAliveStream` is designed to be compatible with the `SocketStream` /// enum, and thus it should be really easy to use it. @@ -50,15 +49,13 @@ enum KeepAliveStreamAnswer { Flush(io::Result<()>), } -// The [`KeepAliveStream`] should only be used by the sender, not the receiver. -// Thus it doesn't have to implement `AsFd`. -struct KeepAliveWorker { +struct KeepAliveWorker { stream: S, } impl KeepAliveWorker where - S: Read + Write, + S: Read + Write + AsFd, { pub fn new(stream: S) -> Self { Self { stream } @@ -87,6 +84,8 @@ where pub struct KeepAliveStream { /// The `KeepAliveWorker`. thread: Option>, + /// Duplicated file descriptor for `AsFd`. + fd: OwnedFd, /// Used to send messages to the worker. message_tx: SyncSender, @@ -99,10 +98,12 @@ pub struct KeepAliveStream { } impl KeepAliveStream { - pub fn new( + pub fn new( stream: T, timeout: Duration, ) -> result::Result { + let fd = stream.as_fd().try_clone_to_owned()?; + // We want to block on send and on recv if nobody listens. Thus we set the bound to 0. 
let (message_tx, message_rx) = sync_channel::(0); let (answer_tx, answer_rx) = sync_channel::(0); @@ -154,6 +155,7 @@ impl KeepAliveStream { Ok(Self { thread: Some(thread), + fd, message_tx, answer_rx, read_buf: Vec::new(), @@ -171,6 +173,12 @@ impl Drop for KeepAliveStream { } } +impl AsFd for KeepAliveStream { + fn as_fd(&self) -> BorrowedFd<'_> { + self.fd.as_fd() + } +} + impl Read for KeepAliveStream { fn read(&mut self, out_buf: &mut [u8]) -> io::Result { let len = out_buf.len(); From 4db6bd3652df45c26b67060a4d44480854417b09 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 10:52:15 +0100 Subject: [PATCH 095/178] vmm: differentiate timeouts by reader/writer, not sender/receiver This is important when we wrap the receiver socket into the KeepAliveStream, because we want readers to wait longer than senders, and we want readers to wait long enough to see keep alive messages. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 50 ++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 073ff8ba98..531e3a4107 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -41,15 +41,40 @@ use crate::{GuestMemoryMmap, VmMigrationConfig}; /// receiver side. pub(crate) const MAX_MIGRATION_CONNECTIONS: u32 = 128; -const RECEIVE_MIGRATION_SOCKET_TIMEOUT: Duration = Duration::from_secs(10); -const SEND_MIGRATION_SOCKET_TIMEOUT: Duration = Duration::from_secs(5); +/// The time a writer may block on a socket until it throws an error. +/// +/// Also the interval at which the [`KeepAliveStream`] sends keep alive messages. +/// +/// # Relation with [`MIGRATION_READ_TIMEOUT_DURATION`] +/// +/// This timeout has to be smaller than [`MIGRATION_READ_TIMEOUT_DURATION`], +/// otherwise spurious timeouts may happen. 
+const MIGRATION_WRITE_TIMEOUT_DURATION: Duration = Duration::from_secs(5); -fn set_migration_socket_timeouts(socket: &TcpStream, timeout: Duration) -> anyhow::Result<()> { +/// The time a reader may block on a socket until it throws an error. +/// +/// # Relation with [`MIGRATION_WRITE_TIMEOUT_DURATION`] +/// +/// This timeout has to be larger than [`MIGRATION_WRITE_TIMEOUT_DURATION`], +/// otherwise spurious timeouts may happen. +const MIGRATION_READ_TIMEOUT_DURATION: Duration = { + let migration_read_timeout_duration = Duration::from_secs(10); + + // This timeout has to be larger than [`MIGRATION_WRITE_TIMEOUT_DURATION`], + // otherwise spurious timeouts may happen. + assert!( + MIGRATION_WRITE_TIMEOUT_DURATION.as_millis() < migration_read_timeout_duration.as_millis(), + "MIGRATION_WRITE_TIMEOUT_DURATION must be smaller than MIGRATION_READ_TIMEOUT_DURATION", + ); + migration_read_timeout_duration +}; + +fn set_migration_socket_timeouts(socket: &TcpStream) -> anyhow::Result<()> { socket - .set_read_timeout(Some(timeout)) + .set_read_timeout(Some(MIGRATION_READ_TIMEOUT_DURATION)) .context("Error setting read timeout on TCP socket")?; socket - .set_write_timeout(Some(timeout)) + .set_write_timeout(Some(MIGRATION_WRITE_TIMEOUT_DURATION)) .context("Error setting write timeout on TCP socket")?; Ok(()) } @@ -67,11 +92,10 @@ impl ReceiveListener { pub(crate) fn accept(&mut self) -> Result { match self { ReceiveListener::Tcp(listener) => { - let (socket, _) = accept_with_timeout(listener, RECEIVE_MIGRATION_SOCKET_TIMEOUT) + let (socket, _) = accept_with_timeout(listener, MIGRATION_READ_TIMEOUT_DURATION) .context("Failed to accept TCP migration connection") .map_err(MigratableError::MigrateReceive)?; - set_migration_socket_timeouts(&socket, RECEIVE_MIGRATION_SOCKET_TIMEOUT) - .map_err(MigratableError::MigrateReceive)?; + set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; Ok(SocketStream::Tcp(socket)) } @@ -81,11 +105,10 @@ impl 
ReceiveListener { .context("Failed to accept Unix migration connection") .map_err(MigratableError::MigrateReceive), ReceiveListener::Tls(listener, config) => { - let (socket, _) = accept_with_timeout(listener, RECEIVE_MIGRATION_SOCKET_TIMEOUT) + let (socket, _) = accept_with_timeout(listener, MIGRATION_READ_TIMEOUT_DURATION) .context("Failed to accept TCP connection") .map_err(MigratableError::MigrateReceive)?; - set_migration_socket_timeouts(&socket, RECEIVE_MIGRATION_SOCKET_TIMEOUT) - .map_err(MigratableError::MigrateReceive)?; + set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; TlsStream::new_server(socket, config) .map(Box::new) @@ -909,8 +932,7 @@ pub(crate) fn send_migration_socket( let socket = TcpStream::connect(address).map_err(|e| { MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) })?; - set_migration_socket_timeouts(&socket, SEND_MIGRATION_SOCKET_TIMEOUT) - .map_err(MigratableError::MigrateSend)?; + set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateSend)?; if let Some(tls_dir) = tls_dir { let server_name = tcp_address_to_server_name(address) @@ -944,7 +966,7 @@ pub(crate) fn send_migration_socket_with_keep_alive( ) -> Result { match send_migration_socket(destination_url, tls_dir)? { socket @ (SocketStream::Tcp(_) | SocketStream::Tls(_)) => { - KeepAliveStream::new(socket, SEND_MIGRATION_SOCKET_TIMEOUT) + KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION) .map(SocketStream::KeepAlive) .context("Error creating keep-alive migration stream") .map_err(MigratableError::MigrateSend) From 7c79afe7ba216d46a4fcab6c393992115eac4739 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 10:53:52 +0100 Subject: [PATCH 096/178] vm-migration: move keep alive handling into the protocol Otherwise we have to scatter the keep alive handling over the whole code base, which we don't want. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/protocol.rs | 15 +++++++++++++-- vmm/src/lib.rs | 4 ---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 01e0fba853..a04c3939cb 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -199,8 +199,19 @@ impl Request { pub fn read_from(fd: &mut dyn Read) -> Result { let mut request = Request::default(); - fd.read_exact(Self::as_mut_slice(&mut request)) - .map_err(MigratableError::MigrateSocket)?; + + loop { + fd.read_exact(Self::as_mut_slice(&mut request)) + .map_err(MigratableError::MigrateSocket)?; + + // If we read a keep alive message, we throw it away and keep reading. + if request.command() == Command::KeepAlive { + request = Request::default(); + continue; + } + + break; + } Ok(request) } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 1e2dadc32d..4c4f1f65a0 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3278,10 +3278,6 @@ impl RequestHandler for Vmm { let req = Request::read_from(&mut socket)?; trace!("Command {:?} received", req.command()); - if req.command() == Command::KeepAlive { - continue; - } - let (response, new_state, mut maybe_error) = match self.vm_receive_migration_step( &mut socket, &listener, From 37268c071cf07999262d34383be547abfd9fd1ce Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 14:05:30 +0100 Subject: [PATCH 097/178] vm-migration: add KeepAlive status to response The sender of the live migration usually waits for a response when it isn't sending requests or doing any work. Thus the receiver should send keep alive responses to not break the protocol. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/protocol.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index a04c3939cb..089c14d133 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -229,6 +229,7 @@ pub enum Status { Invalid, Ok, Error, + KeepAlive, } #[repr(C)] @@ -259,6 +260,10 @@ impl Response { Self::new(Status::Error, 0) } + pub fn keep_alive() -> Self { + Self::new(Status::KeepAlive, 0) + } + pub fn status(&self) -> Status { self.status } @@ -269,8 +274,19 @@ impl Response { pub fn read_from(fd: &mut dyn Read) -> Result { let mut response = Response::default(); - fd.read_exact(Self::as_mut_slice(&mut response)) - .map_err(MigratableError::MigrateSocket)?; + + loop { + fd.read_exact(Self::as_mut_slice(&mut response)) + .map_err(MigratableError::MigrateSocket)?; + + // If we read a keep alive message, we throw it away and keep reading. + if response.status() == Status::KeepAlive { + response = Response::default(); + continue; + } + + break; + } Ok(response) } From 2c89466dbe79caaf5513b56873f92e223be31b1a Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 14:10:44 +0100 Subject: [PATCH 098/178] vm-migration: make KeepAliveStream work for sender and receiver The sender and receiver sides have to behave a bit differently to not break the protocol. Thus we add a bit of special handling for both sides.
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/keep_alive_stream.rs | 22 +++++++++++++++------- vmm/src/migration_transport.rs | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/vm-migration/src/keep_alive_stream.rs b/vm-migration/src/keep_alive_stream.rs index 53cc020d46..24f06390df 100644 --- a/vm-migration/src/keep_alive_stream.rs +++ b/vm-migration/src/keep_alive_stream.rs @@ -13,7 +13,7 @@ use std::{result, thread}; use vm_memory::bitmap::BitmapSlice; use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; -use crate::protocol::Request; +use crate::protocol::{Request, Response}; /// The `KeepAliveStream` is a stream that is intended to be used for the main /// connection of live migrations. If the `KeepAliveStream` does not read or @@ -51,14 +51,16 @@ enum KeepAliveStreamAnswer { struct KeepAliveWorker { stream: S, + /// Is this running on the sender or receiver side? + is_sender: bool, } impl KeepAliveWorker where S: Read + Write + AsFd, { - pub fn new(stream: S) -> Self { - Self { stream } + pub fn new(stream: S, is_sender: bool) -> Self { + Self { stream, is_sender } } pub fn read(&mut self, mut buf: Vec, len: usize) -> io::Result<(Vec, usize)> { @@ -101,6 +103,7 @@ impl KeepAliveStream { pub fn new( stream: T, timeout: Duration, + is_sender: bool, ) -> result::Result { let fd = stream.as_fd().try_clone_to_owned()?; @@ -109,9 +112,9 @@ impl KeepAliveStream { let (answer_tx, answer_rx) = sync_channel::(0); let thread = thread::Builder::new() - .name("keep_alive_sender_thread".to_string()) + .name("migration_keep_alive_thread".to_string()) .spawn(move || { - let mut worker = KeepAliveWorker::new(stream); + let mut worker = KeepAliveWorker::new(stream, is_sender); loop { // The idea is to always send a keep alive message when this times out. 
match message_rx.recv_timeout(timeout) { @@ -145,8 +148,13 @@ impl KeepAliveStream { KeepAliveStreamMessage::Disconnect => break, }, Err(RecvTimeoutError::Timeout) => { - let keep_alive = Request::keep_alive(); - let _ = keep_alive.write_to(&mut worker.stream); + if worker.is_sender { + let keep_alive = Request::keep_alive(); + let _ = keep_alive.write_to(&mut worker.stream); + } else { + let keep_alive = Response::keep_alive(); + let _ = keep_alive.write_to(&mut worker.stream); + } } Err(RecvTimeoutError::Disconnected) => break, } diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 531e3a4107..95b3e2cca1 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -966,7 +966,7 @@ pub(crate) fn send_migration_socket_with_keep_alive( ) -> Result { match send_migration_socket(destination_url, tls_dir)? { socket @ (SocketStream::Tcp(_) | SocketStream::Tls(_)) => { - KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION) + KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION, true) .map(SocketStream::KeepAlive) .context("Error creating keep-alive migration stream") .map_err(MigratableError::MigrateSend) From 1d7098a9e06156f3d7a096b8b8a8b05027ec676d Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 11:28:38 +0100 Subject: [PATCH 099/178] vmm: always use KeepAliveStream for main connection On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 4c4f1f65a0..1455e7b891 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1871,14 +1871,13 @@ impl Vmm { }; // Set up the socket connection - let mut socket = if send_data_migration.connections.get() > 1 && !send_data_migration.local - { - migration_transport::send_migration_socket_with_keep_alive( + let mut socket = if send_data_migration.local { + migration_transport::send_migration_socket( 
&send_data_migration.destination_url, send_data_migration.tls_dir.as_deref(), )? } else { - migration_transport::send_migration_socket( + migration_transport::send_migration_socket_with_keep_alive( &send_data_migration.destination_url, send_data_migration.tls_dir.as_deref(), )? From 60fa66795436140eed1d947bd2bc5b3323da34f8 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 17 Mar 2026 11:30:04 +0100 Subject: [PATCH 100/178] vmm: Use KeepAliveStream also for receiver On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 2 +- vmm/src/migration_transport.rs | 34 ++++++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 1455e7b891..351acbcfec 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3262,7 +3262,7 @@ impl RequestHandler for Vmm { )?; // Accept the connection and get the socket let mut socket = listener - .accept() + .accept(true) .context("Failed to accept migration connection") .map_err(|e| { warn!("{e}"); diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 95b3e2cca1..69b1064a15 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -89,7 +89,10 @@ pub(crate) enum ReceiveListener { impl ReceiveListener { /// Block until a connection is accepted. 
- pub(crate) fn accept(&mut self) -> Result { + pub(crate) fn accept( + &mut self, + main_connection: bool, + ) -> Result { match self { ReceiveListener::Tcp(listener) => { let (socket, _) = accept_with_timeout(listener, MIGRATION_READ_TIMEOUT_DURATION) @@ -97,7 +100,15 @@ impl ReceiveListener { .map_err(MigratableError::MigrateReceive)?; set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; - Ok(SocketStream::Tcp(socket)) + let socket = SocketStream::Tcp(socket); + if main_connection { + KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION, false) + .map(SocketStream::KeepAlive) + .context("Error creating keep-alive migration stream") + .map_err(MigratableError::MigrateReceive) + } else { + Ok(socket) + } } ReceiveListener::Unix(listener) => listener .accept() @@ -110,11 +121,20 @@ impl ReceiveListener { .map_err(MigratableError::MigrateReceive)?; set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; - TlsStream::new_server(socket, config) + let socket = TlsStream::new_server(socket, config) .map(Box::new) .map(SocketStream::Tls) .context("Failed to accept TLS migration connection") - .map_err(MigratableError::MigrateReceive) + .map_err(MigratableError::MigrateReceive)?; + + if main_connection { + KeepAliveStream::new(socket, MIGRATION_WRITE_TIMEOUT_DURATION, false) + .map(SocketStream::KeepAlive) + .context("Error creating keep-alive migration stream") + .map_err(MigratableError::MigrateReceive) + } else { + Ok(socket) + } } } } @@ -129,7 +149,7 @@ impl ReceiveListener { .map_err(MigratableError::MigrateReceive)? { // The listener is readable; accept the connection. - Ok(Some(self.accept()?)) + Ok(Some(self.accept(false)?)) } else { // The abort event was signaled before any connection arrived. 
Ok(None) @@ -233,9 +253,7 @@ impl AsFd for SocketStream { SocketStream::Unix(s) => s.as_fd(), SocketStream::Tcp(s) => s.as_fd(), SocketStream::Tls(s) => s.as_fd(), - SocketStream::KeepAlive(_) => { - unreachable!("KeepAliveStream is only used by the migration sender") - } + SocketStream::KeepAlive(s) => s.as_fd(), } } } From 481c7257f3e2b18ed22d974efc44e91bc2abcea9 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 8 Apr 2026 16:25:51 +0200 Subject: [PATCH 101/178] vmm: migration: longer accept timeout for receiver This massively simplifies local development as `ch-remote receive-migration` won't terminate after 5s. This does not have implications for libvirt/ch, as the ch driver will kill the process if the migration won't start. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/migration_transport.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 69b1064a15..9d58f38a05 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -69,6 +69,15 @@ const MIGRATION_READ_TIMEOUT_DURATION: Duration = { migration_read_timeout_duration }; +/// The timeout of the migration-receiver. +/// +/// We set this to a relatively high number to ease local development with +/// `ch-remote`. For production, this has no negative impacts as the management +/// software has full control over the Cloud Hypervisor process and will kill +/// the process on terminated migration. The timeout is used as a fallback +/// if the management software doesn't kill the process correctly. 
+const MIGRATION_ACCEPT_TIMEOUT_DURATION: Duration = Duration::from_secs(60); + fn set_migration_socket_timeouts(socket: &TcpStream) -> anyhow::Result<()> { socket .set_read_timeout(Some(MIGRATION_READ_TIMEOUT_DURATION)) @@ -95,7 +104,11 @@ impl ReceiveListener { ) -> Result { match self { ReceiveListener::Tcp(listener) => { - let (socket, _) = accept_with_timeout(listener, MIGRATION_READ_TIMEOUT_DURATION) + info!( + "Waiting for incoming migration via TCP (timeout {}s) ...", + MIGRATION_ACCEPT_TIMEOUT_DURATION.as_secs() + ); + let (socket, _) = accept_with_timeout(listener, MIGRATION_ACCEPT_TIMEOUT_DURATION) .context("Failed to accept TCP migration connection") .map_err(MigratableError::MigrateReceive)?; set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; @@ -116,7 +129,11 @@ impl ReceiveListener { .context("Failed to accept Unix migration connection") .map_err(MigratableError::MigrateReceive), ReceiveListener::Tls(listener, config) => { - let (socket, _) = accept_with_timeout(listener, MIGRATION_READ_TIMEOUT_DURATION) + info!( + "Waiting for incoming migration via TCP/TLS (timeout {}s) ...", + MIGRATION_ACCEPT_TIMEOUT_DURATION.as_secs() + ); + let (socket, _) = accept_with_timeout(listener, MIGRATION_ACCEPT_TIMEOUT_DURATION) .context("Failed to accept TCP connection") .map_err(MigratableError::MigrateReceive)?; set_migration_socket_timeouts(&socket).map_err(MigratableError::MigrateReceive)?; From e01f34a7d84bebc2a0cb49179ac0647a32092780 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 8 Apr 2026 15:51:04 +0200 Subject: [PATCH 102/178] vmm: migration: improve error message on timeout The ch-remote connected to the receiving VMM printed the error chain correctly. The VMM itself however printed "Failed to accept migration connection". The new code prints ``` cloud-hypervisor: 1.760247s: INFO:vmm/src/lib.rs:1137 -- Waiting for incoming migration (timeout 60s) ... 
cloud-hypervisor: 2.760333s: WARN:vmm/src/lib.rs:4205 -- Timed out waiting for sender to connect. ``` which makes much more sense. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 351acbcfec..3a0a8b6763 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3263,11 +3263,9 @@ impl RequestHandler for Vmm { // Accept the connection and get the socket let mut socket = listener .accept(true) - .context("Failed to accept migration connection") - .map_err(|e| { - warn!("{e}"); - MigratableError::MigrateReceive(e) - })?; + .inspect_err(|e| warn!("{e}")) + .context("Failed to accept incoming migration") + .map_err(MigratableError::MigrateReceive)?; event!("vm", "migration-receive-started"); From b6417067b5ff3961b2cc0f8a948229f4ad42c201 Mon Sep 17 00:00:00 2001 From: Julian Schindel Date: Mon, 23 Mar 2026 18:09:37 +0100 Subject: [PATCH 103/178] vm-migration: fix UB in network parsing Using a value that is not a valid discriminant for an enum is undefined behavior. Uses the zerocopy crate to check the validity of the bytes before converting to the respective structs. 
On-behalf-of: SAP julian.schindel@sap.com Signed-off-by: Julian Schindel --- Cargo.lock | 1 + vm-migration/Cargo.toml | 1 + vm-migration/src/lib.rs | 3 ++ vm-migration/src/protocol.rs | 67 +++++++++++++++++++++--------------- 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 82d42a7023..02eea958cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2679,6 +2679,7 @@ dependencies = [ "serde_json", "thiserror", "vm-memory", + "zerocopy", ] [[package]] diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index a65773eb93..b6444dbfc8 100644 --- a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -13,6 +13,7 @@ serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } thiserror = { workspace = true } vm-memory = { workspace = true, features = ["backend-atomic", "backend-mmap"] } +zerocopy = { workspace = true, features = ["derive", "std"] } [lints] workspace = true diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 55ef543fcb..60b1a47496 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -104,6 +104,9 @@ pub enum MigratableError { #[error("Lifecycle operation skipped for disconnected component {0}")] DeviceDisconnected(String), + #[error("Failed to deserialize network data")] + DeserializeError(#[source] anyhow::Error), + #[error("Error setting up a TLS-encrypted connection")] Tls(#[source] tls::TlsError), } diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 089c14d133..c56f89f988 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -76,9 +76,10 @@ use std::io::{Read, Write}; +use anyhow::anyhow; use itertools::Itertools; use serde::{Deserialize, Serialize}; -use vm_memory::ByteValued; +use zerocopy::{Immutable, IntoBytes, KnownLayout, TryFromBytes}; use crate::MigratableError; use crate::bitpos_iterator::BitposIteratorExt; @@ -108,7 +109,9 @@ use crate::bitpos_iterator::BitposIteratorExt; /// 
/// [live-migration protocol]: super::protocol #[repr(u16)] -#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)] +#[derive( + Debug, Copy, Clone, Default, PartialEq, Eq, Immutable, IntoBytes, KnownLayout, TryFromBytes, +)] pub enum Command { #[default] Invalid = 0, @@ -132,16 +135,13 @@ pub enum Command { } #[repr(C)] -#[derive(Default, Copy, Clone)] +#[derive(Default, Copy, Clone, Immutable, IntoBytes, KnownLayout, TryFromBytes)] pub struct Request { command: Command, padding: [u8; 6], length: u64, // Length of payload for command excluding the Request struct } -// SAFETY: Request contains a series of integers with no implicit padding -unsafe impl ByteValued for Request {} - impl Request { pub fn new(command: Command, length: u64) -> Self { Self { @@ -198,32 +198,40 @@ impl Request { } pub fn read_from(fd: &mut dyn Read) -> Result { - let mut request = Request::default(); + /// A byte buffer that matches `Self` in size and alignment to allow deserializing `Self` into. + #[repr(C, align(8))] + struct RequestBuffer([u8; const { size_of::() }]); + const _: () = const { + // Check that the alignment of the buffer matches `Self`. + assert!(align_of::() == align_of::()); + }; + let mut buffer = RequestBuffer([0; size_of::()]); + let RequestBuffer(request) = &mut buffer; loop { - fd.read_exact(Self::as_mut_slice(&mut request)) + fd.read_exact(request) .map_err(MigratableError::MigrateSocket)?; + let request = Self::try_mut_from_bytes(request) + .map_err(|error| MigratableError::DeserializeError(anyhow!("{error:?}")))?; + // If we read a keep alive message, we throw it away and keep reading. 
if request.command() == Command::KeepAlive { - request = Request::default(); + *request = Request::default(); continue; } - - break; + return Ok(*request); } - - Ok(request) } pub fn write_to(&self, fd: &mut dyn Write) -> Result<(), MigratableError> { - fd.write_all(Self::as_slice(self)) + fd.write_all(self.as_bytes()) .map_err(MigratableError::MigrateSocket) } } #[repr(u16)] -#[derive(Copy, Clone, PartialEq, Eq, Default)] +#[derive(Copy, Clone, PartialEq, Eq, Default, Immutable, IntoBytes, KnownLayout, TryFromBytes)] pub enum Status { #[default] Invalid, @@ -233,16 +241,13 @@ pub enum Status { } #[repr(C)] -#[derive(Default, Copy, Clone)] +#[derive(Default, Copy, Clone, Immutable, IntoBytes, KnownLayout, TryFromBytes)] pub struct Response { status: Status, padding: [u8; 6], length: u64, // Length of payload for command excluding the Response struct } -// SAFETY: Response contains a series of integers with no implicit padding -unsafe impl ByteValued for Response {} - impl Response { pub fn new(status: Status, length: u64) -> Self { Self { @@ -273,22 +278,30 @@ impl Response { } pub fn read_from(fd: &mut dyn Read) -> Result { - let mut response = Response::default(); + /// A byte buffer that matches `Self` in size and alignment to allow deserializing `Self` into. + #[repr(C, align(8))] + struct ResponseBuffer([u8; const { size_of::() }]); + const _: () = const { + // Check that the alignment of the buffer matches `Self`. + assert!(align_of::() == align_of::()); + }; + let mut buffer = ResponseBuffer([0; size_of::()]); + let ResponseBuffer(response) = &mut buffer; loop { - fd.read_exact(Self::as_mut_slice(&mut response)) + fd.read_exact(response) .map_err(MigratableError::MigrateSocket)?; + let response = Self::try_mut_from_bytes(response) + .map_err(|error| MigratableError::DeserializeError(anyhow!("{error:?}")))?; + // If we read a keep alive message, we throw it away and keep reading. 
if response.status() == Status::KeepAlive { - response = Response::default(); + *response = Response::default(); continue; } - - break; + return Ok(*response); } - - Ok(response) } pub fn ok_or_abandon( @@ -308,7 +321,7 @@ impl Response { } pub fn write_to(&self, fd: &mut dyn Write) -> Result<(), MigratableError> { - fd.write_all(Self::as_slice(self)) + fd.write_all(self.as_bytes()) .map_err(MigratableError::MigrateSocket) } } From 0e4be70cb2d907092d9ee23f2e3d881fefe9c57d Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 25 Mar 2026 15:12:33 +0100 Subject: [PATCH 104/178] vm-migration: stop sending abandon after error response When the receiver of a live migration encounters an error, it sends an error response. The sender of the migration would then send an abandon request and wait for a response. This abandon request is not necessary, because the receiver already abandoned the migration due to the error it encountered. From now on this function will not send an abandon request to the receiver anymore, thus it was renamed to "ok_or_error". Also, this case was always broken, because after sending the error response, the receiver just exits without waiting for the additional abandon request. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/protocol.rs | 12 ++---------- vmm/src/migration_transport.rs | 4 +--- vmm/src/vm.rs | 9 +++------ 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index c56f89f988..bc3633fac4 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -304,17 +304,9 @@ impl Response { } } - pub fn ok_or_abandon( - self, - fd: &mut T, - error: MigratableError, - ) -> Result - where - T: Read + Write, - { + /// Return the response if its status is `Ok`; return the caller-provided error for any other status. 
+ pub fn ok_or_error(self, error: MigratableError) -> Result { if self.status != Status::Ok { - Request::abandon().write_to(fd)?; - Response::read_from(fd)?; return Err(error); } Ok(self) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 9d58f38a05..e73b8bfcd5 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -1042,9 +1042,7 @@ pub(crate) fn expect_ok_response( socket: &mut SocketStream, error: MigratableError, ) -> Result<(), MigratableError> { - Response::read_from(socket)? - .ok_or_abandon(socket, error) - .map(|_| ()) + Response::read_from(socket)?.ok_or_error(error).map(|_| ()) } /// Send a request and validate that the peer responds with OK. diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index fa879810af..393911c6fe 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -3105,12 +3105,9 @@ impl Vm { .context("Error sending memory fd") .map_err(MigratableError::MigrateSend)?; - Response::read_from(socket)?.ok_or_abandon( - socket, - MigratableError::MigrateSend(anyhow!( - "Error during memory fd migration (got bad response)" - )), - )?; + Response::read_from(socket)?.ok_or_error(MigratableError::MigrateSend(anyhow!( + "Error during memory fd migration (got bad response)" + )))?; } Ok(()) From 3655c61c5f3de180508d5902896839d59c0affa1 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Fri, 16 Jan 2026 11:42:40 +0100 Subject: [PATCH 105/178] vmm: add mechanism for post-migration announcements Some devices can or should do announcements after a live migration, e.g., a network device announces its new location in a network to update the MAC->port mappings in layer 2 switches. With this change a device can return an announcement action ("announcer") to do that. We use reverse ARP (RARP) packets for this, similar to QEMU. Also, we do 5 rounds of announcements in background (in a dedicated thread). 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- net_util/src/lib.rs | 2 +- virtio-devices/src/device.rs | 17 ++++++++ virtio-devices/src/lib.rs | 4 +- virtio-devices/src/net.rs | 85 +++++++++++++++++++++++++++++++++++- vmm/src/device_manager.rs | 76 ++++++++++++++++++++++++++++++-- vmm/src/lib.rs | 4 ++ vmm/src/vm.rs | 12 +++++ 7 files changed, 192 insertions(+), 8 deletions(-) diff --git a/net_util/src/lib.rs b/net_util/src/lib.rs index 7152c1676f..de8c5b1465 100644 --- a/net_util/src/lib.rs +++ b/net_util/src/lib.rs @@ -101,7 +101,7 @@ fn create_unix_socket() -> Result { Ok(unsafe { net::UdpSocket::from_raw_fd(sock) }) } -fn vnet_hdr_len() -> usize { +pub fn vnet_hdr_len() -> usize { std::mem::size_of::() } diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 4c61ba35d1..acddd015e4 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -191,6 +191,12 @@ pub trait VirtioDevice: Send { fn access_platform(&self) -> Option> { None } + + /// Some devices can announce their location after a live migration to + /// speed up normal execution. + fn post_migration_announcer(&self) -> Option> { + None + } } /// Trait to define address translation for devices managed by virtio-iommu @@ -432,3 +438,14 @@ impl Pausable for VirtioCommon { Ok(()) } } + +/// A PostMigrationAnnouncer provides a callback that informs other components +/// in the system. For example, network devices send out RARP packets to update +/// the MAC to port mappings of switches. +pub trait PostMigrationAnnouncer: Send { + /// Announces that a migration _might_ have occurred. + /// Implementers need to assume that the announcement can be + /// scheduled to run some time after a migration has occurred and + /// that it might even be executed when no migration has happened. 
+ fn announce(&mut self); +} diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index 6ac3977982..15af1b6126 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -43,8 +43,8 @@ pub use self::balloon::Balloon; pub use self::block::{Block, BlockState}; pub use self::console::{Console, ConsoleResizer, Endpoint}; pub use self::device::{ - ActivationContext, DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, - VirtioInterruptType, VirtioSharedMemoryList, + ActivationContext, DmaRemapping, PostMigrationAnnouncer, VirtioCommon, VirtioDevice, + VirtioInterrupt, VirtioInterruptType, VirtioSharedMemoryList, }; pub use self::epoll_helper::{ EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 451cfa99b2..f711e3b286 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -20,8 +20,9 @@ use log::{debug, error, info, trace, warn}; #[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ - CtrlQueue, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, TapError, TxVirtio, - VirtioNetConfig, build_net_config_space, build_net_config_space_with_mq, open_tap, + CtrlQueue, MAC_ADDR_LEN, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, + TapError, TxVirtio, VirtioNetConfig, build_net_config_space, build_net_config_space_with_mq, + open_tap, vnet_hdr_len, }; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; @@ -40,6 +41,7 @@ use super::{ EpollHelperHandler, Error as DeviceError, RateLimiterConfig, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterruptType, }; +use crate::device::PostMigrationAnnouncer; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{GuestMemoryMmap, VirtioInterrupt}; @@ -416,6 +418,11 @@ pub struct NetState { pub queue_size: Vec, } +// Minimum length of an ethernet frame. 
This size omits the FCS/CRC (frame check +// sequence), which will be added by the hardware. This size can also be found +// in the Linux kernel's UAPI headers. +const ETH_FRAME_LEN: usize = 60; + impl Net { /// Create a new virtio network device with the given TAP interface. #[allow(clippy::too_many_arguments)] @@ -646,6 +653,39 @@ impl Net { pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); } + + // Builds a reverse ARP packet with this device's MAC address. + fn build_rarp_announce(&self) -> [u8; ETH_FRAME_LEN] { + const ETH_P_RARP: u16 = 0x8035; // Ethertype RARP + const ARP_HTYPE_ETH: u16 = 0x1; // Hardware type Ethernet + const ARP_PTYPE_IP: u16 = 0x0800; // Protocol type IPv4 + const ARP_OP_REQUEST_REV: u16 = 0x0003; // RARP Request opcode + + const IPV4_ADDR_LENGTH: usize = 4; // Size of an IPv4 address + + let mut buf = [0u8; ETH_FRAME_LEN]; + + // Ethernet header + buf[0..6].copy_from_slice(&[0xff; MAC_ADDR_LEN]); // This is a broadcast + buf[6..12].copy_from_slice(&self.config.mac); // Src is this NIC + buf[12..14].copy_from_slice(&ETH_P_RARP.to_be_bytes()); // This is a RARP packet + + // ARP Header + buf[14..16].copy_from_slice(&ARP_HTYPE_ETH.to_be_bytes()); + buf[16..18].copy_from_slice(&ARP_PTYPE_IP.to_be_bytes()); + buf[18] = MAC_ADDR_LEN as u8; // Hardware address length (ethernet) + buf[19] = IPV4_ADDR_LENGTH as u8; // Protocol address length (IPv4) + // This is a "fake RARP" packet, we don't want to perform a real RARP lookup. + // Thus the content of the next fields is largely irrelevant. Setting source + // hardware address = target hardware address is fine according to RFC 903.
+ buf[20..22].copy_from_slice(&ARP_OP_REQUEST_REV.to_be_bytes()); + buf[22..28].copy_from_slice(&self.config.mac); // Source hardware address + buf[28..32].copy_from_slice(&[0x00; IPV4_ADDR_LENGTH]); // Source protocol address + buf[32..38].copy_from_slice(&self.config.mac); // Target hardware address + buf[38..42].copy_from_slice(&[0x00; IPV4_ADDR_LENGTH]); // Target protocol address + + buf + } } impl Drop for Net { @@ -873,6 +913,13 @@ impl VirtioDevice for Net { fn access_platform(&self) -> Option> { self.common.access_platform() } + + fn post_migration_announcer(&self) -> Option> { + Some(Box::new(TapRarpAnnouncer::new( + self.build_rarp_announce(), + self.taps.clone(), + ))) + } } impl Pausable for Net { @@ -901,3 +948,37 @@ impl Snapshottable for Net { } impl Transportable for Net {} impl Migratable for Net {} + +/// Sends RARP packets on a virtio-net device, to update the MAC to port +/// mappings of switches in the network. This reduces the time until network +/// packets reliably arrive at the network device. +pub struct TapRarpAnnouncer { + announce: [u8; ETH_FRAME_LEN], // Buffer for the raw RARP packet. + taps: Vec, // The TAP devices to send the packets on. +} + +impl TapRarpAnnouncer { + pub fn new(announce: [u8; 60], taps: Vec) -> Self { + Self { announce, taps } + } +} + +impl PostMigrationAnnouncer for TapRarpAnnouncer { + fn announce(&mut self) { + // We have to add a virtio-net header to the announce. + let mut buf = vec![0u8; vnet_hdr_len() + self.announce.len()]; + buf[vnet_hdr_len()..].copy_from_slice(&self.announce); + + for tap in &self.taps { + // SAFETY: `buf.as_ptr()` is valid for `buf.len()` bytes and remains + // valid until the syscall returns. `tap.as_raw_fd()` is a valid TAP fd.
+ let _ = unsafe { + libc::write( + tap.as_raw_fd(), + buf.as_ptr() as *const libc::c_void, + buf.len(), + ) + }; + } + } +} diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 13f27e666f..80598d1d02 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -18,10 +18,11 @@ use std::os::unix::io::{AsRawFd, FromRawFd}; #[cfg(not(target_arch = "riscv64"))] use std::path::Path; use std::path::PathBuf; -use std::result; use std::sync::{Arc, Mutex}; +use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; +use std::{result, thread}; use acpi_tables::sdt::GenericAddress; use acpi_tables::{Aml, aml}; @@ -88,8 +89,8 @@ use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd, VfioOps}; use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport}; use virtio_devices::vhost_user::VhostUserConfig; use virtio_devices::{ - AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, VdpaDmaMapping, - VirtioMemMappingSource, + AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, PostMigrationAnnouncer, + VdpaDmaMapping, VirtioMemMappingSource, }; use vm_allocator::{AddressAllocator, InterruptAllocError, SystemAllocator}; use vm_device::dma_mapping::ExternalDmaMapping; @@ -5318,6 +5319,75 @@ impl DeviceManager { self.vfio_ops = None; } } + + /// Helps the environment converge quickly after a live migration by + /// prompting devices to advertise the VM from its new host. + /// + /// This is mainly useful for networking: switches and peers can refresh + /// their view of where the guest now lives instead of waiting for normal + /// traffic to update MAC-to-port mappings on its own. + /// + /// The method gathers the [`PostMigrationAnnouncer`] implementations + /// exposed by virtio devices, runs one announcement synchronously for + /// minimum delay, and then schedules a few retries from a background + /// thread. 
+ pub fn post_migration_announce(&self) { + let mut announcers: Vec> = self + .virtio_devices + .iter() + .filter_map(|dev| dev.virtio_device.lock().unwrap().post_migration_announcer()) + .collect(); + + if announcers.is_empty() { + info!("No announcers"); + return; + } + + // We do the first announcement synchronously, because we want the announcements + // as soon as possible. + announcers.iter_mut().for_each(|a| a.announce()); + info!("Post migration announce (sync)"); + + // For good measure we repeat the announcements. This increases the chance that + // the announcements have the expected effect. + const ROUNDS: u32 = 4; + const INITIAL_DELAY: Duration = Duration::from_millis(50); + const STEP_DELAY: Duration = Duration::from_millis(100); + const MAX_DELAY: Duration = Duration::from_millis(450); + schedule_post_migration_announcements( + announcers, + ROUNDS, + INITIAL_DELAY, + STEP_DELAY, + MAX_DELAY, + ); + } +} + +/// Starts a thread that periodically performs the post-migration announcements. +fn schedule_post_migration_announcements( + mut announcers: Vec>, + rounds: u32, + initial_delay: Duration, + step_delay: Duration, + max_delay: Duration, +) { + let _ = thread::Builder::new() + .name("post-migration-announcers".to_string()) + .spawn(move || { + for round in 0..rounds { + info!("Post migration announce (async): {}/{}", round + 1, rounds); + + // The first announcement already was done synchronously, thus + // we sleep at the start of the loop. + + let delay = (initial_delay + step_delay.saturating_mul(round)).min(max_delay); + debug!("Sleeping {}ms", delay.as_millis()); + thread::sleep(delay); + + announcers.iter_mut().for_each(|a| a.announce()); + } + }); } #[cfg(feature = "ivshmem")] diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 3a0a8b6763..a65381d0d9 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1290,6 +1290,10 @@ impl Vmm { // vm_receive_state before, which creates the VM. 
let vm = self.vm.vm_mut().unwrap(); + // Advertise new VM location to network switches. + // The thread in background periodically sends multiple messages. + vm.post_migration_announce(); + // We are on the control-loop thread handling an API request, so // there is no concurrent access from other VMM or migration // threads. The VM is in the Paused state , which permits both diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 393911c6fe..223ff238d5 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -3051,6 +3051,10 @@ impl Vm { .try_lock_disks() .map_err(Error::LockingError)?; + // TODO for upstreaming probably relevant + // Advertise new VM location to network switches. + // self.post_migration_announce(); + // Now we can start all vCPUs from here. self.cpu_manager .lock() @@ -3288,6 +3292,14 @@ impl Vm { .nmi() .map_err(Error::ErrorNmi); } + + /// Calls [`DeviceManager::post_migration_announce`]. + pub fn post_migration_announce(&self) { + self.device_manager + .lock() + .unwrap() + .post_migration_announce(); + } } impl Pausable for Vm { From 0f8e0733378930a3e51c7915d1ac3565a7abfce8 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 25 Mar 2026 16:24:11 +0100 Subject: [PATCH 106/178] vmm: add internal post-migration announce action Add an internal API action that triggers the post migration announce. We can now wire this into the API. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/api/mod.rs | 39 +++++++++++++++++++++++++++++++++++++++ vmm/src/lib.rs | 15 +++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 7a0a24eb8b..6d1f7c6d7e 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -105,6 +105,10 @@ pub enum ApiError { #[error("The VM could not resume")] VmResume(#[source] VmError), + /// The VM could not perform the post-migration announcement. 
+ #[error("The VM could not perform the post-migration announcement")] + VmPostMigrationAnnounce(#[source] VmError), + /// The VM is not booted. #[error("The VM is not booted")] VmNotBooted, @@ -717,6 +721,8 @@ pub trait RequestHandler { fn vm_resume(&mut self) -> Result<(), VmError>; + fn vm_post_migration_announce(&mut self) -> Result<(), VmError>; + fn vm_snapshot(&mut self, destination_url: &str) -> Result<(), VmError>; fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> Result<(), VmError>; @@ -1797,6 +1803,39 @@ impl ApiAction for VmResume { } } +pub struct VmPostMigrationAnnounce; + +impl ApiAction for VmPostMigrationAnnounce { + type RequestBody = (); + type ResponseBody = Option; + + fn request(&self, _: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmPostMigrationAnnounce"); + + let response = vmm + .vm_post_migration_announce() + .map_err(ApiError::VmPostMigrationAnnounce) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmSendMigration; impl ApiAction for VmSendMigration { diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index a65381d0d9..94e2606e52 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2585,6 +2585,21 @@ impl RequestHandler for Vmm { } } + fn vm_post_migration_announce(&mut self) -> result::Result<(), VmError> { + match self.vm { + MaybeVmOwnership::Vmm(ref vm) => { + if vm.get_state() != VmState::Running { + return Err(VmError::VmNotRunning); + } + + vm.post_migration_announce(); + Ok(()) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, + } + } + fn vm_snapshot(&mut self, destination_url: &str) -> result::Result<(), VmError> { 
match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => { From 2bffeb418d19c2f8745712178d539e8bcff75632 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 25 Mar 2026 16:45:13 +0100 Subject: [PATCH 107/178] vmm: api: add API call for post migration announcements On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- cloud-hypervisor/src/bin/ch-remote.rs | 12 ++++++++++++ docs/api.md | 5 +++-- fuzz/fuzz_targets/http_api.rs | 4 ++++ vmm/src/api/dbus/mod.rs | 11 +++++++++-- vmm/src/api/http/http_endpoint.rs | 6 ++++-- vmm/src/api/http/mod.rs | 10 +++++++--- vmm/src/api/openapi/cloud-hypervisor.yaml | 10 ++++++++++ 7 files changed, 49 insertions(+), 9 deletions(-) diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index 3e288f5ca4..dd2eefb79e 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -110,6 +110,7 @@ trait DBusApi1 { fn vm_delete(&self) -> zbus::Result<()>; fn vm_info(&self) -> zbus::Result; fn vm_pause(&self) -> zbus::Result<()>; + fn vm_post_migration_announce(&self) -> zbus::Result<()>; fn vm_power_button(&self) -> zbus::Result<()>; fn vm_reboot(&self) -> zbus::Result<()>; fn vm_remove_device(&self, vm_remove_device: &str) -> zbus::Result<()>; @@ -225,6 +226,11 @@ impl<'a> DBusApi1ProxyBlocking<'a> { self.vm_pause().map_err(Error::DBusApiClient) } + fn api_vm_post_migration_announce(&self) -> ApiResult { + self.vm_post_migration_announce() + .map_err(Error::DBusApiClient) + } + fn api_vm_power_button(&self) -> ApiResult { self.vm_power_button().map_err(Error::DBusApiClient) } @@ -299,6 +305,10 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu Some("resume") => { simple_api_command(socket, "PUT", "resume", None).map_err(Error::HttpApiClient) } + Some("post-migration-announce") => { + simple_api_command(socket, "PUT", "post-migration-announce", None) + .map_err(Error::HttpApiClient) + } Some("power-button") => { 
simple_api_command(socket, "PUT", "power-button", None).map_err(Error::HttpApiClient) } @@ -639,6 +649,7 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) Some("delete") => proxy.api_vm_delete(), Some("shutdown-vmm") => proxy.api_vmm_shutdown(), Some("resume") => proxy.api_vm_resume(), + Some("post-migration-announce") => proxy.api_vm_post_migration_announce(), Some("power-button") => proxy.api_vm_power_button(), Some("reboot") => proxy.api_vm_reboot(), Some("pause") => proxy.api_vm_pause(), @@ -1137,6 +1148,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { Command::new("nmi").about("Trigger NMI"), Command::new("pause").about("Pause the VM"), Command::new("ping").about("Ping the VMM to check for API server availability"), + Command::new("post-migration-announce").about("Trigger post-migration announcements"), Command::new("power-button").about("Trigger a power button in the VM"), Command::new("reboot").about("Reboot the VM"), Command::new("receive-migration") diff --git a/docs/api.md b/docs/api.md index cea3f31812..d6f9be6e9e 100644 --- a/docs/api.md +++ b/docs/api.md @@ -71,8 +71,8 @@ The Cloud Hypervisor API exposes the following actions through its endpoints: ##### Virtual Machine (VM) Actions -| Action | Endpoint | Request Body | Response Body | Prerequisites | -| --------------------------------------- | ---------------------------- | --------------------------------- | ------------------------ | ------------------------------------------------------ | +| Action | Endpoint | Request Body | Response Body | Prerequisites | +|-----------------------------------------| ---------------------------- | --------------------------------- | ------------------------ | ------------------------------------------------------ | | Create the VM | `/vm.create` | `/schemas/VmConfig` | N/A | The VM is not created yet | | Delete the VM | `/vm.delete` | N/A | N/A | N/A | | Boot the VM | `/vm.boot` | N/A | N/A | The VM is created but not booted 
| @@ -81,6 +81,7 @@ The Cloud Hypervisor API exposes the following actions through its endpoints: | Trigger power button of the VM | `/vm.power-button` | N/A | N/A | The VM is booted | | Pause the VM | `/vm.pause` | N/A | N/A | The VM is booted | | Resume the VM | `/vm.resume` | N/A | N/A | The VM is paused | +| Trigger post-migration announce | `/vm.post-migration-announce` | N/A | N/A | The VM is booted and not paused | | Take a snapshot of the VM | `/vm.snapshot` | `/schemas/VmSnapshotConfig` | N/A | The VM is paused | | Perform a coredump of the VM* | `/vm.coredump` | `/schemas/VmCoredumpData` | N/A | The VM is paused | | Restore the VM from a snapshot | `/vm.restore` | `/schemas/RestoreConfig` | N/A | The VM is created but not booted | diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 6a7844b97d..600a231b39 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -312,6 +312,10 @@ impl RequestHandler for StubApiRequestHandler { fn vm_cancel_migration(&mut self) -> Result<(), MigratableError> { Ok(()) } + + fn vm_post_migration_announce(&mut self) -> Result<(), VmError> { + Ok(()) + } } fn http_receiver_stub(exit_evt: EventFd, api_evt: EventFd, api_receiver: Receiver) { diff --git a/vmm/src/api/dbus/mod.rs b/vmm/src/api/dbus/mod.rs index ae39feb7d7..94d007cad7 100644 --- a/vmm/src/api/dbus/mod.rs +++ b/vmm/src/api/dbus/mod.rs @@ -24,8 +24,9 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, Body, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmCreate, VmDelete, VmInfo, - VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, - VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, VmmPing, VmmShutdown, + VmPause, VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, + VmResize, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, 
VmSnapshot, VmmPing, + VmmShutdown, }; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::{Error as VmmError, NetConfig, Result as VmmResult, VmConfig}; @@ -250,6 +251,12 @@ impl DBusApi { self.vm_action(&VmPause, ()).await.map(|_| ()) } + async fn vm_post_migration_announce(&self) -> Result<()> { + self.vm_action(&VmPostMigrationAnnounce, ()) + .await + .map(|_| ()) + } + async fn vm_power_button(&self) -> Result<()> { self.vm_action(&VmPowerButton, ()).await.map(|_| ()) } diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index ad58424319..57aa6c4469 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -49,8 +49,9 @@ use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmConfig, VmCounters, VmDelete, VmMigrationProgress, VmNmi, VmPause, - VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, - VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, + VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, + VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -415,6 +416,7 @@ vm_action_put_handler!(VmShutdown); vm_action_put_handler!(VmReboot); vm_action_put_handler!(VmPause); vm_action_put_handler!(VmResume); +vm_action_put_handler!(VmPostMigrationAnnounce); vm_action_put_handler!(VmPowerButton); vm_action_put_handler!(VmNmi); vm_action_put_handler!(VmCancelMigration); diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 3ed9a4c2b4..5464ca87ab 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -30,9 +30,9 @@ use crate::api::VmCoredump; use 
crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmCounters, - VmDelete, VmMigrationProgress, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, - VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, - VmShutdown, VmSnapshot, + VmDelete, VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, + VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, + VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -274,6 +274,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.resume"), Box::new(VmActionHandler::new(&VmResume)), ); + r.routes.insert( + endpoint!("/vm.post-migration-announce"), + Box::new(VmActionHandler::new(&VmPostMigrationAnnounce)), + ); r.routes.insert( endpoint!("/vm.send-migration"), Box::new(VmActionHandler::new(&VmSendMigration)), diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index f1f5cfc824..fd5ccec531 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -109,6 +109,16 @@ paths: 405: description: The VM instance could not resume because it is not paused. + /vm.post-migration-announce: + put: + summary: Trigger post-migration announcements for a running VM instance. + operationId: postMigrationAnnounceVM + responses: + 204: + description: The VM instance successfully triggered post-migration announcements. + 500: + description: The VM instance could not trigger post-migration announcements because it is not running. + /vm.shutdown: put: summary: Shut the VM instance down. 
From 54c920ed1ae9a6d11eab1f61485d29069a04392b Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 31 Mar 2026 11:45:11 +0200 Subject: [PATCH 108/178] net_util: refactor ctrl queue parsing Restructure CtrlQueue::process() so each command parses its own descriptor layout and returns the used length alongside the status descriptor. This is a behavior-neutral cleanup that prepares follow-up control queue features. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- net_util/src/ctrl_queue.rs | 53 ++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index 8b34a33a7a..e5dd02db1d 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -104,22 +104,19 @@ impl CtrlQueue { .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; - let data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; - - let data_desc_addr = data_desc - .addr() - .translate_gva(access_platform, data_desc.len() as usize) - .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; - - let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; - - let ok = match u32::from(ctrl_hdr.class) { + let (ok, status_desc) = match u32::from(ctrl_hdr.class) { VIRTIO_NET_CTRL_MQ => { + let data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; + let data_desc_addr = data_desc + .addr() + .translate_gva(access_platform, data_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; let queue_pairs = desc_chain .memory() .read_obj::(data_desc_addr) .map_err(Error::GuestMemory)?; - if u32::from(ctrl_hdr.cmd) != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET { + let ok = if u32::from(ctrl_hdr.cmd) != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET { warn!("Unsupported command: {}", ctrl_hdr.cmd); false } else if 
(queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN as u16) @@ -130,14 +127,22 @@ impl CtrlQueue { } else { info!("Number of MQ pairs requested: {queue_pairs}"); true - } + }; + + (ok, status_desc) } VIRTIO_NET_CTRL_GUEST_OFFLOADS => { + let data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; + let data_desc_addr = data_desc + .addr() + .translate_gva(access_platform, data_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; let features = desc_chain .memory() .read_obj::(data_desc_addr) .map_err(Error::GuestMemory)?; - if u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET { + let ok = if u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET { let mut ok = true; for tap in self.taps.iter_mut() { info!("Reprogramming tap offload with features: {features}"); @@ -152,15 +157,21 @@ impl CtrlQueue { } else { warn!("Unsupported command: {}", ctrl_hdr.cmd); false - } - } - _ if is_tolerated_ctrl_command(ctrl_hdr) => { - debug!("Ignoring unsupported but tolerated control command {ctrl_hdr:?}"); - true + }; + + (ok, status_desc) } _ => { - warn!("Unsupported command {ctrl_hdr:?}"); - false + let _data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; + let ok = if is_tolerated_ctrl_command(ctrl_hdr) { + debug!("Ignoring unsupported but tolerated control command {ctrl_hdr:?}"); + true + } else { + warn!("Unsupported command {ctrl_hdr:?}"); + false + }; + (ok, status_desc) } }; @@ -174,8 +185,6 @@ impl CtrlQueue { .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; - // Per the virtio spec the used length is bytes the device wrote - // to device-writable descriptors; here just the 1-byte ack. 
queue .add_used(desc_chain.memory(), desc_chain.head_index(), 1) .map_err(Error::QueueAddUsed)?; From 909cb532c5eb2fe0b576bf62daa4cf146cfeff54 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 31 Mar 2026 13:19:09 +0200 Subject: [PATCH 109/178] virtio-devices: net: refactor constructor state Extract virtio-net constructor bookkeeping into a small helper struct and dedicated restore/fresh initialization helpers. This keeps new_with_tap() focused on assembly and makes follow-up feature changes easier to review. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- virtio-devices/src/net.rs | 162 +++++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 62 deletions(-) diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index f711e3b286..a8ce2c4fe9 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -423,7 +423,91 @@ pub struct NetState { // in the Linux kernel's UAPI headers. const ETH_FRAME_LEN: usize = 60; +/// Constructor-time copy of the fields needed to initialize the live device +/// state, derived either from a restored NetState or from fresh defaults. +struct NetConstructorState { + avail_features: u64, + acked_features: u64, + config: VirtioNetConfig, + queue_sizes: Vec, + paused: bool, +} + impl Net { + /// Restores a [`NetConstructorState`] from the provided [`NetState`]. + fn restored_constructor_state(id: &str, state: NetState) -> NetConstructorState { + info!("Restoring virtio-net {id}"); + + NetConstructorState { + avail_features: state.avail_features, + acked_features: state.acked_features, + config: state.config, + queue_sizes: state.queue_size, + paused: true, + } + } + + #[allow(clippy::too_many_arguments)] + /// Creates a new [`NetConstructorState`]. 
+ fn fresh_constructor_state( + guest_mac: Option, + access_platform_enabled: bool, + mtu: Option, + num_queues: usize, + queue_size: u16, + offload_tso: bool, + offload_ufo: bool, + offload_csum: bool, + ) -> NetConstructorState { + let mut avail_features = (1 << VIRTIO_RING_F_EVENT_IDX) | (1 << VIRTIO_F_VERSION_1); + + if mtu.is_some() { + avail_features |= 1 << VIRTIO_NET_F_MTU; + } + + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; + } + + // Configure TSO/UFO features when hardware checksum offload is enabled. + if offload_csum { + avail_features |= (1 << VIRTIO_NET_F_CSUM) + | (1 << VIRTIO_NET_F_GUEST_CSUM) + | (1 << VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); + + if offload_tso { + avail_features |= (1 << VIRTIO_NET_F_HOST_ECN) + | (1 << VIRTIO_NET_F_HOST_TSO4) + | (1 << VIRTIO_NET_F_HOST_TSO6) + | (1 << VIRTIO_NET_F_GUEST_ECN) + | (1 << VIRTIO_NET_F_GUEST_TSO4) + | (1 << VIRTIO_NET_F_GUEST_TSO6); + } + + if offload_ufo { + avail_features |= (1 << VIRTIO_NET_F_HOST_UFO) | (1 << VIRTIO_NET_F_GUEST_UFO); + } + } + + avail_features |= 1 << VIRTIO_NET_F_CTRL_VQ; + let queue_num = num_queues + 1; + + let mut config = VirtioNetConfig::default(); + if let Some(mac) = guest_mac { + build_net_config_space(&mut config, mac, num_queues, mtu, &mut avail_features); + } else { + build_net_config_space_with_mq(&mut config, num_queues, mtu, &mut avail_features); + } + + NetConstructorState { + avail_features, + acked_features: 0, + config, + queue_sizes: vec![queue_size; queue_num], + paused: false, + } + } + /// Create a new virtio network device with the given TAP interface. 
#[allow(clippy::too_many_arguments)] pub fn new_with_tap( @@ -452,81 +536,35 @@ impl Net { } }; - let (avail_features, acked_features, config, queue_sizes, paused) = if let Some(state) = - state - { - info!("Restoring virtio-net {id}"); - ( - state.avail_features, - state.acked_features, - state.config, - state.queue_size, - true, - ) + let constructor_state = if let Some(state) = state { + Self::restored_constructor_state(&id, state) } else { - let mut avail_features = (1 << VIRTIO_RING_F_EVENT_IDX) | (1 << VIRTIO_F_VERSION_1); - - if mtu.is_some() { - avail_features |= 1 << VIRTIO_NET_F_MTU; - } - - if access_platform_enabled { - avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; - } - - // Configure TSO/UFO features when hardware checksum offload is enabled. - if offload_csum { - avail_features |= (1 << VIRTIO_NET_F_CSUM) - | (1 << VIRTIO_NET_F_GUEST_CSUM) - | (1 << VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); - - if offload_tso { - avail_features |= (1 << VIRTIO_NET_F_HOST_ECN) - | (1 << VIRTIO_NET_F_HOST_TSO4) - | (1 << VIRTIO_NET_F_HOST_TSO6) - | (1 << VIRTIO_NET_F_GUEST_ECN) - | (1 << VIRTIO_NET_F_GUEST_TSO4) - | (1 << VIRTIO_NET_F_GUEST_TSO6); - } - - if offload_ufo { - avail_features |= (1 << VIRTIO_NET_F_HOST_UFO) | (1 << VIRTIO_NET_F_GUEST_UFO); - } - } - - avail_features |= 1 << VIRTIO_NET_F_CTRL_VQ; - let queue_num = num_queues + 1; - - let mut config = VirtioNetConfig::default(); - if let Some(mac) = guest_mac { - build_net_config_space(&mut config, mac, num_queues, mtu, &mut avail_features); - } else { - build_net_config_space_with_mq(&mut config, num_queues, mtu, &mut avail_features); - } - - ( - avail_features, - 0, - config, - vec![queue_size; queue_num], - false, + Self::fresh_constructor_state( + guest_mac, + access_platform_enabled, + mtu, + num_queues, + queue_size, + offload_tso, + offload_ufo, + offload_csum, ) }; Ok(Net { common: VirtioCommon { device_type: VirtioDeviceType::Net as u32, - avail_features, - acked_features, - queue_sizes, + 
avail_features: constructor_state.avail_features, + acked_features: constructor_state.acked_features, + queue_sizes: constructor_state.queue_sizes, paused_sync: Some(Arc::new(Barrier::new((num_queues / 2) + 1))), min_queues: 2, - paused: Arc::new(AtomicBool::new(paused)), + paused: Arc::new(AtomicBool::new(constructor_state.paused)), ..Default::default() }, id, taps, - config, + config: constructor_state.config, ctrl_queue_epoll_thread: None, counters: NetCounters::default(), seccomp_action, From 70fc3a25b6c3d65cf00484306c31cee537e9af75 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 7 Apr 2026 14:39:30 +0200 Subject: [PATCH 110/178] virtio-devices: net: report link up in config status On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- virtio-devices/src/net.rs | 76 +++++++++++++++++++++- virtio-devices/src/vhost_user/net.rs | 97 ++++++++++++++++++++++++---- 2 files changed, 159 insertions(+), 14 deletions(-) diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index a8ce2c4fe9..8c66ee3510 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -490,6 +490,7 @@ impl Net { } avail_features |= 1 << VIRTIO_NET_F_CTRL_VQ; + avail_features |= 1 << VIRTIO_NET_F_STATUS; let queue_num = num_queues + 1; let mut config = VirtioNetConfig::default(); @@ -687,6 +688,22 @@ impl Net { } } + /// Return the guest-visible virtio-net config, recomputing `status` from the + /// current state of the device. + fn config_with_status(&self) -> VirtioNetConfig { + let mut config = self.config; + + // We want to recompute the guest-visible status field from the current state of + // the device. We clear this field first to avoid showing stale data. 
+ config.status = 0; + + if self.common.feature_acked(VIRTIO_NET_F_STATUS.into()) { + config.status |= VIRTIO_NET_S_LINK_UP as u16; + } + + config + } + #[cfg(fuzzing)] pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); @@ -773,7 +790,8 @@ impl VirtioDevice for Net { } fn read_config(&self, offset: u64, data: &mut [u8]) { - self.read_config_from_slice(self.config.as_slice(), offset, data); + let config = self.config_with_status(); + self.read_config_from_slice(config.as_slice(), offset, data); } fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { @@ -1020,3 +1038,59 @@ impl PostMigrationAnnouncer for TapRarpAnnouncer { } } } + +#[cfg(test)] +mod unit_tests { + use std::mem::size_of; + + use seccompiler::SeccompAction; + use virtio_bindings::virtio_net::{VIRTIO_NET_F_STATUS, VIRTIO_NET_S_LINK_UP}; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + + fn test_net(acked_features: u64) -> Net { + Net { + common: VirtioCommon { + acked_features, + ..Default::default() + }, + id: "test-net".to_string(), + taps: Vec::new(), + config: VirtioNetConfig::default(), + ctrl_queue_epoll_thread: None, + counters: NetCounters::default(), + seccomp_action: SeccompAction::Allow, + rate_limiter_config: None, + exit_evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + device_status: Arc::new(AtomicU8::new(0)), + } + } + + const STATUS_OFFSET: usize = std::mem::offset_of!(VirtioNetConfig, status); + fn read_status(device: &Net) -> u16 { + let mut data = vec![0; size_of::()]; + device.read_config(0, &mut data); + + u16::from_le_bytes( + data[STATUS_OFFSET..STATUS_OFFSET + size_of::()] + .try_into() + .unwrap(), + ) + } + + #[test] + fn test_fresh_constructor_state_exposes_status() { + let state = + Net::fresh_constructor_state(None, false, Some(MIN_MTU), 2, 256, false, false, false); + + assert_ne!(state.avail_features & (1 << VIRTIO_NET_F_STATUS), 0); + } + + #[test] + fn test_status_feature_reports_link_up() { + let 
net = test_net(1 << VIRTIO_NET_F_STATUS); + + assert_eq!(read_status(&net), VIRTIO_NET_S_LINK_UP as u16); + } +} diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index eed7c8284d..950c7192af 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -14,7 +14,8 @@ use virtio_bindings::virtio_net::{ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, - VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, + VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, VIRTIO_NET_F_STATUS, + VIRTIO_NET_S_LINK_UP, }; use virtio_bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use virtio_queue::QueueT; @@ -88,10 +89,10 @@ impl Net { ) = if let Some(state) = state { info!("Restoring vhost-user-net {id}"); - // The backend acknowledged features must not contain - // VIRTIO_NET_F_MAC since we don't expect the backend - // to handle it. - let backend_acked_features = state.acked_features & !(1 << VIRTIO_NET_F_MAC); + // The backend acknowledged features must not contain frontend-only + // bits since we don't expect the backend to handle them. + let backend_acked_features = + state.acked_features & !((1 << VIRTIO_NET_F_MAC) | (1 << VIRTIO_NET_F_STATUS)); vu.set_protocol_features_vhost_user( backend_acked_features, @@ -179,9 +180,9 @@ impl Net { num_queues += 1; } - // Make sure the virtio feature to set the MAC address is exposed to - // the guest, even if it hasn't been negotiated with the backend. - acked_features |= 1 << VIRTIO_NET_F_MAC; + // Make sure frontend-owned config-space features stay exposed to + // the guest, even if they are not negotiated with the backend. 
+ acked_features |= (1 << VIRTIO_NET_F_MAC) | (1 << VIRTIO_NET_F_STATUS); ( acked_features, @@ -231,6 +232,26 @@ impl Net { fn state(&self) -> std::result::Result { self.vu_common.state(self.config) } + + /// Return the guest-visible virtio-net config, recomputing `status` from the + /// current state of the device. + fn config_with_status(&self) -> VirtioNetConfig { + let mut config = self.config; + + // We want to recompute the guest-visible status field from the current state of + // the device. We clear this field first to avoid showing stale data. + config.status = 0; + + if self + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_STATUS.into()) + { + config.status |= VIRTIO_NET_S_LINK_UP as u16; + } + + config + } } impl Drop for Net { @@ -267,7 +288,8 @@ impl VirtioDevice for Net { } fn read_config(&self, offset: u64, data: &mut [u8]) { - self.read_config_from_slice(self.config.as_slice(), offset, data); + let config = self.config_with_status(); + self.read_config_from_slice(config.as_slice(), offset, data); } fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { @@ -335,10 +357,10 @@ impl VirtioDevice for Net { let backend_req_handler: Option> = None; - // The backend acknowledged features must not contain VIRTIO_NET_F_MAC - // since we don't expect the backend to handle it. - let backend_acked_features = - self.vu_common.virtio_common.acked_features & !(1 << VIRTIO_NET_F_MAC); + // The backend acknowledged features must not contain frontend-only + // bits since we don't expect the backend to handle them. + let backend_acked_features = self.vu_common.virtio_common.acked_features + & !((1 << VIRTIO_NET_F_MAC) | (1 << VIRTIO_NET_F_STATUS)); // Run a dedicated thread for handling potential reconnections with // the backend. 
@@ -442,3 +464,52 @@ impl Migratable for Net { self.vu_common.complete_migration() } } + +#[cfg(test)] +mod unit_tests { + use std::mem::size_of; + + use seccompiler::SeccompAction; + use virtio_bindings::virtio_net::{VIRTIO_NET_F_STATUS, VIRTIO_NET_S_LINK_UP}; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + + fn test_net(acked_features: u64) -> Net { + Net { + vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + acked_features, + ..Default::default() + }, + ..Default::default() + }, + id: "test-vu-net".to_string(), + config: VirtioNetConfig::default(), + guest_memory: None, + ctrl_queue_epoll_thread: None, + seccomp_action: SeccompAction::Allow, + exit_evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + access_platform_enabled: false, + } + } + + const STATUS_OFFSET: usize = std::mem::offset_of!(VirtioNetConfig, status); + fn read_status(device: &Net) -> u16 { + let mut data = vec![0; size_of::()]; + device.read_config(0, &mut data); + + u16::from_le_bytes( + data[STATUS_OFFSET..STATUS_OFFSET + size_of::()] + .try_into() + .unwrap(), + ) + } + + #[test] + fn test_status_feature_reports_link_up() { + let net = test_net(1 << VIRTIO_NET_F_STATUS); + + assert_eq!(read_status(&net), VIRTIO_NET_S_LINK_UP as u16); + } +} From cd9eb4b77d009e5507d7f07666847069cc037c55 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 31 Mar 2026 13:21:59 +0200 Subject: [PATCH 111/178] virtio-devices: net: support guest announce after migration In addition to the RARP announcement, advertise VIRTIO_NET_F_GUEST_ANNOUNCE on virtio-net devices and request a guest announcement after migration by setting the announce status bit and raising a config interrupt. Handle the guest announce ACK on the control queue. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- docs/live_migration.md | 8 ++ net_util/src/ctrl_queue.rs | 26 +++++- virtio-devices/src/net.rs | 84 +++++++++++++---- virtio-devices/src/vhost_user/net.rs | 129 ++++++++++++++++++++++----- 4 files changed, 206 insertions(+), 41 deletions(-) diff --git a/docs/live_migration.md b/docs/live_migration.md index 271df04aaf..36191dfc0c 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -134,6 +134,10 @@ src $ ch-remote --api-socket=/tmp/api send-migration unix:/tmp/sock When the above commands completed, the VM should be successfully migrated to the destination machine without interrupting the workload. +Cloud Hypervisor sends out RARP packets after the migration, to +announce the new location of the VM to the network. For `virtio-net` +devices, Cloud Hypervisor asks guests that negotiated +`VIRTIO_NET_F_GUEST_ANNOUNCE` to also re-announce themselves. ### TCP Socket Migration @@ -190,6 +194,10 @@ After completing the above commands, the source VM will be migrated to the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. +Cloud Hypervisor sends out RARP packets after the migration, to +announce the new location of the VM to the network. For `virtio-net` +devices, Cloud Hypervisor asks guests that negotiated +`VIRTIO_NET_F_GUEST_ANNOUNCE` to also re-announce themselves.
#### Encryption diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index e5dd02db1d..15fcaf0422 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -2,6 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + use log::{debug, error, info, warn}; use thiserror::Error; use virtio_bindings::virtio_net::{ @@ -72,18 +75,23 @@ fn is_tolerated_ctrl_command(ctrl_hdr: ControlHeader) -> bool { u32::from(ctrl_hdr.cmd), VIRTIO_NET_CTRL_VLAN_ADD | VIRTIO_NET_CTRL_VLAN_DEL ), - VIRTIO_NET_CTRL_ANNOUNCE => u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_ANNOUNCE_ACK, _ => false, } } pub struct CtrlQueue { pub taps: Vec, + /// Tracks whether the guest still needs to acknowledge a post-migration + /// announce request through the control queue. + pub announce_pending: Arc, } impl CtrlQueue { - pub fn new(taps: Vec) -> Self { - CtrlQueue { taps } + pub fn new(taps: Vec, announce_pending: Arc) -> Self { + CtrlQueue { + taps, + announce_pending, + } } pub fn process( @@ -161,6 +169,18 @@ impl CtrlQueue { (ok, status_desc) } + VIRTIO_NET_CTRL_ANNOUNCE => { + let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; + let ok = if u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_ANNOUNCE_ACK { + self.announce_pending.store(false, Ordering::Release); + true + } else { + warn!("Unsupported command: {}", ctrl_hdr.cmd); + false + }; + + (ok, status_desc) + } _ => { let _data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 8c66ee3510..10bb1f437c 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -402,6 +402,9 @@ pub struct Net { id: String, taps: Vec, config: VirtioNetConfig, + /// Tracks whether the guest still needs to acknowledge a post-migration + /// announce request through the control 
queue. + announce_pending: Arc, ctrl_queue_epoll_thread: Option>, counters: NetCounters, seccomp_action: SeccompAction, @@ -411,10 +414,13 @@ pub struct Net { } #[derive(Serialize, Deserialize)] +/// Serialized snapshot of the device state. The fields are copied from the +/// live device when snapshotting and restored back into a new device instance. pub struct NetState { pub avail_features: u64, pub acked_features: u64, pub config: VirtioNetConfig, + pub announce_pending: bool, pub queue_size: Vec, } @@ -429,6 +435,7 @@ struct NetConstructorState { avail_features: u64, acked_features: u64, config: VirtioNetConfig, + announce_pending: bool, queue_sizes: Vec, paused: bool, } @@ -442,6 +449,7 @@ impl Net { avail_features: state.avail_features, acked_features: state.acked_features, config: state.config, + announce_pending: state.announce_pending, queue_sizes: state.queue_size, paused: true, } @@ -491,6 +499,7 @@ impl Net { avail_features |= 1 << VIRTIO_NET_F_CTRL_VQ; avail_features |= 1 << VIRTIO_NET_F_STATUS; + avail_features |= 1 << VIRTIO_NET_F_GUEST_ANNOUNCE; let queue_num = num_queues + 1; let mut config = VirtioNetConfig::default(); @@ -504,6 +513,7 @@ impl Net { avail_features, acked_features: 0, config, + announce_pending: false, queue_sizes: vec![queue_size; queue_num], paused: false, } @@ -566,6 +576,7 @@ impl Net { id, taps, config: constructor_state.config, + announce_pending: Arc::new(AtomicBool::new(constructor_state.announce_pending)), ctrl_queue_epoll_thread: None, counters: NetCounters::default(), seccomp_action, @@ -684,6 +695,7 @@ impl Net { avail_features: self.common.avail_features, acked_features: self.common.acked_features, config: self.config, + announce_pending: self.announce_pending.load(Ordering::Acquire), queue_size: self.common.queue_sizes.clone(), } } @@ -699,6 +711,10 @@ impl Net { if self.common.feature_acked(VIRTIO_NET_F_STATUS.into()) { config.status |= VIRTIO_NET_S_LINK_UP as u16; + + if self.announce_pending.load(Ordering::Acquire) 
{ + config.status |= VIRTIO_NET_S_ANNOUNCE as u16; + } } config @@ -825,7 +841,7 @@ impl VirtioDevice for Net { mem: mem.clone(), kill_evt, pause_evt, - ctrl_q: CtrlQueue::new(self.taps.clone()), + ctrl_q: CtrlQueue::new(self.taps.clone(), Arc::clone(&self.announce_pending)), queue: ctrl_queue, queue_evt: ctrl_queue_evt, access_platform: self.common.access_platform(), @@ -936,6 +952,7 @@ impl VirtioDevice for Net { fn reset(&mut self) { self.common.reset(); + self.announce_pending.store(false, Ordering::Release); event!("virtio-device", "reset", "id", &self.id); } @@ -971,10 +988,7 @@ impl VirtioDevice for Net { } fn post_migration_announcer(&self) -> Option> { - Some(Box::new(TapRarpAnnouncer::new( - self.build_rarp_announce(), - self.taps.clone(), - ))) + Some(Box::new(VirtioNetPostMigrationAnnouncer::new(self))) } } @@ -1005,25 +1019,42 @@ impl Snapshottable for Net { impl Transportable for Net {} impl Migratable for Net {} -/// Sends RARP packets on a virtio-net device, to update the MAC to port -/// mappings of switches in the network. This reduces the time until network -/// packets reliably arrive at the network device. -pub struct TapRarpAnnouncer { - announce: [u8; ETH_FRAME_LEN], // Buffer for the raw RARP packet. - taps: Vec, // The TAP devices to the the packets on. +/// Announces this virtio-net device on the network. +/// Most fields are cloned references to device state so retry rounds can run +/// without borrowing the device itself. +pub struct VirtioNetPostMigrationAnnouncer { + id: String, + /// Remembers whether this device negotiated the guest-visible announce path. + guest_announce_negotiated: bool, + announce_pending: Arc, + interrupt_cb: Option>, + /// Prebuilt host-side RARP payload used for immediate post-migration + /// announcement retries. 
+ rarp_announce: [u8; ETH_FRAME_LEN], + taps: Vec, } -impl TapRarpAnnouncer { - pub fn new(announce: [u8; 60], taps: Vec) -> Self { - Self { announce, taps } +impl VirtioNetPostMigrationAnnouncer { + pub fn new(dev: &Net) -> Self { + Self { + id: dev.id.clone(), + guest_announce_negotiated: dev.common.feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()), + announce_pending: Arc::clone(&dev.announce_pending), + interrupt_cb: dev.common.interrupt_cb.clone(), + rarp_announce: dev.build_rarp_announce(), + taps: dev.taps.clone(), + } } } -impl PostMigrationAnnouncer for TapRarpAnnouncer { +impl PostMigrationAnnouncer for VirtioNetPostMigrationAnnouncer { + // Send a host-side RARP immediately so the network can converge before the + // guest runs again, and then also ask the guest to re-announce itself when + // GUEST_ANNOUNCE was negotiated. fn announce(&mut self) { - // We have to add a virtio-net header to the announce. - let mut buf = vec![0u8; vnet_hdr_len() + self.announce.len()]; - buf[vnet_hdr_len()..].copy_from_slice(&self.announce); + // We have to add a virtio-net header to the RARP announce. 
+ let mut buf = vec![0u8; vnet_hdr_len() + self.rarp_announce.len()]; + buf[vnet_hdr_len()..].copy_from_slice(&self.rarp_announce); for tap in &self.taps { // SAFETY: `buf.as_ptr()` is valid for `buf.len()` bytes and remains @@ -1036,6 +1067,22 @@ impl PostMigrationAnnouncer for TapRarpAnnouncer { ) }; } + + if self.guest_announce_negotiated + && let Some(interrupt_cb) = &self.interrupt_cb + { + self.announce_pending.store(true, Ordering::Release); + + interrupt_cb + .trigger(VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to send interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } } } @@ -1058,6 +1105,7 @@ mod unit_tests { id: "test-net".to_string(), taps: Vec::new(), config: VirtioNetConfig::default(), + announce_pending: Arc::new(AtomicBool::new(false)), ctrl_queue_epoll_thread: None, counters: NetCounters::default(), seccomp_action: SeccompAction::Allow, diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 950c7192af..818f3ce67b 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -1,21 +1,21 @@ // Copyright 2019 Intel Corporation. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -use std::sync::atomic::AtomicBool; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{result, thread}; -use log::{error, info}; +use log::{error, info, warn}; use net_util::{CtrlQueue, MacAddr, VirtioNetConfig, build_net_config_space}; use seccompiler::SeccompAction; use vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; use virtio_bindings::virtio_net::{ - VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_ECN, - VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, - VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, - VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, VIRTIO_NET_F_STATUS, - VIRTIO_NET_S_LINK_UP, + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_GUEST_CSUM, + VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, + VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, + VIRTIO_NET_F_STATUS, VIRTIO_NET_S_ANNOUNCE, VIRTIO_NET_S_LINK_UP, }; use virtio_bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use virtio_queue::QueueT; @@ -29,8 +29,8 @@ use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; use crate::vhost_user::{DEFAULT_VIRTIO_FEATURES, Error, Result, VhostUserCommon, VhostUserState}; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, - VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, PostMigrationAnnouncer, + VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, 
VirtioDeviceType, VirtioInterrupt, }; const DEFAULT_QUEUE_NUMBER: usize = 2; @@ -44,6 +44,9 @@ pub struct Net { vu_common: VhostUserCommon, id: String, config: VirtioNetConfig, + /// Tracks whether the guest still needs to acknowledge a post-migration + /// announce request through the control queue. + announce_pending: Arc, guest_memory: Option>, ctrl_queue_epoll_thread: Option>, seccomp_action: SeccompAction, @@ -52,6 +55,23 @@ pub struct Net { } impl Net { + /// Derive the guest-visible feature set from the backend-negotiated + /// features plus frontend-only bits that Cloud Hypervisor implements + /// locally, such as `VIRTIO_NET_F_MAC`, `VIRTIO_NET_F_STATUS`, and + /// `VIRTIO_NET_F_GUEST_ANNOUNCE`. + fn frontend_avail_features(backend_acked_features: u64) -> u64 { + let mut guest_avail_features = backend_acked_features | (1 << VIRTIO_NET_F_MAC); + + // Guest announce is implemented by the frontend through config + // changes and the locally handled control queue. + if guest_avail_features & (1 << VIRTIO_NET_F_CTRL_VQ) != 0 { + guest_avail_features |= 1 << VIRTIO_NET_F_STATUS; + guest_avail_features |= 1 << VIRTIO_NET_F_GUEST_ANNOUNCE; + } + + guest_avail_features + } + /// Create a new vhost-user-net device #[allow(clippy::too_many_arguments)] pub fn new( @@ -84,6 +104,7 @@ impl Net { acked_protocol_features, vu_num_queues, config, + announce_pending, paused, vring_bases, ) = if let Some(state) = state { @@ -91,8 +112,10 @@ impl Net { // The backend acknowledged features must not contain frontend-only // bits since we don't expect the backend to handle them. 
- let backend_acked_features = - state.acked_features & !((1 << VIRTIO_NET_F_MAC) | (1 << VIRTIO_NET_F_STATUS)); + let backend_acked_features = state.acked_features + & !((1 << VIRTIO_NET_F_MAC) + | (1 << VIRTIO_NET_F_STATUS) + | (1 << VIRTIO_NET_F_GUEST_ANNOUNCE)); vu.set_protocol_features_vhost_user( backend_acked_features, @@ -107,12 +130,15 @@ impl Net { num_queues += 1; } + let announce_pending = (state.config.status & (VIRTIO_NET_S_ANNOUNCE as u16)) != 0; + ( state.avail_features, state.acked_features, state.acked_protocol_features, state.vu_num_queues, state.config, + announce_pending, true, state.vring_bases, ) @@ -120,6 +146,7 @@ impl Net { // Filling device and vring features VMM supports. let mut avail_features = (1 << VIRTIO_NET_F_MRG_RXBUF) | (1 << VIRTIO_NET_F_CTRL_VQ) + | (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | DEFAULT_VIRTIO_FEATURES; if mtu.is_some() { @@ -154,7 +181,7 @@ impl Net { | VhostUserProtocolFeatures::LOG_SHMFD | VhostUserProtocolFeatures::DEVICE_STATE; - let (mut acked_features, acked_protocol_features) = + let (acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; let backend_num_queues = @@ -180,12 +207,12 @@ impl Net { num_queues += 1; } - // Make sure frontend-owned config-space features stay exposed to - // the guest, even if they are not negotiated with the backend. - acked_features |= (1 << VIRTIO_NET_F_MAC) | (1 << VIRTIO_NET_F_STATUS); + // Build the feature set that gets exposed to the guest. Some frontend available + // features are dependent on the features the backend supports. 
+ let guest_avail_features = Self::frontend_avail_features(acked_features); ( - acked_features, + guest_avail_features, // If part of the available features that have been acked, // the PROTOCOL_FEATURES bit must be already set through // the VIRTIO acked features as we know the guest would @@ -195,6 +222,7 @@ impl Net { vu_num_queues, config, false, + false, None, ) }; @@ -221,6 +249,7 @@ impl Net { ..Default::default() }, config, + announce_pending: Arc::new(AtomicBool::new(announce_pending)), guest_memory: None, ctrl_queue_epoll_thread: None, seccomp_action, @@ -230,7 +259,7 @@ impl Net { } fn state(&self) -> std::result::Result { - self.vu_common.state(self.config) + self.vu_common.state(self.config_with_status()) } /// Return the guest-visible virtio-net config, recomputing `status` from the @@ -248,6 +277,10 @@ impl Net { .feature_acked(VIRTIO_NET_F_STATUS.into()) { config.status |= VIRTIO_NET_S_LINK_UP as u16; + + if self.announce_pending.load(Ordering::Acquire) { + config.status |= VIRTIO_NET_S_ANNOUNCE as u16; + } } config @@ -326,7 +359,7 @@ impl VirtioDevice for Net { mem: mem.clone(), kill_evt, pause_evt, - ctrl_q: CtrlQueue::new(Vec::new()), + ctrl_q: CtrlQueue::new(Vec::new(), Arc::clone(&self.announce_pending)), queue: ctrl_queue, queue_evt: ctrl_queue_evt, access_platform: None, @@ -358,9 +391,11 @@ impl VirtioDevice for Net { let backend_req_handler: Option> = None; // The backend acknowledged features must not contain frontend-only - // bits since we don't expect the backend to handle them. + // features since we don't expect the backend to handle them. let backend_acked_features = self.vu_common.virtio_common.acked_features - & !((1 << VIRTIO_NET_F_MAC) | (1 << VIRTIO_NET_F_STATUS)); + & !((1 << VIRTIO_NET_F_MAC) + | (1 << VIRTIO_NET_F_STATUS) + | (1 << VIRTIO_NET_F_GUEST_ANNOUNCE)); // Run a dedicated thread for handling potential reconnections with // the backend. 
@@ -397,12 +432,17 @@ impl VirtioDevice for Net { fn reset(&mut self) { self.vu_common.reset(&self.id); + self.announce_pending.store(false, Ordering::Release); } fn shutdown(&mut self) { self.vu_common.shutdown(); } + fn post_migration_announcer(&self) -> Option> { + Some(Box::new(VhostUserNetPostMigrationAnnouncer::new(self))) + } + fn add_memory_region( &mut self, region: &Arc, @@ -465,6 +505,54 @@ impl Migratable for Net { } } +/// Announces this vhost-user-net device on the network. +/// Most fields are cloned references to device state so retry rounds can run +/// without borrowing the device itself. +pub struct VhostUserNetPostMigrationAnnouncer { + id: String, + /// Remembers whether this device negotiated the guest-visible announce path. + guest_announce_negotiated: bool, + announce_pending: Arc, + interrupt_cb: Option>, +} + +impl VhostUserNetPostMigrationAnnouncer { + pub fn new(dev: &Net) -> Self { + Self { + id: dev.id.clone(), + guest_announce_negotiated: dev + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()), + announce_pending: Arc::clone(&dev.announce_pending), + interrupt_cb: dev.vu_common.virtio_common.interrupt_cb.clone(), + } + } +} + +impl PostMigrationAnnouncer for VhostUserNetPostMigrationAnnouncer { + // Vhost-user-net relies on the guest-visible announce path: mark the + // request pending and re-trigger the config interrupt while this retry + // session remains valid. 
+ fn announce(&mut self) { + if self.guest_announce_negotiated + && let Some(interrupt_cb) = &self.interrupt_cb + { + self.announce_pending.store(true, Ordering::Release); + + interrupt_cb + .trigger(crate::VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to send interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } + } +} + #[cfg(test)] mod unit_tests { use std::mem::size_of; @@ -486,6 +574,7 @@ mod unit_tests { }, id: "test-vu-net".to_string(), config: VirtioNetConfig::default(), + announce_pending: Arc::new(AtomicBool::new(false)), guest_memory: None, ctrl_queue_epoll_thread: None, seccomp_action: SeccompAction::Allow, From 8a01679ae2fc6c6bf91a6bbca99f82c3f7377c47 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Fri, 27 Mar 2026 16:08:13 +0100 Subject: [PATCH 112/178] virtio-devices: net: add guest announce tests Add unit tests for the new guest-announce flow in the control queue, virtio-net, and vhost-user-net. The tests cover setting and clearing the announce state, triggering the config interrupt, and disabling the host-side RARP fallback when the guest negotiated VIRTIO_NET_F_GUEST_ANNOUNCE. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- net_util/src/ctrl_queue.rs | 64 +++++++++++++++ virtio-devices/src/net.rs | 107 ++++++++++++++++++++++++- virtio-devices/src/vhost_user/net.rs | 115 ++++++++++++++++++++++++++- 3 files changed, 279 insertions(+), 7 deletions(-) diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index 15fcaf0422..530c9367ab 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -220,3 +220,67 @@ impl CtrlQueue { Ok(()) } } + +#[cfg(test)] +mod unit_tests { + use std::mem::size_of; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, Ordering}; + + use virtio_bindings::virtio_net::{VIRTIO_NET_CTRL_ANNOUNCE, VIRTIO_NET_CTRL_ANNOUNCE_ACK}; + use virtio_bindings::virtio_ring::{VRING_DESC_F_NEXT, VRING_DESC_F_WRITE}; + use vm_memory::{Bytes, GuestAddress}; + use vm_virtio::queue::testing::VirtQueue as GuestQ; + + use super::*; + + #[test] + fn test_process_announce_ack() { + // The guest acknowledges the post-migration request on the control + // queue, which clears the pending announce state in the device model. + // This test builds the minimal virtqueue request for that command: + // one readable descriptor for the control header, followed by one + // writable descriptor where the device stores the command status. + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let guest_q = GuestQ::new(GuestAddress(0), &mem, 16); + let mut queue = guest_q.create_queue(); + + let ctrl_hdr = ControlHeader { + class: VIRTIO_NET_CTRL_ANNOUNCE as u8, + cmd: VIRTIO_NET_CTRL_ANNOUNCE_ACK as u8, + }; + let ctrl_addr = GuestAddress(0x1000); + let status_addr = GuestAddress(0x1100); + mem.write_obj(ctrl_hdr, ctrl_addr).unwrap(); + + // Descriptor 0 contains the control header and points to descriptor 1. 
+ guest_q.dtable[0].set( + ctrl_addr.0, + size_of::() as u32, + VRING_DESC_F_NEXT.try_into().unwrap(), + 1, + ); + // Descriptor 1 is the writable status byte returned by the device. + guest_q.dtable[1].set(status_addr.0, 1, VRING_DESC_F_WRITE.try_into().unwrap(), 0); + + // Publish the two-descriptor request to the available ring so + // CtrlQueue::process() can pop and handle it. + guest_q.avail.ring[0].set(0); + guest_q.avail.idx.set(1); + + // Start from the state reached after post_migration(): the guest still + // owes us an ANNOUNCE_ACK on the control queue. + let announce_pending = Arc::new(AtomicBool::new(true)); + let mut ctrl_q = CtrlQueue::new(Vec::new(), Arc::clone(&announce_pending)); + + ctrl_q.process(&mem, &mut queue, None).unwrap(); + + // A successful ANNOUNCE_ACK clears the pending flag and reports + // VIRTIO_NET_OK in the guest-provided status buffer. + assert!(!announce_pending.load(Ordering::Acquire)); + assert_eq!( + mem.read_obj::(status_addr).unwrap(), + VIRTIO_NET_OK as u8 + ); + } +} diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 10bb1f437c..95d86d7f2c 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -1089,17 +1089,57 @@ impl PostMigrationAnnouncer for VirtioNetPostMigrationAnnouncer { #[cfg(test)] mod unit_tests { use std::mem::size_of; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use seccompiler::SeccompAction; - use virtio_bindings::virtio_net::{VIRTIO_NET_F_STATUS, VIRTIO_NET_S_LINK_UP}; + use virtio_bindings::virtio_net::{ + VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_STATUS, VIRTIO_NET_S_ANNOUNCE, + VIRTIO_NET_S_LINK_UP, + }; use vmm_sys_util::eventfd::EventFd; use super::*; + use crate::device::{VirtioInterrupt, VirtioInterruptType}; - fn test_net(acked_features: u64) -> Net { + struct TestInterrupt { + config_count: AtomicUsize, + } + + impl TestInterrupt { + fn new() -> Self { + Self { + config_count: AtomicUsize::new(0), + } + } + } + + 
impl VirtioInterrupt for TestInterrupt { + fn trigger( + &self, + int_type: VirtioInterruptType, + ) -> std::result::Result<(), std::io::Error> { + if matches!(int_type, VirtioInterruptType::Config) { + self.config_count.fetch_add(1, Ordering::AcqRel); + } + Ok(()) + } + + fn set_notifier( + &self, + _int_type: u32, + _notifier: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } + } + + fn test_net(acked_features: u64, interrupt_cb: Option>) -> Net { Net { common: VirtioCommon { acked_features, + interrupt_cb, ..Default::default() }, id: "test-net".to_string(), @@ -1137,8 +1177,69 @@ mod unit_tests { #[test] fn test_status_feature_reports_link_up() { - let net = test_net(1 << VIRTIO_NET_F_STATUS); + let net = test_net(1 << VIRTIO_NET_F_STATUS, None); assert_eq!(read_status(&net), VIRTIO_NET_S_LINK_UP as u16); } + + #[test] + fn test_post_migration_sets_announce_and_triggers_config() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_post_migration_without_feature_is_noop() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net(0, Some(interrupt.clone() as Arc)); + + net.post_migration_announcer().unwrap().announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); + } + + #[test] + fn test_post_migration_retries_retrigger_config_interrupt() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << 
VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + announcer.announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 2); + } + + #[test] + fn test_reset_clears_pending_announce() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + assert!(net.announce_pending.load(Ordering::Acquire)); + + net.reset(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + } } diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 818f3ce67b..6565a4502e 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -552,22 +552,61 @@ impl PostMigrationAnnouncer for VhostUserNetPostMigrationAnnouncer { } } } - #[cfg(test)] mod unit_tests { use std::mem::size_of; + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; use seccompiler::SeccompAction; - use virtio_bindings::virtio_net::{VIRTIO_NET_F_STATUS, VIRTIO_NET_S_LINK_UP}; + use virtio_bindings::virtio_net::{ + VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_STATUS, VIRTIO_NET_S_ANNOUNCE, + VIRTIO_NET_S_LINK_UP, + }; use vmm_sys_util::eventfd::EventFd; use super::*; + use crate::device::{VirtioInterrupt, VirtioInterruptType}; + + struct TestInterrupt { + config_count: AtomicUsize, + } - fn test_net(acked_features: u64) -> Net { + impl TestInterrupt { + fn new() -> Self { + Self { + config_count: AtomicUsize::new(0), + } + } + } + + impl VirtioInterrupt for TestInterrupt { + fn trigger( + &self, + int_type: VirtioInterruptType, + ) -> 
std::result::Result<(), std::io::Error> { + if matches!(int_type, VirtioInterruptType::Config) { + self.config_count.fetch_add(1, Ordering::AcqRel); + } + Ok(()) + } + + fn set_notifier( + &self, + _int_type: u32, + _notifier: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } + } + + fn test_net(acked_features: u64, interrupt_cb: Option>) -> Net { Net { vu_common: VhostUserCommon { virtio_common: VirtioCommon { acked_features, + interrupt_cb, ..Default::default() }, ..Default::default() @@ -597,8 +636,76 @@ mod unit_tests { #[test] fn test_status_feature_reports_link_up() { - let net = test_net(1 << VIRTIO_NET_F_STATUS); + let net = test_net(1 << VIRTIO_NET_F_STATUS, None); assert_eq!(read_status(&net), VIRTIO_NET_S_LINK_UP as u16); } + + #[test] + fn test_post_migration_sets_announce_and_triggers_config() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_frontend_avail_features_expose_guest_announce_and_status() { + let avail_features = Net::frontend_avail_features(1 << VIRTIO_NET_F_CTRL_VQ); + + assert_ne!(avail_features & (1 << VIRTIO_NET_F_MAC), 0); + assert_ne!(avail_features & (1 << VIRTIO_NET_F_STATUS), 0); + assert_ne!(avail_features & (1 << VIRTIO_NET_F_GUEST_ANNOUNCE), 0); + } + + #[test] + fn test_post_migration_without_feature_is_noop() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net(0, Some(interrupt.clone() as Arc)); + + net.post_migration_announcer().unwrap().announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 
0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); + } + + #[test] + fn test_post_migration_with_ctrl_vq_but_without_guest_announce_is_noop() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + 1 << VIRTIO_NET_F_CTRL_VQ, + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); + } + + #[test] + fn test_reset_clears_pending_announce() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + + net.post_migration_announcer().unwrap().announce(); + assert!(net.announce_pending.load(Ordering::Acquire)); + + net.reset(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + } } From f07580bdaf408167b7fd3597f1ec034e560e81be Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 31 Mar 2026 16:20:50 +0200 Subject: [PATCH 113/178] virtio-devices: net: fix guest announce compatibility Preserve migration compatibility with older snapshots by defaulting a missing announce_pending field to false during deserialization, and cover both cases with regression tests. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- Cargo.lock | 1 + virtio-devices/Cargo.toml | 3 +++ virtio-devices/src/net.rs | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 02eea958cb..3da2b3eda3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2601,6 +2601,7 @@ dependencies = [ "rate_limiter", "seccompiler", "serde", + "serde_json", "serde_with", "serial_buffer", "thiserror", diff --git a/virtio-devices/Cargo.toml b/virtio-devices/Cargo.toml index d2658eeeca..ef6b4f717a 100644 --- a/virtio-devices/Cargo.toml +++ b/virtio-devices/Cargo.toml @@ -50,5 +50,8 @@ vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true } +[dev-dependencies] +serde_json = { workspace = true } + [lints] workspace = true diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 95d86d7f2c..f64a9ef679 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -416,10 +416,14 @@ pub struct Net { #[derive(Serialize, Deserialize)] /// Serialized snapshot of the device state. The fields are copied from the /// live device when snapshotting and restored back into a new device instance. +/// +/// Fields not present in previous versions are tagged with `#[serde(default)]` +/// to allow deserialization if the field is not present. pub struct NetState { pub avail_features: u64, pub acked_features: u64, pub config: VirtioNetConfig, + #[serde(default)] pub announce_pending: bool, pub queue_size: Vec, } @@ -1242,4 +1246,24 @@ mod unit_tests { assert!(!net.announce_pending.load(Ordering::Acquire)); assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); } + + #[test] + fn test_net_state_deserialize_without_announce_pending_defaults_to_false() { + // Older snapshots do not contain announce_pending. Restoring them on a + // newer binary must treat the missing field as "no announce pending". 
+ let state = NetState { + avail_features: 1, + acked_features: 2, + config: VirtioNetConfig::default(), + announce_pending: true, + queue_size: vec![256, 256], + }; + let mut value = serde_json::to_value(state).unwrap(); + + value.as_object_mut().unwrap().remove("announce_pending"); + + let restored: NetState = serde_json::from_value(value).unwrap(); + + assert!(!restored.announce_pending); + } } From 7bfbdc35098fe9d454fe256630bc91a087055926 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 31 Mar 2026 16:34:55 +0200 Subject: [PATCH 114/178] virtio-devices: net: restore pending announce notifications Re-trigger config interrupts for restored pending guest announce requests once the net device is activated. Cover both virtio-net and vhost-user-net with regression tests. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- virtio-devices/src/net.rs | 38 +++++++++++++++++++++++++++ virtio-devices/src/vhost_user/net.rs | 39 ++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index f64a9ef679..9747afb56b 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -761,6 +761,27 @@ impl Net { buf } + + /// Re-notify the guest about a restored pending ANNOUNCE request once the + /// transport has installed an interrupt callback during activation. 
+ fn notify_pending_guest_announce(&self) { + if self.announce_pending.load(Ordering::Acquire) + && self + .common + .feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()) + && let Some(interrupt_cb) = &self.common.interrupt_cb + { + interrupt_cb + .trigger(VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to resend pending announce interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } + } } impl Drop for Net { @@ -949,6 +970,7 @@ impl VirtioDevice for Net { } self.common.epoll_threads = Some(epoll_threads); + self.notify_pending_guest_announce(); event!("virtio-device", "activated", "id", &self.id); Ok(()) @@ -1213,6 +1235,22 @@ mod unit_tests { assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); } + #[test] + fn test_restored_pending_announce_retriggers_config_interrupt() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + net.announce_pending.store(true, Ordering::Release); + + net.notify_pending_guest_announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + #[test] fn test_post_migration_retries_retrigger_config_interrupt() { let interrupt = Arc::new(TestInterrupt::new()); diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 6565a4502e..17a09caaf2 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -285,6 +285,28 @@ impl Net { config } + + /// Re-notify the guest about a restored pending ANNOUNCE request once the + /// transport has installed an interrupt callback during activation. 
+ fn notify_pending_guest_announce(&self) { + if self.announce_pending.load(Ordering::Acquire) + && self + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()) + && let Some(interrupt_cb) = &self.vu_common.virtio_common.interrupt_cb + { + interrupt_cb + .trigger(crate::VirtioInterruptType::Config) + .inspect_err(|e| { + warn!( + "Unable to resend pending announce interrupt for virtio-net device {}: {e}", + self.id + ); + }) + .ok(); + } + } } impl Drop for Net { @@ -427,6 +449,7 @@ impl VirtioDevice for Net { )?; self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); + self.notify_pending_guest_announce(); Ok(()) } @@ -677,6 +700,22 @@ mod unit_tests { assert_eq!(interrupt.config_count.load(Ordering::Acquire), 0); } + #[test] + fn test_restored_pending_announce_retriggers_config_interrupt() { + let interrupt = Arc::new(TestInterrupt::new()); + let net = test_net( + (1 << VIRTIO_NET_F_GUEST_ANNOUNCE) | (1 << VIRTIO_NET_F_STATUS), + Some(interrupt.clone() as Arc), + ); + net.announce_pending.store(true, Ordering::Release); + + net.notify_pending_guest_announce(); + + assert!(net.announce_pending.load(Ordering::Acquire)); + assert_ne!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + #[test] fn test_post_migration_with_ctrl_vq_but_without_guest_announce_is_noop() { let interrupt = Arc::new(TestInterrupt::new()); From a80ce717ae0d2b1e0e8b0b575afe4250321698b5 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 1 Apr 2026 09:48:01 +0200 Subject: [PATCH 115/178] virtio-devices: net: invalidate stale announce retries Track a runtime announce generation for virtio-net and vhost-user-net so post-migration retry announcers stop after reset or device teardown. This keeps repeated announce rounds within one migration session, while preventing stale retry threads from re-arming VIRTIO_NET_S_ANNOUNCE after the guest already reset, rebooted, or the device was dropped. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- virtio-devices/src/net.rs | 64 +++++++++++++++++++++++++++- virtio-devices/src/vhost_user/net.rs | 61 +++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 3 deletions(-) diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 9747afb56b..f1fba651fe 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -10,7 +10,7 @@ use std::net::IpAddr; use std::num::Wrapping; use std::ops::Deref; use std::os::unix::io::{AsRawFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering}; use std::sync::{Arc, Barrier}; use std::{result, thread}; @@ -405,6 +405,9 @@ pub struct Net { /// Tracks whether the guest still needs to acknowledge a post-migration /// announce request through the control queue. announce_pending: Arc, + /// Generation counter used to invalidate active announcers created before a + /// reset or device teardown, so they stop sending notifications. + announce_generation: Arc, ctrl_queue_epoll_thread: Option>, counters: NetCounters, seccomp_action: SeccompAction, @@ -581,6 +584,7 @@ impl Net { taps, config: constructor_state.config, announce_pending: Arc::new(AtomicBool::new(constructor_state.announce_pending)), + announce_generation: Arc::new(AtomicU64::new(0)), ctrl_queue_epoll_thread: None, counters: NetCounters::default(), seccomp_action, @@ -786,6 +790,8 @@ impl Net { impl Drop for Net { fn drop(&mut self) { + self.announce_generation.fetch_add(1, Ordering::AcqRel); + // Get a comma-separated list of the interface names of the tap devices // associated with this network device. 
let ifnames_str = self @@ -979,6 +985,7 @@ impl VirtioDevice for Net { fn reset(&mut self) { self.common.reset(); self.announce_pending.store(false, Ordering::Release); + self.announce_generation.fetch_add(1, Ordering::AcqRel); event!("virtio-device", "reset", "id", &self.id); } @@ -1053,6 +1060,10 @@ pub struct VirtioNetPostMigrationAnnouncer { /// Remembers whether this device negotiated the guest-visible announce path. guest_announce_negotiated: bool, announce_pending: Arc, + announce_generation: Arc, + /// Captures the announce generation at creation time to invalidate stale + /// retry sessions after reset or teardown. + generation: u64, interrupt_cb: Option>, /// Prebuilt host-side RARP payload used for immediate post-migration /// announcement retries. @@ -1066,6 +1077,8 @@ impl VirtioNetPostMigrationAnnouncer { id: dev.id.clone(), guest_announce_negotiated: dev.common.feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()), announce_pending: Arc::clone(&dev.announce_pending), + announce_generation: Arc::clone(&dev.announce_generation), + generation: dev.announce_generation.load(Ordering::Acquire), interrupt_cb: dev.common.interrupt_cb.clone(), rarp_announce: dev.build_rarp_announce(), taps: dev.taps.clone(), @@ -1078,7 +1091,12 @@ impl PostMigrationAnnouncer for VirtioNetPostMigrationAnnouncer { // guest runs again, and then also ask the guest to re-announce itself when // GUEST_ANNOUNCE was negotiated. fn announce(&mut self) { - // We have to add a virtio-net header to the RARP announce. + // If the announce generations don't match, we don't send any announcements. + if self.announce_generation.load(Ordering::Acquire) != self.generation { + return; + } + + // We have to add a virtio-net header to the announce. 
let mut buf = vec![0u8; vnet_hdr_len() + self.rarp_announce.len()]; buf[vnet_hdr_len()..].copy_from_slice(&self.rarp_announce); @@ -1172,6 +1190,7 @@ mod unit_tests { taps: Vec::new(), config: VirtioNetConfig::default(), announce_pending: Arc::new(AtomicBool::new(false)), + announce_generation: Arc::new(AtomicU64::new(0)), ctrl_queue_epoll_thread: None, counters: NetCounters::default(), seccomp_action: SeccompAction::Allow, @@ -1285,6 +1304,47 @@ mod unit_tests { assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); } + #[test] + fn test_reset_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + net.reset(); + announcer.announce(); + + assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_drop_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut announcer = { + let net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + announcer + }; + + announcer.announce(); + + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + #[test] fn test_net_state_deserialize_without_announce_pending_defaults_to_false() { // Older snapshots do not contain announce_pending. 
Restoring them on a diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 17a09caaf2..1017df21da 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -1,7 +1,7 @@ // Copyright 2019 Intel Corporation. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{result, thread}; @@ -47,6 +47,9 @@ pub struct Net { /// Tracks whether the guest still needs to acknowledge a post-migration /// announce request through the control queue. announce_pending: Arc, + /// Generation counter used to invalidate active announcers created before a + /// reset or device teardown, so they stop sending notifications. + announce_generation: Arc, guest_memory: Option>, ctrl_queue_epoll_thread: Option>, seccomp_action: SeccompAction, @@ -250,6 +253,7 @@ impl Net { }, config, announce_pending: Arc::new(AtomicBool::new(announce_pending)), + announce_generation: Arc::new(AtomicU64::new(0)), guest_memory: None, ctrl_queue_epoll_thread: None, seccomp_action, @@ -312,6 +316,7 @@ impl Net { impl Drop for Net { fn drop(&mut self) { self.vu_common.shutdown(); + self.announce_generation.fetch_add(1, Ordering::AcqRel); if let Some(thread) = self.ctrl_queue_epoll_thread.take() && let Err(e) = thread.join() @@ -456,6 +461,7 @@ impl VirtioDevice for Net { fn reset(&mut self) { self.vu_common.reset(&self.id); self.announce_pending.store(false, Ordering::Release); + self.announce_generation.fetch_add(1, Ordering::AcqRel); } fn shutdown(&mut self) { @@ -536,6 +542,10 @@ pub struct VhostUserNetPostMigrationAnnouncer { /// Remembers whether this device negotiated the guest-visible announce path. 
guest_announce_negotiated: bool, announce_pending: Arc, + announce_generation: Arc, + /// Captures the announce generation at creation time to invalidate stale + /// retry sessions after reset or teardown. + generation: u64, interrupt_cb: Option>, } @@ -548,6 +558,8 @@ impl VhostUserNetPostMigrationAnnouncer { .virtio_common .feature_acked(VIRTIO_NET_F_GUEST_ANNOUNCE.into()), announce_pending: Arc::clone(&dev.announce_pending), + announce_generation: Arc::clone(&dev.announce_generation), + generation: dev.announce_generation.load(Ordering::Acquire), interrupt_cb: dev.vu_common.virtio_common.interrupt_cb.clone(), } } @@ -558,6 +570,11 @@ impl PostMigrationAnnouncer for VhostUserNetPostMigrationAnnouncer { // request pending and re-trigger the config interrupt while this retry // session remains valid. fn announce(&mut self) { + // If the announce generations don't match, we don't send any announcements. + if self.announce_generation.load(Ordering::Acquire) != self.generation { + return; + } + if self.guest_announce_negotiated && let Some(interrupt_cb) = &self.interrupt_cb { @@ -637,6 +654,7 @@ mod unit_tests { id: "test-vu-net".to_string(), config: VirtioNetConfig::default(), announce_pending: Arc::new(AtomicBool::new(false)), + announce_generation: Arc::new(AtomicU64::new(0)), guest_memory: None, ctrl_queue_epoll_thread: None, seccomp_action: SeccompAction::Allow, @@ -747,4 +765,45 @@ mod unit_tests { assert!(!net.announce_pending.load(Ordering::Acquire)); assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); } + + #[test] + fn test_reset_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + net.reset(); + announcer.announce(); + + 
assert!(!net.announce_pending.load(Ordering::Acquire)); + assert_eq!(read_status(&net) & VIRTIO_NET_S_ANNOUNCE as u16, 0); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } + + #[test] + fn test_drop_invalidates_old_announcer() { + let interrupt = Arc::new(TestInterrupt::new()); + let mut announcer = { + let net = test_net( + 1 << VIRTIO_NET_F_GUEST_ANNOUNCE, + Some(interrupt.clone() as Arc), + ); + let mut announcer = net.post_migration_announcer().unwrap(); + + announcer.announce(); + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + + announcer + }; + + announcer.announce(); + + assert_eq!(interrupt.config_count.load(Ordering::Acquire), 1); + } } From dcdf37c0c40349e3e5d1a3352b71e59d2fbfbd77 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 9 Dec 2025 14:20:43 +0100 Subject: [PATCH 116/178] vmm: Enable AMX states prior to checking CPUID compatibility Since enabling AMX tile state components affects the result returned by `Hypervisor::get_supported_cpuid` we want this enabled prior to checking CPUID compatibility between the source and destination VMs. Although this is not required today, it is necessary in order for the upcoming CPU profiles to work correctly, and it will also be necessary once the check_cpuid_compatibility checks are extended to take state components into account. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- vmm/src/lib.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 94e2606e52..6b26964998 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2082,8 +2082,17 @@ impl Vmm { let dest_cpuid = &{ let vm_config = &src_vm_config.lock().unwrap(); + if vm_config.cpus.features.amx { + // Need to enable AMX tile state components before generating common cpuid + // as this affects what Hypervisor::get_supported_cpuid returns.
+ self.hypervisor + .enable_amx_state_components() + .map_err(|e| MigratableError::MigrateReceive(e.into()))?; + } + let phys_bits = vm::physical_bits(self.hypervisor.as_ref(), vm_config.cpus.max_phys_bits); + arch::generate_common_cpuid( self.hypervisor.as_ref(), &arch::CpuidConfig { From 6646945dde04bcc7b361023138696ae41536ef14 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 9 Dec 2025 10:44:05 +0100 Subject: [PATCH 117/178] arch: Initial data structures for describing CPUID parameters These data structures are required to define CPU profiles. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- Cargo.lock | 123 +++++++++++++++++++- arch/Cargo.toml | 5 + arch/src/x86_64/cpuid_definitions/mod.rs | 136 +++++++++++++++++++++++ arch/src/x86_64/mod.rs | 4 +- 4 files changed, 262 insertions(+), 6 deletions(-) create mode 100644 arch/src/x86_64/cpuid_definitions/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 3da2b3eda3..3fd222f5d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -119,7 +119,9 @@ dependencies = [ "libc", "linux-loader", "log", + "proptest", "serde", + "serde_json", "thiserror", "uuid", "vm-fdt", @@ -307,6 +309,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitfield-struct" version = "0.10.1" @@ -426,7 +443,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", "cpufeatures", - "rand_core", + "rand_core 0.10.1", ] [[package]] @@ -852,6 +869,12 @@ dependencies = [ "spin", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foldhash" version = "0.1.5" @@ -1023,7 +1046,7 @@ dependencies = [ "cfg-if", "libc", "r-efi 6.0.0", - "rand_core", + "rand_core 0.10.1", "wasip2", "wasip3", ] @@ -1828,6 +1851,15 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "prettyplease" version = "0.2.37" @@ -1856,6 +1888,31 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.11.1", + "num-traits", + "rand 0.9.4", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.45" @@ -1877,6 +1934,16 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core 0.9.5", +] + [[package]] name = "rand" version = "0.10.1" @@ -1885,7 +1952,26 @@ checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", "getrandom 0.4.2", 
- "rand_core", + "rand_core 0.10.1", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", ] [[package]] @@ -1894,6 +1980,15 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core 0.9.5", +] + [[package]] name = "range_map_vec" version = "0.2.0" @@ -2051,6 +2146,18 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2287,7 +2394,7 @@ dependencies = [ "dirs", "epoll", "libc", - "rand", + "rand 0.10.1", "serde_json", "ssh2", "thiserror", @@ -2420,6 +2527,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -2452,7 
+2565,7 @@ checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "getrandom 0.4.2", "js-sys", - "rand", + "rand 0.10.1", "serde_core", "wasm-bindgen", ] diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 2e30b9e532..035eb9718c 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -29,5 +29,10 @@ vmm-sys-util = { workspace = true, features = ["with-serde"] } fdt_parser = { version = "0.1.5", package = "fdt" } vm-fdt = { workspace = true } +# Use this to test our custom serialization logic +[dev-dependencies] +proptest = "1.0.0" +serde_json = { workspace = true } + [lints] workspace = true diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs new file mode 100644 index 0000000000..75e316ceaa --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -0,0 +1,136 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::Write; +use std::ops::RangeInclusive; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use crate::x86_64::CpuidReg; + +pub(in crate::x86_64) fn serialize_as_hex( + input: &u32, + serializer: S, +) -> Result { + // two bytes for "0x" prefix and eight for the hex encoded number + let mut buffer = [0_u8; 10]; + let _ = write!(&mut buffer[..], "{input:#010x}"); + let str = core::str::from_utf8(&buffer[..]) + .expect("the buffer should be filled with valid UTF-8 bytes"); + serializer.serialize_str(str) +} + +pub(in crate::x86_64) fn deserialize_from_hex<'de, D: Deserializer<'de>>( + deserializer: D, +) -> Result { + let hex = <&'de str as Deserialize>::deserialize(deserializer)?; + u32::from_str_radix(hex.strip_prefix("0x").unwrap_or(""), 16).map_err(|_| { + ::custom(format!("{hex} is not a hex encoded 32 bit integer")) + }) +} + +/// Parameters for inspecting CPUID definitions. 
+#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct Parameters { + // The leaf (EAX) parameter used with the CPUID instruction + #[serde(serialize_with = "serialize_as_hex")] + #[serde(deserialize_with = "deserialize_from_hex")] + pub leaf: u32, + // The sub-leaf (ECX) parameter used with the CPUID instruction + pub sub_leaf: RangeInclusive, + // The register we are interested in inspecting which gets filled by the CPUID instruction + pub register: CpuidReg, +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + use serde::Deserialize; + + use super::{Parameters, deserialize_from_hex, serialize_as_hex}; + use crate::x86_64::CpuidReg; + + /* + Check that the leaves get the string representation we expect. + This does not really matter from a functionality point of view, but we want + to read it in the expected format when manually viewing the generated CPU + profile files. + + Also assert that deserialization gives the original value back + */ + #[test] + fn hex_serialization() { + for (leaf, expected) in [ + 0x0_u32, 0x7, 0xd, 0x1e, 0x40000000, 0x4fffffff, 0x80000000, 0x8fffffff, + ] + .into_iter() + .zip([ + "0x00000000", + "0x00000007", + "0x0000000d", + "0x0000001e", + "0x40000000", + "0x4fffffff", + "0x80000000", + "0x8fffffff", + ]) { + let mut v = Vec::new(); + let mut serializer = serde_json::Serializer::new(&mut v); + serialize_as_hex(&leaf, &mut serializer).unwrap(); + let serialized = str::from_utf8(&v[..]).unwrap(); + // JSON Strings have surrounding "" hence we trim that + let serialized_trimmed = serialized + .strip_prefix('"') + .unwrap() + .strip_suffix('"') + .unwrap(); + dbg!(serialized_trimmed); + assert_eq!(serialized_trimmed, expected); + // Also check that we can deserialize this back to the original value + let mut deserializer = serde_json::Deserializer::from_str(serialized); + let deserialized = deserialize_from_hex(&mut deserializer).unwrap(); + assert_eq!(deserialized, leaf); + } + } + + // Check that 
serializing and then deserializing a value of type `Parameters` results in the + // same value we started with. + proptest! { + #[test] + fn parameter_serialization_roundtrip_works(leaf in 0u32..u32::MAX, x1 in 0u32..100, x2 in 0u32..100, reg in 0..4) { + let sub_leaf_range_start = std::cmp::min(x1, x2); + let sub_leaf_range_end = std::cmp::max(x1,x2); + let sub_leaf = sub_leaf_range_start..=sub_leaf_range_end; + let register = match reg { + 0 => CpuidReg::EAX, + 1 => CpuidReg::EBX, + 2 => CpuidReg::ECX, + 3 => CpuidReg::EDX, + _ => unreachable!() + }; + let cpuid_parameters = Parameters { + leaf, + sub_leaf, + register + }; + let serialized = serde_json::to_string(&cpuid_parameters).unwrap(); + let deserialized: Parameters = serde_json::from_str(&serialized).unwrap(); + prop_assert_eq!(&deserialized, &cpuid_parameters); + } + } + + // Check that `deserialize_from_hex` does not succeed if the stringified u32 does not start with 0x + proptest! { + #[test] + fn hex_deserialization_requires_prefix(leaf in any::().prop_map(|leaf| std::iter::once('"').chain(leaf.to_string().chars()).chain(std::iter::once('"')).collect::())) { + let mut deserializer = serde_json::Deserializer::from_str(leaf.as_str()); + // Check that standard deserialization works + let result = ::deserialize(&mut deserializer); + prop_assert!(result.is_ok()); + let mut deserializer = serde_json::Deserializer::from_str(leaf.as_str()); + prop_assert!(deserialize_from_hex(&mut deserializer).is_err()); + } + } +} diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index e5f4d48204..32775412b1 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file.
+pub mod cpuid_definitions; pub mod interrupts; pub mod layout; pub mod regs; @@ -28,6 +29,7 @@ use linux_loader::loader::elf::start_info::{ hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, }; use log::{debug, error, info}; +use serde::{Deserialize, Serialize}; pub use smbios::{SmbiosChassisConfig, SmbiosConfig, SmbiosSystem}; use thiserror::Error; use vm_memory::{ @@ -191,7 +193,7 @@ pub fn get_max_x2apic_id(topology: (u16, u16, u16, u16)) -> u32 { ) } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum CpuidReg { EAX, EBX, From f599f643959686c5c2d11edbaa4eb35536312d57 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 7 Oct 2025 04:39:38 +0200 Subject: [PATCH 118/178] hypervisor: Implement common traits for HypervisorType and CpuVendor We want CPU profiles to keep a record of the hypervisor type and cpu vendor that they are intended to work with. This is made more convenient if all of these types implement common traits (used for serialization). 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- hypervisor/src/cpu.rs | 2 +- hypervisor/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index fef327ffb3..39909ef370 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -28,7 +28,7 @@ use crate::kvm::{TdxExitDetails, TdxExitStatus}; use crate::{CpuState, MpState, StandardRegisters}; #[cfg(target_arch = "x86_64")] -#[derive(Copy, Clone, Default)] +#[derive(Debug, Copy, Clone, Default, serde::Serialize, serde::Deserialize, Eq, PartialEq)] pub enum CpuVendor { #[default] Unknown, diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index f224e7217c..3b76dc6add 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -64,7 +64,7 @@ pub use vm::{ pub use crate::hypervisor::{Hypervisor, HypervisorError}; -#[derive(Debug, Copy, Clone, PartialEq)] +#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub enum HypervisorType { #[cfg(feature = "kvm")] Kvm, From 14d24667ba9da8a6a150e81a7e7935e0677c85a5 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 9 Dec 2025 13:27:00 +0100 Subject: [PATCH 119/178] arch: CpuProfile data structures We introduce essential data structures together with basic functionality that is necessary to apply a CPU profile to a host. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/lib.rs | 30 ++++ arch/src/x86_64/cpu_profile.rs | 242 +++++++++++++++++++++++++++++++++ arch/src/x86_64/mod.rs | 1 + 3 files changed, 273 insertions(+) create mode 100644 arch/src/x86_64/cpu_profile.rs diff --git a/arch/src/lib.rs b/arch/src/lib.rs index c1c1973667..362ae79e45 100644 --- a/arch/src/lib.rs +++ b/arch/src/lib.rs @@ -9,12 +9,17 @@ //! Supported platforms: x86_64, aarch64, riscv64. 
use std::collections::BTreeMap; +use std::str::FromStr; use std::sync::Arc; use std::{fmt, result}; +use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; use thiserror::Error; +#[cfg(target_arch = "x86_64")] +pub use crate::x86_64::cpu_profile::CpuProfile; + type GuestMemoryMmap = vm_memory::GuestMemoryMmap; type GuestRegionMmap = vm_memory::GuestRegionMmap; @@ -53,6 +58,31 @@ pub enum Error { /// Type for returning public functions outcome. pub type Result = result::Result; +// If the target_arch is x86_64 we import CpuProfile from the x86_64 module, otherwise we +// declare it here. +#[cfg(not(target_arch = "x86_64"))] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +/// A [`CpuProfile`] is a mechanism for ensuring live migration compatibility +/// between hosts with potentially different CPU models. +pub enum CpuProfile { + #[default] + Host, +} + +impl FromStr for CpuProfile { + type Err = serde::de::value::Error; + fn from_str(s: &str) -> result::Result { + // Should accept both plain strings, and strings surrounded by `"`. + let normalized = s + .strip_prefix('"') + .unwrap_or(s) + .strip_suffix('"') + .unwrap_or(s); + Self::deserialize(normalized.into_deserializer()) + } +} + /// Type for memory region types.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)] pub enum RegionType { diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs new file mode 100644 index 0000000000..fc05307312 --- /dev/null +++ b/arch/src/x86_64/cpu_profile.rs @@ -0,0 +1,242 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use hypervisor::arch::x86::CpuIdEntry; +use hypervisor::{CpuVendor, HypervisorType}; +use log::error; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::{Parameters, deserialize_from_hex, serialize_as_hex}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "kebab-case")] +/// A [`CpuProfile`] is a mechanism for ensuring live migration compatibility +/// between hosts with potentially different CPU models. +pub enum CpuProfile { + #[default] + Host, + Skylake, + SapphireRapids, +} + +impl CpuProfile { + /// Loads pre-generated data associated with a CPU profile. + /// + /// If the `amx` flag is false then the AMX tile state components will be + /// zeroed out from the associated profile data. This is necessary because + /// they will then not be present in the vector of [`CpuIdEntry`] values + /// obtained from the hypervisor. + // + // We can only generate CPU profiles for the KVM hypervisor for the time being.
+ #[cfg(feature = "kvm")] + pub(in crate::x86_64) fn data(&self, amx: bool) -> Option { + let mut data: CpuProfileData = match self { + Self::Host => None, + Self::Skylake => todo!(), + Self::SapphireRapids => todo!(), + }?; + + if !amx { + // In this case we will need to wipe out the AMX tile state components (if they are included in the profile) + for adj in data.adjustments.iter_mut() { + if adj.0.sub_leaf.start() != adj.0.sub_leaf.end() { + // The generated profiles produce as many sub-leaf entries as possible, and only use ranges for + // values not found. + continue; + } + let sub_leaf = *adj.0.sub_leaf.start(); + let leaf = adj.0.leaf; + if (leaf == 0xd) && (sub_leaf == 0) && (adj.0.register == CpuidReg::EAX) { + adj.1.replacements &= !((1 << 17) | (1 << 18)); + } + + if (leaf == 0xd) && (sub_leaf == 1) && (adj.0.register == CpuidReg::ECX) { + adj.1.replacements &= !((1 << 17) | (1 << 18)); + } + + if (leaf == 0xd) && ((sub_leaf == 17) | (sub_leaf == 18)) { + adj.1.replacements = 0; + } + } + } + + Some(data) + } + + #[cfg(not(feature = "kvm"))] + pub(in crate::x86_64) fn data(&self, _amx: bool) -> Option { + if matches!(*self, Self::Host) { + return None; + } + // This will need to be addressed before upstreaming. + // We will probably need one profile per hypervisor. + unimplemented!() + } +} + +/// Every [`CpuProfile`] different from `Host` has associated [`CpuProfileData`]. +/// +/// New constructors of this struct may only be generated through the CHV CLI (when built from source with +/// the `cpu-profile-generation` feature) which other hosts may then attempt to load in order to +/// increase the likelihood of successful live migrations among all hosts that opted in to the given +/// CPU profile. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[allow(dead_code)] +pub struct CpuProfileData { + /// The hypervisor used when generating this CPU profile. 
+ pub(in crate::x86_64) hypervisor: HypervisorType, + /// The vendor of the CPU belonging to the host that generated this CPU profile. + pub(in crate::x86_64) cpu_vendor: CpuVendor, + /// Adjustments necessary to become compatible with the desired target. + pub(in crate::x86_64) adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)>, +} + +/* TODO: The [`CpuProfile`] struct will likely need a few more iterations. The following +section should explain why: + +# MSR restrictions + +CPU profiles also need to restrict which MSRs may be manipulated by the guest as various physical CPUs +can have differing supported MSRs. + +The CPU profile will thus necessarily need to contain some data related to MSR restrictions. That will +be taken care of in a follow up MR. + +*/ + +/// Used for adjusting an entire cpuid output register (EAX, EBX, ECX or EDX) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub(super) struct CpuidOutputRegisterAdjustments { + #[serde(serialize_with = "serialize_as_hex")] + #[serde(deserialize_with = "deserialize_from_hex")] + pub(in crate::x86_64) replacements: u32, + /// Used to zero out the area `replacements` occupy. This mask is not necessarily !replacements, as replacements may pack values of different types (i.e. it is wrong to think of it as a bitset conceptually speaking). 
+ #[serde(serialize_with = "serialize_as_hex")] + #[serde(deserialize_with = "deserialize_from_hex")] + pub(in crate::x86_64) mask: u32, +} +impl CpuidOutputRegisterAdjustments { + pub(in crate::x86_64) fn adjust(self, cpuid_output_register: &mut u32) { + let temp_register_copy = *cpuid_output_register; + let replacements_area_masked_in_temp_copy = temp_register_copy & self.mask; + *cpuid_output_register = replacements_area_masked_in_temp_copy | self.replacements; + } + + pub(in crate::x86_64) fn adjust_cpuid_entries( + mut cpuid: Vec, + adjustments: &[(Parameters, Self)], + ) -> Result, MissingCpuidEntriesError> { + for entry in &mut cpuid { + for (reg, reg_value) in [ + (CpuidReg::EAX, &mut entry.eax), + (CpuidReg::EBX, &mut entry.ebx), + (CpuidReg::ECX, &mut entry.ecx), + (CpuidReg::EDX, &mut entry.edx), + ] { + // Get the adjustment corresponding to the entry's function/leaf and index/sub-leaf for each of the registers. If no such + // adjustment is found we use the trivial adjustment (leading to the register being zeroed out entirely). + let adjustment = adjustments + .iter() + .find_map(|(param, adjustment)| { + ((param.leaf == entry.function) + & param.sub_leaf.contains(&entry.index) + & (param.register == reg)) + .then_some(*adjustment) + }) + .unwrap_or(CpuidOutputRegisterAdjustments { + mask: 0, + replacements: 0, + }); + adjustment.adjust(reg_value); + } + } + + Self::expected_entries_found(&cpuid, adjustments).map(|_| cpuid) + } + + /// Check that we found every value that was supposed to be replaced with something other than 0 + /// + /// IMPORTANT: This function assumes that the given `cpuid` has already been adjusted with the + /// provided `adjustments`. + fn expected_entries_found( + cpuid: &[CpuIdEntry], + adjustments: &[(Parameters, Self)], + ) -> Result<(), MissingCpuidEntriesError> { + let mut missing_entry = false; + + // Invalid state components can be ignored. The next few lines obtain the relevant entries to + // check for this.
+ let eax_0xd_0 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 0)) + .map_or(0, |entry| entry.eax); + let ecx_0xd_1 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 1)) + .map_or(0, |entry| entry.ecx); + + let edx_0xd_0 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 0)) + .map_or(0, |entry| entry.edx); + let edx_0xd_1 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 1)) + .map_or(0, |entry| entry.edx); + + for (param, adjustment) in adjustments { + if adjustment.replacements == 0 { + continue; + } + let sub_start = *param.sub_leaf.start(); + let sub_end = *param.sub_leaf.end(); + + let can_skip_lo = if (param.leaf == 0xd) && (2..32).contains(&sub_start) { + let start = sub_start; + let end = std::cmp::min(sub_end, 31); + let mask = (start..=end).fold(0, |acc, next| acc | (1 << next)); + ((mask & eax_0xd_0) == 0) & ((mask & ecx_0xd_1) == 0) + } else { + false + }; + + let can_skip_hi = if (param.leaf == 0xd) && (32..64).contains(&sub_end) { + let start = std::cmp::max(32, sub_start); + let end = sub_end; + let mask = (start..=end) + .map(|val| val - 32) + .fold(0, |acc, next| acc | (1 << next)); + ((mask & edx_0xd_0) == 0) & ((mask & edx_0xd_1) == 0) + } else { + false + }; + + if can_skip_lo && can_skip_hi { + // This means that all state components referred to by the specified sub-leaf range are not valid + // and may be skipped. + continue; + } + if !cpuid.iter().any(|entry| { + (entry.function == param.leaf) && (param.sub_leaf.contains(&entry.index)) + }) { + error!( + "cannot adjust CPU profile. 
No entry found matching the required parameters: {param:?}" + ); + missing_entry = true; + } + } + if missing_entry { + Err(MissingCpuidEntriesError) + } else { + Ok(()) + } + } +} + +#[derive(Debug, Error)] +#[error("Required CPUID entries not found")] +pub(in crate::x86_64) struct MissingCpuidEntriesError; diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 32775412b1..d53ecc9e96 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. +pub mod cpu_profile; pub mod cpuid_definitions; pub mod interrupts; pub mod layout; From 997cfe3babc77901378ce184cffae884525c2c71 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 7 Oct 2025 05:34:22 +0200 Subject: [PATCH 120/178] misc: Make CPU profile part of various configs We integrate the CPU profile into the various configs that ultimately get set by the user. This quickly ends up involving multiple files, luckily Rust helps us find which ones via compilation errors. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/mod.rs | 3 ++- cloud-hypervisor/src/main.rs | 1 + fuzz/fuzz_targets/http_api.rs | 1 + vmm/src/config.rs | 11 ++++++++++- vmm/src/cpu.rs | 1 + vmm/src/lib.rs | 22 ++++++++++++++++++---- vmm/src/vm.rs | 15 +++++++++------ vmm/src/vm_config.rs | 4 ++++ 8 files changed, 46 insertions(+), 12 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index d53ecc9e96..1b5d061ac4 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -38,7 +38,7 @@ use vm_memory::{ GuestMemoryRegion, }; -use crate::{GuestMemoryMmap, InitramfsConfig, RegionType}; +use crate::{CpuProfile, GuestMemoryMmap, InitramfsConfig, RegionType}; // While modern architectures support more than 255 CPUs via x2APIC, // legacy devices such as mptable support at most 254 CPUs. 
@@ -97,6 +97,7 @@ pub struct CpuidConfig { #[cfg(feature = "tdx")] pub tdx: bool, pub amx: bool, + pub profile: CpuProfile, } #[derive(Debug, Error)] diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 270e0e2ee8..46d97d912d 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -980,6 +980,7 @@ mod unit_tests { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::Vm, + profile: Default::default(), }, memory: MemoryConfig { size: 536_870_912, diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 600a231b39..aa3841243d 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -139,6 +139,7 @@ impl RequestHandler for StubApiRequestHandler { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::default(), + profile: Default::default(), }, memory: MemoryConfig { size: 536_870_912, diff --git a/vmm/src/config.rs b/vmm/src/config.rs index b22c010e75..d7d1283067 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -11,6 +11,7 @@ use std::result; use std::str::FromStr; use std::sync::LazyLock; +use arch::CpuProfile; use block::ImageType; use clap::ArgMatches; use log::{debug, warn}; @@ -694,7 +695,8 @@ impl CpusConfig { .add("affinity") .add("features") .add("nested") - .add("core_scheduling"); + .add("core_scheduling") + .add("profile"); parser.parse(cpus).map_err(Error::ParseCpus)?; let boot_vcpus: u32 = parser @@ -726,6 +728,12 @@ impl CpusConfig { }) .collect() }); + + let profile = parser + .convert::("profile") + .map_err(Error::ParseCpus)? + .unwrap_or_default(); + let features_list = parser .convert::("features") .map_err(Error::ParseCpus)? 
@@ -768,6 +776,7 @@ impl CpusConfig { features, nested, core_scheduling, + profile, }) } } diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 9873e2abcc..02e984c6b0 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -994,6 +994,7 @@ impl CpuManager { #[cfg(feature = "tdx")] tdx, amx: self.config.features.amx, + profile: self.config.profile, }, ) .map_err(Error::CommonCpuId)? diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 6b26964998..0811b23dbd 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1907,17 +1907,27 @@ impl Vmm { ))); } - let amx = vm_config.lock().unwrap().cpus.features.amx; - let phys_bits = - vm::physical_bits(hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); + let (amx, phys_bits, profile, kvm_hyperv) = { + let guard = vm_config.lock().unwrap(); + let amx = guard.cpus.features.amx; + let max_phys_bits = guard.cpus.max_phys_bits; + let profile = guard.cpus.profile; + let kvm_hyperv = guard.cpus.kvm_hyperv; + // Drop lock before function call + core::mem::drop(guard); + let phys_bits = vm::physical_bits(hypervisor, max_phys_bits); + (amx, phys_bits, profile, kvm_hyperv) + }; + arch::generate_common_cpuid( hypervisor, &arch::CpuidConfig { phys_bits, - kvm_hyperv: vm_config.lock().unwrap().cpus.kvm_hyperv, + kvm_hyperv, #[cfg(feature = "tdx")] tdx: false, amx, + profile, }, ) .context("Error generating common cpuid") @@ -2101,6 +2111,7 @@ impl Vmm { #[cfg(feature = "tdx")] tdx: false, amx: vm_config.cpus.features.amx, + profile: vm_config.cpus.profile, }, ) .context("Error generating common cpuid") @@ -3504,6 +3515,8 @@ const DEVICE_MANAGER_SNAPSHOT_ID: &str = "device-manager"; mod unit_tests { use std::path::PathBuf; + use arch::CpuProfile; + use super::*; #[cfg(target_arch = "x86_64")] use crate::vm_config::DebugConsoleConfig; @@ -3541,6 +3554,7 @@ mod unit_tests { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::default(), + profile: CpuProfile::default(), }, memory: MemoryConfig { size: 536_870_912, 
diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 223ff238d5..e2ed4225fa 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -3411,19 +3411,22 @@ impl Snapshottable for Vm { #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let common_cpuid = { - let amx = self.config.lock().unwrap().cpus.features.amx; - let phys_bits = physical_bits( - self.hypervisor.as_ref(), - self.config.lock().unwrap().cpus.max_phys_bits, - ); + let guard = self.config.lock().unwrap(); + let amx = guard.cpus.features.amx; + let phys_bits = physical_bits(self.hypervisor.as_ref(), guard.cpus.max_phys_bits); + let kvm_hyperv = guard.cpus.kvm_hyperv; + let profile = guard.cpus.profile; + // Drop the guard before function call + core::mem::drop(guard); arch::generate_common_cpuid( self.hypervisor.as_ref(), &arch::CpuidConfig { phys_bits, - kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, + kvm_hyperv, #[cfg(feature = "tdx")] tdx: false, amx, + profile, }, ) .map_err(|e| { diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 72ce2c567b..8ea8f0e8ff 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -8,6 +8,7 @@ use std::path::{Path, PathBuf}; use std::str::FromStr; use std::{fs, result}; +use arch::CpuProfile; use block::ImageType; pub use block::fcntl::LockGranularityChoice; use log::{debug, warn}; @@ -83,6 +84,8 @@ pub struct CpusConfig { pub nested: bool, #[serde(default)] pub core_scheduling: CoreScheduling, + #[serde(default)] + pub profile: CpuProfile, } pub const DEFAULT_VCPUS: u32 = 1; @@ -99,6 +102,7 @@ impl Default for CpusConfig { features: CpuFeatures::default(), nested: true, core_scheduling: CoreScheduling::default(), + profile: CpuProfile::default(), } } } From 65a8e4ca614d90af45d64135923a038aa58a4b47 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 9 Dec 2025 16:57:31 +0100 Subject: [PATCH 121/178] arch: Apply CPU profile (if any) when generating common CPUID If a CPU profile is configured it should result in guests seeing a restricted subset 
of CPUID. This is what we finally achieve in this commit. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpu_profile.rs | 2 +- arch/src/x86_64/mod.rs | 410 ++++++++++++++++++++------------- 2 files changed, 250 insertions(+), 162 deletions(-) diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs index fc05307312..568046e275 100644 --- a/arch/src/x86_64/cpu_profile.rs +++ b/arch/src/x86_64/cpu_profile.rs @@ -239,4 +239,4 @@ impl CpuidOutputRegisterAdjustments { #[derive(Debug, Error)] #[error("Required CPUID entries not found")] -pub(in crate::x86_64) struct MissingCpuidEntriesError; +pub struct MissingCpuidEntriesError; diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 1b5d061ac4..f81aaeedec 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -6,7 +6,6 @@ // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. 
- pub mod cpu_profile; pub mod cpuid_definitions; pub mod interrupts; @@ -38,6 +37,7 @@ use vm_memory::{ GuestMemoryRegion, }; +use crate::x86_64::cpu_profile::CpuidOutputRegisterAdjustments; use crate::{CpuProfile, GuestMemoryMmap, InitramfsConfig, RegionType}; // While modern architectures support more than 255 CPUs via x2APIC, @@ -138,6 +138,26 @@ pub enum Error { #[error("Error getting supported CPUID through the hypervisor API")] CpuidGetSupported(#[source] HypervisorError), + #[error( + "The selected CPU profile cannot be utilized because the host's CPUID entries are not compatible with the profile" + )] + CpuProfileCpuidIncompatibility, + /// Error because TDX cannot be enabled when a custom (non host) CPU profile has been selected + #[error("TDX cannot be enabled when a custom CPU profile has been selected")] + CpuProfileTdxIncompatibility, + #[error( + "The selected CPU profile cannot be utilized because a necessary CPUID entry was not found" + )] + /// Error when trying to apply a CPU profile because a necessary CPUID entry was not found + MissingExpectedCpuidEntry(#[source] cpu_profile::MissingCpuidEntriesError), + /// Error when trying to apply a CPU profile because the host has a CPU from a different vendor + #[error( + "The selected CPU profile cannot be utilized because the host has a CPU from a different vendor: host_vendor:={cpu_vendor_host:?}, expected_vendor:={cpu_vendor_profile:?}" + )] + CpuProfileVendorIncompatibility { + cpu_vendor_profile: CpuVendor, + cpu_vendor_host: CpuVendor, + }, /// Error populating CPUID with KVM HyperV emulation details #[error("Error populating CPUID with KVM HyperV emulation details")] CpuidKvmHyperV(#[source] vmm_sys_util::fam::Error), @@ -560,6 +580,10 @@ impl CpuidFeatureEntry { } } +/// This function generates the CPUID entries to be set for all CPUs. 
+/// +/// If the `config` has a CPU profile set (other than host) then the profile +/// will be applied pub fn generate_common_cpuid( hypervisor: &dyn hypervisor::Hypervisor, config: &CpuidConfig, @@ -626,135 +650,21 @@ pub fn generate_common_cpuid( }); } - // Supported CPUID - let mut cpuid = hypervisor + // Supported CPUID according to the host and hypervisor + let mut host_cpuid = hypervisor .get_supported_cpuid() .map_err(Error::CpuidGetSupported)?; - CpuidPatch::patch_cpuid(&mut cpuid, &cpuid_patches); - - #[cfg(feature = "tdx")] - let tdx_capabilities = if config.tdx { - let caps = hypervisor - .tdx_capabilities() - .map_err(Error::TdxCapabilities)?; - info!("TDX capabilities {caps:#?}"); - Some(caps) - } else { - None - }; - - // Update some existing CPUID - for entry in cpuid.as_mut_slice().iter_mut() { - #[allow(unused_unsafe)] - match entry.function { - // Clear AMX related bits if the AMX feature is not enabled - 0x7 if !config.amx => { - if entry.index == 0 { - entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); - } - if entry.index == 1 { - entry.eax &= !(1 << AMX_FP16); - entry.edx &= !(1 << AMX_COMPLEX); - } - } - 0xd => - { - #[cfg(feature = "tdx")] - if let Some(caps) = &tdx_capabilities { - let xcr0_mask: u64 = 0x82ff; - let xss_mask: u64 = !xcr0_mask; - if entry.index == 0 { - entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); - entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); - entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; - entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; - } else if entry.index == 1 { - entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); - entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); - entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; - entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; - } - } - } - // Tile Information (purely AMX related). 
- 0x1d if !config.amx => { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } - // TMUL information (purely AMX related) - 0x1e if !config.amx => { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } - - // Copy host L1 cache details if not populated by KVM - 0x8000_0005 - if entry.eax == 0 - && entry.ebx == 0 - && entry.ecx == 0 - && entry.edx == 0 - // SAFETY: cpuid called with valid leaves - && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 => - { - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - // Copy host L2 cache details if not populated by KVM - 0x8000_0006 - if entry.eax == 0 - && entry.ebx == 0 - && entry.ecx == 0 - && entry.edx == 0 - // SAFETY: cpuid called with valid leaves - && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 => - { - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - // Set CPU physical bits - 0x8000_0008 => { - entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); - } - 0x4000_0001 => { - // Enable KVM_FEATURE_MSI_EXT_DEST_ID. This allows the guest to target - // device interrupts to cpus with APIC IDs > 254 without interrupt remapping. 
- entry.eax |= 1 << KVM_FEATURE_MSI_EXT_DEST_ID; - - // These features are not supported by TDX - #[cfg(feature = "tdx")] - if config.tdx { - entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT) - | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT) - | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) - | (1 << KVM_FEATURE_ASYNC_PF_BIT) - | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT) - | (1 << KVM_FEATURE_STEAL_TIME_BIT)); - } - } - _ => {} - } - } - // Copy CPU identification string + // + // If a CPU profile has been applied then this will get + // overwritten as soon as the profile is applied for i in 0x8000_0002..=0x8000_0004 { - cpuid.retain(|c| c.function != i); + host_cpuid.retain(|c| c.function != i); // SAFETY: call cpuid with valid leaves #[allow(unused_unsafe)] let leaf = unsafe { std::arch::x86_64::__cpuid(i) }; - cpuid.push(CpuIdEntry { + host_cpuid.push(CpuIdEntry { function: i, eax: leaf.eax, ebx: leaf.ebx, @@ -764,54 +674,232 @@ pub fn generate_common_cpuid( }); } - if config.kvm_hyperv { - // Remove conflicting entries - cpuid.retain(|c| c.function != 0x4000_0000); - cpuid.retain(|c| c.function != 0x4000_0001); - // See "Hypervisor Top Level Functional Specification" for details - // Compliance with "Hv#1" requires leaves up to 0x4000_000a - cpuid.push(CpuIdEntry { - function: 0x40000000, - eax: 0x4000000a, // Maximum cpuid leaf - ebx: 0x756e694c, // "Linu" - ecx: 0x564b2078, // "x KV" - edx: 0x7648204d, // "M Hv" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x40000001, - eax: 0x31237648, // "Hv#1" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x40000002, - eax: 0x3839, // "Build number" - ebx: 0xa0000, // "Version" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x4000_0003, - eax: (1 << 1) // AccessPartitionReferenceCounter + let use_custom_profile = config.profile != CpuProfile::Host; + // Obtain cpuid entries that are adjusted to the specified CPU profile and the cpuid entries of the compatibility target + // 
TODO: Try to write this in a clearer way + let (host_adjusted_to_profile, profile_cpu_vendor) = { + config + .profile + .data(config.amx) + .map_or((Ok(None), None), |profile_data| { + ( + CpuidOutputRegisterAdjustments::adjust_cpuid_entries( + host_cpuid.clone(), + &profile_data.adjustments, + ) + .map(Some), + Some(profile_data.cpu_vendor), + ) + }) + }; + let mut host_adjusted_to_profile = + host_adjusted_to_profile.map_err(Error::MissingExpectedCpuidEntry)?; + + // There should be relatively few cases where live migration can succeed between hosts from different + // CPU vendors and making our checks account for that possibility would complicate things substantially. + // We thus require that the host's cpu vendor matches the one used to generate the CPU profile. + if let Some(cpu_vendor_profile) = profile_cpu_vendor + && let cpu_vendor_host = hypervisor.get_cpu_vendor() + && cpu_vendor_profile != cpu_vendor_host + { + return Err(Error::CpuProfileVendorIncompatibility { + cpu_vendor_profile, + cpu_vendor_host, + } + .into()); + } + // We now make the modifications according to the config parameters to each of the cpuid entries + // declared above and then perform a compatibility check. 
+ for cpuid_option in [Some(&mut host_cpuid), host_adjusted_to_profile.as_mut()] { + let Some(cpuid) = cpuid_option else { + break; + }; + CpuidPatch::patch_cpuid(cpuid, &cpuid_patches); + + #[cfg(feature = "tdx")] + let tdx_capabilities = if config.tdx { + if use_custom_profile { + return Err(Error::CpuProfileTdxIncompatibility.into()); + } + let caps = hypervisor + .tdx_capabilities() + .map_err(Error::TdxCapabilities)?; + info!("TDX capabilities {caps:#?}"); + Some(caps) + } else { + None + }; + + // Update some existing CPUID + for entry in cpuid.as_mut_slice().iter_mut() { + match entry.function { + // Clear AMX related bits if the AMX feature is not enabled + 0x7 + if !config.amx =>{ + if entry.index == 0 { + entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); + } + if entry.index == 1 { + entry.eax &= !(1 << AMX_FP16); + entry.edx &= !(1 << AMX_COMPLEX); + } + } + + 0xd => + { + #[cfg(feature = "tdx")] + if let Some(caps) = &tdx_capabilities { + let xcr0_mask: u64 = 0x82ff; + let xss_mask: u64 = !xcr0_mask; + if entry.index == 0 { + entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); + entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); + entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; + entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; + } else if entry.index == 1 { + entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); + entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); + entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; + entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; + } + } + } + + 0x1d + // Tile Information (purely AMX related). 
+ if !config.amx =>{ + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; + } + + 0x1e + // TMUL information (purely AMX related) + if !config.amx =>{ + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; + } + + + // Copy host L1 cache details if not populated by KVM + #[allow(unused_unsafe)] + 0x8000_0005 + if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 + + // SAFETY: cpuid called with valid leaves + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 =>{ + #[allow(unused_unsafe)] + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; + } + + // Copy host L2 cache details if not populated by KVM + #[allow(unused_unsafe)] + 0x8000_0006 + if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 + + // SAFETY: cpuid called with valid leaves + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 =>{ + #[allow(unused_unsafe)] + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; + + } + // Set CPU physical bits + 0x8000_0008 => { + entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); + } + 0x4000_0001 => { + // Enable KVM_FEATURE_MSI_EXT_DEST_ID. This allows the guest to target + // device interrupts to cpus with APIC IDs > 254 without interrupt remapping. 
+ entry.eax |= 1 << KVM_FEATURE_MSI_EXT_DEST_ID; + + // These features are not supported by TDX + #[cfg(feature = "tdx")] + if config.tdx { + entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT) + | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT) + | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) + | (1 << KVM_FEATURE_ASYNC_PF_BIT) + | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT) + | (1 << KVM_FEATURE_STEAL_TIME_BIT)); + } + } + _ => {} + } + } + + if config.kvm_hyperv { + // Remove conflicting entries + cpuid.retain(|c| c.function != 0x4000_0000); + cpuid.retain(|c| c.function != 0x4000_0001); + // See "Hypervisor Top Level Functional Specification" for details + // Compliance with "Hv#1" requires leaves up to 0x4000_000a + cpuid.push(CpuIdEntry { + function: 0x40000000, + eax: 0x4000000a, // Maximum cpuid leaf + ebx: 0x756e694c, // "Linu" + ecx: 0x564b2078, // "x KV" + edx: 0x7648204d, // "M Hv" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x40000001, + eax: 0x31237648, // "Hv#1" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x40000002, + eax: 0x3839, // "Build number" + ebx: 0xa0000, // "Version" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x4000_0003, + eax: (1 << 1) // AccessPartitionReferenceCounter | (1 << 2) // AccessSynicRegs | (1 << 3) // AccessSyntheticTimerRegs | (1 << 9), // AccessPartitionReferenceTsc - edx: 1 << 3, // CPU dynamic partitioning - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x4000_0004, - eax: 1 << 5, // Recommend relaxed timing - ..Default::default() - }); - for i in 0x4000_0005..=0x4000_000a { + edx: 1 << 3, // CPU dynamic partitioning + ..Default::default() + }); cpuid.push(CpuIdEntry { - function: i, + function: 0x4000_0004, + eax: 1 << 5, // Recommend relaxed timing ..Default::default() }); + for i in 0x4000_0005..=0x4000_000a { + cpuid.push(CpuIdEntry { + function: i, + ..Default::default() + }); + } } } - Ok(cpuid) + if use_custom_profile { + // Final compatibility 
checks to ensure that the CPUID values we return are compatible both with the CPU profile and the host we are currently running on. + let host_adjusted_to_profile = host_adjusted_to_profile.expect("The profile adjusted cpuid entries should exist as we checked that we have a custom CPU profile"); + + // Check that the host's cpuid is indeed compatible with the adjusted profile. This is not by construction. + info!("checking compatibility between host adjusted to profile and the host itself"); + CpuidFeatureEntry::check_cpuid_compatibility(&host_adjusted_to_profile, &host_cpuid) + .map_err(|_| Error::CpuProfileCpuidIncompatibility)?; + Ok(host_adjusted_to_profile) + } else { + Ok(host_cpuid) + } } #[allow(clippy::too_many_arguments)] From 563b274694c5344daabfabb196e5499f0c055588 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 9 Dec 2025 17:21:01 +0100 Subject: [PATCH 122/178] arch: Include Skylake and Sapphire Rapids CPU profiles We include CPU profiles corresponding to Intel Skylake and Sapphire Rapids servers that we generated using our WIP CPU profile generation tool.
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/Cargo.toml | 2 + arch/src/x86_64/cpu_profile.rs | 22 +- .../x86_64/cpu_profiles/sapphire-rapids.json | 3002 +++++++++++++++++ .../cpu_profiles/sapphire-rapids.json.license | 3 + arch/src/x86_64/cpu_profiles/skylake.json | 2834 ++++++++++++++++ .../x86_64/cpu_profiles/skylake.json.license | 3 + 6 files changed, 5861 insertions(+), 5 deletions(-) create mode 100644 arch/src/x86_64/cpu_profiles/sapphire-rapids.json create mode 100644 arch/src/x86_64/cpu_profiles/sapphire-rapids.json.license create mode 100644 arch/src/x86_64/cpu_profiles/skylake.json create mode 100644 arch/src/x86_64/cpu_profiles/skylake.json.license diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 035eb9718c..1a4d267966 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -20,6 +20,8 @@ libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } log = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } +# We currently use this for (de-)serializing CPU profile data +serde_json = { workspace = true } thiserror = { workspace = true } uuid = { workspace = true } vm-memory = { workspace = true, features = ["backend-bitmap", "backend-mmap"] } diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs index 568046e275..4b922adba7 100644 --- a/arch/src/x86_64/cpu_profile.rs +++ b/arch/src/x86_64/cpu_profile.rs @@ -19,7 +19,9 @@ use crate::x86_64::cpuid_definitions::{Parameters, deserialize_from_hex, seriali pub enum CpuProfile { #[default] Host, + #[cfg(feature = "kvm")] Skylake, + #[cfg(feature = "kvm")] SapphireRapids, } @@ -36,16 +38,26 @@ impl CpuProfile { pub(in crate::x86_64) fn data(&self, amx: bool) -> Option { let mut data: CpuProfileData = match self { Self::Host => None, - Self::Skylake => todo!(), - Self::SapphireRapids => todo!(), + Self::Skylake => Some( + serde_json::from_slice(include_bytes!("cpu_profiles/skylake.json")) 
+ .inspect_err(|e| { + error!("BUG: could not deserialize CPU profile. Got error: {e:?}"); + }) + .expect("should be able to deserialize pre-generated data"), + ), + Self::SapphireRapids => Some( + serde_json::from_slice(include_bytes!("cpu_profiles/sapphire-rapids.json")) + .inspect_err(|e| { + error!("BUG: could not deserialize CPU profile. Got error: {e:?}"); + }) + .expect("should be able to deserialize pre-generated data"), + ), }?; if !amx { // In this case we will need to wipe out the AMX tile state components (if they are included in the profile) for adj in data.adjustments.iter_mut() { if adj.0.sub_leaf.start() != adj.0.sub_leaf.end() { - // The generated profiles produce as many sub-leaf entries as possible, and only use ranges for - // values not found. continue; } let sub_leaf = *adj.0.sub_leaf.start(); @@ -74,7 +86,7 @@ impl CpuProfile { } // This will need to be addressed before upstreaming. // We will probably need one profile per hypervisor. - unimplemented!() + unreachable!() } } diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json new file mode 100644 index 0000000000..aacb85a747 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json @@ -0,0 +1,3002 @@ +{ + "hypervisor": "Kvm", + "cpu_vendor": "Intel", + "adjustments": [ + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000020", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x756e6547", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x6c65746e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x49656e69", + "mask": 
"0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x000806f8", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ff00" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x76fa3223", + "mask": "0x80000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x078bfbff", + "mask": "0x08000000" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": 
"EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": 
"0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + 
"replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000004", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0xf1bf07ab", + "mask": "0x00002040" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x1b415f6e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0xa7c04010", + "mask": "0x18000400" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00001c30", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + 
"start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000017", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000009", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": 
"0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x000602e7", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x0000001f", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, 
+ "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000240", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": 
"0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "EAX" + }, + { + "replacements": "0x00002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "EBX" + }, + { + "replacements": 
"0x00000ac0", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "EBX" + }, + { + "replacements": "0x00000b00", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "ECX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "ECX" + }, + { + "replacements": "0x00000006", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 
0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + 
{ + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": 
"0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000017", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 1, + "end": 
4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000001", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x04002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00080040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000010", + "mask": "0x00000000" 
+ } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00004010", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 
+ }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": 
"0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x80000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": 
"0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000121", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x2c100800", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x65746e49", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x6153206c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x69687070", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x52206572", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x64697061", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000073", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 
+ }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00ffffff" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0103feff" + } + ], + [ + { + "leaf": 
"0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000001" + } + ] + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json.license b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json.license new file mode 100644 index 0000000000..7f7e3b5e1b --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 diff --git a/arch/src/x86_64/cpu_profiles/skylake.json b/arch/src/x86_64/cpu_profiles/skylake.json new file mode 100644 index 0000000000..df5e9b24a2 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.json @@ -0,0 +1,2834 @@ +{ + "hypervisor": "Kvm", + "cpu_vendor": "Intel", + "adjustments": [ + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000016", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x756e6547", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x6c65746e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x49656e69", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00050654", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ff00" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + 
"replacements": "0x76fa3223", + "mask": "0x80000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x078bfbff", + "mask": "0x08000000" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { 
+ "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": 
"0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000004", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + 
"register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0xd19f07ab", + "mask": "0x00002040" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x0000000c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0xa4000000", + "mask": "0x18000400" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": 
"0x00000009", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + 
"replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x000002e7", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x0000000f", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000240", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { 
+ "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } 
+ ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "ECX" + }, + { + 
"replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + 
"start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + 
], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + 
"replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000017", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": 
"0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + 
}, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { 
+ "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x80000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000121", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x2c100800", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x65746e49", + "mask": "0x00000000" + } 
+ ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x6b53206c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x6b616c79", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000065", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + 
"replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00ffffff" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0103feff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000001" + } + ] + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.json.license b/arch/src/x86_64/cpu_profiles/skylake.json.license new file mode 100644 index 0000000000..7f7e3b5e1b --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 From 
fd04dca65618d0187454f78b7ed5ec7bb751fa84 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 10 Dec 2025 11:00:16 +0100 Subject: [PATCH 123/178] arch: CPUID definitions data structures We introduce data structures to describe values within the registers modified by the CPUID instruction. These data structures will later be used by the upcoming CPU profile generation tool. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/mod.rs | 74 ++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs index 75e316ceaa..61700298bf 100644 --- a/arch/src/x86_64/cpuid_definitions/mod.rs +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -44,6 +44,80 @@ pub struct Parameters { pub register: CpuidReg, } +/// Describes a policy for how the corresponding CPUID data should be considered when building +/// a CPU profile. +/// +/// This enum is mostly intended for the CPU profile generation tool, but its debug representation +/// might also appear in logs if/when CPUID compatibility checks fail at runtime. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum ProfilePolicy { + /// Store the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will be set into the modified + /// CPUID instruction(s). + Inherit, + /// Ignore the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will then instead get + /// extracted from the host. + /// + /// This variant is typically set for data that has no effect on migration compatibility, + /// but there may be some exceptions such as data which is necessary to run the VM at all, + /// but must coincide with whatever is on the host. + Passthrough, + /// Set the following hardcoded value in the CPU profile.
+ /// + /// This variant is typically used for features/values that don't work well with live migration (even when using the exact same physical CPU model). + Static(u32), +} + +/// A description of a range of bits in a register populated by the CPUID instruction with specific parameters. +#[derive(Clone, Copy, Debug)] +pub struct ValueDefinition { + /// A short name for the value obtainable through CPUID + pub short: &'static str, + /// A description of the value obtainable through CPUID + pub description: &'static str, + /// The range of bits in the output register corresponding to this feature or value. + /// + /// This is not a `RangeInclusive` because that type does unfortunately not implement `Copy`. + pub bits_range: (u8, u8), + /// The policy corresponding to this value when building CPU profiles. + pub policy: ProfilePolicy, +} + +/// Describes values within a register populated by the CPUID instruction with specific parameters. +/// +/// NOTE: The only way to interact with this value (beyond this crate) is via the const [`Self::as_slice()`](Self::as_slice) method. +pub struct ValueDefinitions(&'static [ValueDefinition]); +impl ValueDefinitions { + /// Constructor permitting at most 32 entries. + const fn new(cpuid_descriptions: &'static [ValueDefinition]) -> Self { + // Note that this function is only called within this module, at compile time, hence it is fine to have some + // additional sanity checks such as the following assert. + assert!(cpuid_descriptions.len() <= 32); + Self(cpuid_descriptions) + } + /// Converts this into a slice representation. This is the only way to read values of this type. + pub const fn as_slice(&self) -> &'static [ValueDefinition] { + self.0 + } +} + +/// Describes multiple CPUID outputs. +/// +/// Each wrapped [`ValueDefinitions`] corresponds to the given [`Parameters`] in the same tuple. 
+/// +pub struct CpuidDefinitions( + [(Parameters, ValueDefinitions); NUM_PARAMETERS], +); + +impl CpuidDefinitions { + pub const fn as_slice(&self) -> &[(Parameters, ValueDefinitions); NUM_PARAMETERS] { + &self.0 + } +} + #[cfg(test)] mod tests { use proptest::prelude::*; From a8f9757f9d812a51421c890e2733ed8d7e06af24 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 10 Dec 2025 10:40:49 +0100 Subject: [PATCH 124/178] arch: Intel CPUID definitions We introduce CPUID definitions for Intel CPUs that will be utilized by the upcoming CPU Profile generation tool. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .typos.toml | 4 + arch/src/x86_64/cpuid_definitions/intel.rs | 4779 ++++++++++++++++++++ arch/src/x86_64/cpuid_definitions/mod.rs | 2 + 3 files changed, 4785 insertions(+) create mode 100644 arch/src/x86_64/cpuid_definitions/intel.rs diff --git a/.typos.toml b/.typos.toml index 5dff00d28c..063770942f 100644 --- a/.typos.toml +++ b/.typos.toml @@ -26,3 +26,7 @@ fo = "fo" fpr = "fpr" # Public Linux API msg_controllen = "msg_controllen" +tme = "tme" +l3c_qm_conver_factor = "l3c_qm_conver_factor" +IA32_PMC_GPn_CFG_C = "IA32_PMC_GPn_CFG_C" +IA32_PMC_FXm_CFG_C = "IA32_PMC_FXm_CFG_C" diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs new file mode 100644 index 0000000000..96969c1f96 --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -0,0 +1,4779 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module contains CPUID definitions for Intel CPUs. 
+use std::ops::RangeInclusive; + +use super::{ + CpuidDefinitions, CpuidReg, Parameters, ProfilePolicy, ValueDefinition, ValueDefinitions, +}; + +/// Contains CPUID definitions described in "Intel Architecture Instruction Set Extensions and Future Features" +/// +/// ## Missing leaves +/// +/// The following known CPUID leaves are left out of this table: +/// - 0x3 (Only relevant for Intel Pentium III), +/// - 0x12 (Only relevant for SGX which is deprecated), +/// - 0x19 (Key locker leaf. These features are not in scope for CPU profiles for the time being), +/// - 0x1a (Native Model ID Enumeration leaf), +/// - 0x1b (PCONFIG Information Sub-leaf. This is not in scope for CPU profiles for the time being), +/// - 0x27 (L3 Cache Intel RDT Monitoring Capability Asymmetric Enumeration), +/// - 0x28 (Intel Resource Director Technology Allocation Asymmetric Enumeration), +/// - 0x21 (Only relevant for Intel TDX which is not in scope for CPU profiles for the time being), +/// - 0x40000000 - 0x4FFFFFFF (Reserved for hypervisors), +/// +/// ### How we produced this table +/// +/// We first ran the [`cpuidgen` tool](https://gitlab.com/x86-cpuid.org/x86-cpuid-db), whose +/// output is licensed under the SPDX Creative Commons Zero 1.0 Universal License. We then wrote a +/// throw-away Rust script to modify the output into something more similar to Rust code. Following +/// this we used macros and other functionality in the [Helix editor](https://helix-editor.com/) to +/// get actual Rust code. +/// +/// We then read through the CPUID section (1.4) of the Intel Architecture Instruction Set +/// Extensions and Future Features manual and manually inserted several leaf definitions that +/// we noticed were missing from the table we had produced. During this process we also changed +/// a few of the short names and descriptions to be more in line with what is written in the +/// aforementioned Intel manual.
Finally we decided on a [`ProfilePolicy`] to be set for every +/// single [`ValueDefinition`] and manually appended those. +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { + CpuidDefinitions([ + // ========================================================================================= + // Basic CPUID Information + // ========================================================================================= + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_std_leaf", + description: "Maximum Input value for Basic CPUID Information", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_0", + description: "CPU vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_2", + description: "CPU vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_1", + description: "CPU vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // TODO: Do we really want to inherit these values from the corresponding CPU, or should we zero it out or set something else here? 
+ ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "stepping", + description: "Stepping ID", + bits_range: (0, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "base_model", + description: "Base CPU model ID", + bits_range: (4, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "base_family_id", + description: "Base CPU family ID", + bits_range: (8, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cpu_type", + description: "CPU type", + bits_range: (12, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ext_model", + description: "Extended CPU model ID", + bits_range: (16, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ext_family", + description: "Extended CPU family ID", + bits_range: (20, 27), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "brand_id", + description: "Brand index", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "clflush_size", + description: "CLFLUSH instruction cache line size", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + // This is set by cloud hypervisor + ValueDefinition { + short: "n_logical_cpu", + description: "Logical CPU count", + bits_range: (16, 23), + policy: ProfilePolicy::Static(0), + }, + // This is set by cloud hypervisor + ValueDefinition { + short: "local_apic_id", + description: "Initial local APIC physical ID", + bits_range: (24, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "sse3", + description: "Streaming 
SIMD Extensions 3 (SSE3)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pclmulqdq", + description: "PCLMULQDQ instruction support", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "dtes64", + description: "64-bit DS save area", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "monitor", + description: "MONITOR/MWAIT support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ds_cpl", + description: "CPL Qualified Debug Store", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + // TODO: Ideally configurable by the user (host must have this otherwise CHV will not run) + ValueDefinition { + short: "vmx", + description: "Virtual Machine Extensions", + bits_range: (5, 5), + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "smx", + description: "Safer Mode Extensions", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "est", + description: "Enhanced Intel SpeedStep", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tm2", + description: "Thermal Monitor 2", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ssse3", + description: "Supplemental SSE3", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "cnxt_id", + description: "L1 Context ID", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "sdbg", + description: "Silicon Debug", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fma", + description: "FMA extensions using YMM state", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cx16", + description: "CMPXCHG16B instruction support", + bits_range: (13, 
13), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "xtpr", + description: "xTPR Update Control", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pdcm", + description: "Perfmon and Debug Capability", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pcid", + description: "Process-context identifiers", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "dca", + description: "Direct Cache Access", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse4_1", + description: "SSE4.1", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse4_2", + description: "SSE4.2", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + // Set by Cloud hypervisor + ValueDefinition { + short: "x2apic", + description: "X2APIC support", + bits_range: (21, 21), + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "movbe", + description: "MOVBE instruction support", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "popcnt", + description: "POPCNT instruction support", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + // Set by Cloud hypervisor + ValueDefinition { + short: "tsc_deadline_timer", + description: "APIC timer one-shot operation", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aes", + description: "AES instructions", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xsave", + description: "XSAVE (and related instructions) support", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "osxsave", + description: "XSAVE (and related instructions) are enabled by OS", + bits_range: (27, 27), + policy: 
ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx", + description: "AVX instructions support", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "f16c", + description: "Half-precision floating-point conversion support", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rdrand", + description: "RDRAND instruction support", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + // TODO: If set by CHV set to 0 and write comment + ValueDefinition { + short: "guest_status", + description: "System is running as guest; (para-)virtualized system", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "fpu", + description: "Floating-Point Unit on-chip (x87)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vme", + description: "Virtual-8086 Mode Extensions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "de", + description: "Debugging Extensions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pse", + description: "Page Size Extension", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tsc", + description: "Time Stamp Counter", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "msr", + description: "Model-Specific Registers (RDMSR and WRMSR support)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pae", + description: "Physical Address Extensions", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mce", + description: "Machine Check Exception", + bits_range: (7, 7), + policy: 
ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cx8", + description: "CMPXCHG8B instruction", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "apic", + description: "APIC on-chip", + bits_range: (9, 9), + policy: ProfilePolicy::Static(1), + }, + // MSR related + ValueDefinition { + short: "sep", + description: "SYSENTER, SYSEXIT, and associated MSRs", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mtrr", + description: "Memory Type Range Registers", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pge", + description: "Page Global Extensions", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mca", + description: "Machine Check Architecture", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cmov", + description: "Conditional Move Instruction", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pat", + description: "Page Attribute Table", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pse36", + description: "Page Size Extension (36-bit)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "psn", + description: "Processor Serial Number", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "clfsh", + description: "CLFLUSH instruction", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ds", + description: "Debug Store", + bits_range: (21, 21), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "acpi", + description: "Thermal monitor and clock control", + bits_range: (22, 22), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mmx", + description: "MMX instructions", + bits_range: (23, 23), + 
policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fxsr", + description: "FXSAVE and FXRSTOR instructions", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse", + description: "SSE instructions", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse2", + description: "SSE2 instructions", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ss", + description: "Self Snoop", + bits_range: (27, 27), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "htt", + description: "Hyper-threading", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tm", + description: "Thermal Monitor", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pbe", + description: "Pending Break Enable", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // ========================================================================================= + // Cache and TLB Information + // ========================================================================================= + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "iteration_count", + description: "Number of times this leaf must be queried", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc1", + description: "Descriptor #1", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc2", + description: "Descriptor #2", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc3", + description: "Descriptor #3", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: 
"eax_invalid", + description: "Descriptors 1-3 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc4", + description: "Descriptor #4", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc5", + description: "Descriptor #5", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc6", + description: "Descriptor #6", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc7", + description: "Descriptor #7", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "ebx_invalid", + description: "Descriptors 4-7 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc8", + description: "Descriptor #8", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc9", + description: "Descriptor #9", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc10", + description: "Descriptor #10", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc11", + description: "Descriptor #11", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "ecx_invalid", + description: "Descriptors 8-11 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition 
{ + short: "desc12", + description: "Descriptor #12", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc13", + description: "Descriptor #13", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc14", + description: "Descriptor #14", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc15", + description: "Descriptor #15", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "edx_invalid", + description: "Descriptors 12-15 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // ========================================================================================= + // Deterministic Cache Parameters + // ========================================================================================= + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cache_type", + description: "Cache type field", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_level", + description: "Cache level (1-based)", + bits_range: (5, 7), + policy: ProfilePolicy::Passthrough, + }, + // TODO: Could there be a problem migrating from a CPU with self-initializing cache to one without? 
+ ValueDefinition { + short: "cache_self_init", + description: "Self-initializing cache level", + bits_range: (8, 8), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "fully_associative", + description: "Fully-associative cache", + bits_range: (9, 9), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "num_threads_sharing", + description: "Number logical CPUs sharing this cache", + bits_range: (14, 25), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "num_cores_on_die", + description: "Number of cores in the physical package", + bits_range: (26, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cache_linesize", + description: "System coherency line size (0-based)", + bits_range: (0, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_npartitions", + description: "Physical line partitions (0-based)", + bits_range: (12, 21), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_nways", + description: "Ways of associativity (0-based)", + bits_range: (22, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cache_nsets", + description: "Cache number of sets (0-based)", + bits_range: (0, 30), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "wbinvd_rll_no_guarantee", + description: "WBINVD/INVD not guaranteed for Remote Lower-Level caches", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: 
"ll_inclusive", + description: "Cache is inclusive of Lower-Level caches", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "complex_indexing", + description: "Not a direct-mapped cache (complex function)", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // ========================================================================================= + // MONITOR/MWAIT + // ========================================================================================= + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "min_mon_size", + description: "Smallest monitor-line size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_mon_size", + description: "Largest monitor-line size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "mwait_ext", + description: "Enumeration of MONITOR/MWAIT extensions is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mwait_irq_break", + description: "Interrupts as a break-event for MWAIT is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "n_c0_substates", + description: "Number of C0 sub C-states supported using MWAIT", + bits_range: (0, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c1_substates", + description: 
"Number of C1 sub C-states supported using MWAIT", + bits_range: (4, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c2_substates", + description: "Number of C2 sub C-states supported using MWAIT", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c3_substates", + description: "Number of C3 sub C-states supported using MWAIT", + bits_range: (12, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c4_substates", + description: "Number of C4 sub C-states supported using MWAIT", + bits_range: (16, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c5_substates", + description: "Number of C5 sub C-states supported using MWAIT", + bits_range: (20, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c6_substates", + description: "Number of C6 sub C-states supported using MWAIT", + bits_range: (24, 27), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c7_substates", + description: "Number of C7 sub C-states supported using MWAIT", + bits_range: (28, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // ========================================================================================= + // Thermal and Power Management + // ========================================================================================= + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "dtherm", + description: "Digital temperature sensor", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "turbo_boost", + description: "Intel Turbo Boost", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "arat", + description: "Always-Running APIC Timer (not affected by p-state)", + bits_range: (2, 2), + // The timer is emulated by KVM 
and thus always always-running :) + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "pln", + description: "Power Limit Notification (PLN) event", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ecmd", + description: "Clock modulation duty cycle extension", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pts", + description: "Package thermal management", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp", + description: "HWP (Hardware P-states) base registers are supported", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_notify", + description: "HWP notification (IA32_HWP_INTERRUPT MSR)", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_act_window", + description: "HWP activity window (IA32_HWP_REQUEST[bits 41:32]) supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_epp", + description: "HWP Energy Performance Preference", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_pkg_req", + description: "HWP Package Level Request", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hdc_base_regs", + description: "HDC base registers are supported", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "turbo_boost_3_0", + description: "Intel Turbo Boost Max 3.0", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_capabilities", + description: "HWP Highest Performance change", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_peci_override", + description: "HWP PECI override", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + 
ValueDefinition { + short: "hwp_flexible", + description: "Flexible HWP", + bits_range: (17, 17), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_fast", + description: "IA32_HWP_REQUEST MSR fast access mode", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hfi", + description: "HW_FEEDBACK MSRs supported", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_ignore_idle", + description: "Ignoring idle logical CPU HWP req is supported", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "thread_director", + description: "Intel thread director support", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "therm_interrupt_bit25", + description: "IA32_THERM_INTERRUPT MSR bit 25 is supported", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "n_therm_thresholds", + description: "Digital thermometer thresholds", + bits_range: (0, 3), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "aperfmperf", + description: "MPERF/APERF MSRs (effective frequency interface)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "epb", + description: "IA32_ENERGY_PERF_BIAS MSR support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "thrd_director_nclasses", + description: "Number of classes, Intel thread director", + bits_range: (8, 15), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: 
RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "perfcap_reporting", + description: "Performance capability reporting", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "encap_reporting", + description: "Energy efficiency capability reporting", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "feedback_sz", + description: "Feedback interface structure size, in 4K pages", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "this_lcpu_hwfdbk_idx", + description: "This logical CPU hardware feedback interface index", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "leaf7_n_subleaves", + description: "Number of leaf 0x7 subleaves", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "fsgsbase", + description: "FSBASE/GSBASE read/write support", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "tsc_adjust", + description: "IA32_TSC_ADJUST MSR supported", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // SGX is deprecated so we disable it unconditionally for all CPU profiles + ValueDefinition { + short: "sgx", + description: "Intel SGX (Software Guard 
Extensions)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "bmi1", + description: "Bit manipulation extensions group 1", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + // TSX related which is riddled with CVEs. Consider two profiles, or making it opt-in/out. QEMU always has a CPU model with and without TSX. + ValueDefinition { + short: "hle", + description: "Hardware Lock Elision", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx2", + description: "AVX2 instruction set", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + /*The KVM docs recommend always setting this (https://docs.kernel.org/virt/kvm/x86/errata.html#kvm-get-supported-cpuid-issues). + + Keep in mind however that in my limited understanding this isn't about enabling or disabling a feature, but it describes critical behaviour. + Hence I am wondering whether it should be a hard error if the host does not have this bit set, but the desired CPU profile does? + + TODO: Check what KVM_GET_SUPPORTED_CPUID actually gives here (on the Skylake server) + */ + ValueDefinition { + short: "fdp_excptn_only", + description: "FPU Data Pointer updated only on x87 exceptions", + bits_range: (6, 6), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "smep", + description: "Supervisor Mode Execution Protection", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "bmi2", + description: "Bit manipulation extensions group 2", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "erms", + description: "Enhanced REP MOVSB/STOSB", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + /* + The instruction enabled by this seems rather powerful. Are we sure that doesn't have security implications? + I included this because it seems like QEMU does (to the best of my understanding). 
+ */ + ValueDefinition { + short: "invpcid", + description: "INVPCID instruction (Invalidate Processor Context ID)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + // This is TSX related. TSX is riddled with CVEs: Consider two profiles (one with it disabled) or an opt-in/out feature. + ValueDefinition { + short: "rtm", + description: "Intel restricted transactional memory", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "rdt_m", + description: "Supports Intel Resource Director Technology Monitoring Capability if 1", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + // The KVM docs recommend always setting this (https://docs.kernel.org/virt/kvm/x86/errata.html#kvm-get-supported-cpuid-issues). TODO: Is it OK to just set this to 1? + ValueDefinition { + short: "zero_fcs_fds", + description: "Deprecates FPU CS and FPU DS values if 1", + bits_range: (13, 13), + policy: ProfilePolicy::Passthrough, + }, + // This has been deprecated + ValueDefinition { + short: "mpx", + description: "Intel memory protection extensions", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + // This might be useful for certain high performance applications, but it also seems like a rather niche and advanced feature. QEMU does also not automatically enable this from what we can tell. + // TODO: Should we make this OPT-IN? + ValueDefinition { + short: "rdt_a", + description: "Intel RDT-A. Supports Intel Resource Director Technology Allocation Capability if 1", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + // TODO: Do the wider avx512 zmm registers work out of the box when the hardware supports it? 
+ ValueDefinition { + short: "avx512f", + description: "AVX-512 foundation instructions", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512dq", + description: "AVX-512 double/quadword instructions", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rdseed", + description: "RDSEED instruction", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "adx", + description: "ADCX/ADOX instructions", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "smap", + description: "Supervisor mode access prevention", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512ifma", + description: "AVX-512 integer fused multiply add", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "clflushopt", + description: "CLFLUSHOPT instruction", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "clwb", + description: "CLWB instruction", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "intel_pt", + description: "Intel processor trace", + bits_range: (25, 25), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512pf", + description: "AVX-512 prefetch instructions", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512er", + description: "AVX-512 exponent/reciprocal instructions", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512cd", + description: "AVX-512 conflict detection instructions", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sha_ni", + description: "SHA/SHA256 instructions", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512bw", + 
description: "AVX-512 byte/word instructions", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512vl", + description: "AVX-512 VL (128/256 vector length) extensions", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "prefetchwt1", + description: "PREFETCHWT1 (Intel Xeon Phi only)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512vbmi", + description: "AVX-512 Vector byte manipulation instructions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // Also set by QEMU for CPU models from what we can tell + ValueDefinition { + short: "umip", + description: "User mode instruction protection", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // Also set by QEMU for CPU models from what we can tell + ValueDefinition { + short: "pku", + description: "Protection keys for user-space", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ospke", + description: "OS protection keys enable", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "waitpkg", + description: "WAITPKG instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_vbmi2", + description: "AVX-512 vector byte manipulation instructions group 2", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cet_ss", + description: "CET shadow stack features", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "gfni", + description: "Galois field new instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vaes", + description: "Vector AES instructions", + 
bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vpclmulqdq", + description: "VPCLMULQDQ 256-bit instruction support", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_vnni", + description: "Vector neural network instructions", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_bitalg", + description: "AVX-512 bitwise algorithms", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + // Seems to be TDX related which is experimental in CHV. We disable this for CPU profiles for now, but could potentially add it as an opt-in feature eventually. + ValueDefinition { + short: "tme", + description: "Intel total memory encryption", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_vpopcntdq", + description: "AVX-512: POPCNT for vectors of DWORD/QWORD", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "la57", + description: "57-bit linear addresses (five-level paging)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mawau_val_lm", + description: "BNDLDX/BNDSTX MAWAU value in 64-bit mode", + bits_range: (17, 21), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "rdpid", + description: "RDPID instruction", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + // We leave key locker support out for CPU profiles for the time being. 
We may want this to be opt-in in the future though + ValueDefinition { + short: "key_locker", + description: "Intel key locker support", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "bus_lock_detect", + description: "OS bus-lock detection", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cldemote", + description: "CLDEMOTE instruction", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movdiri", + description: "MOVDIRI instruction", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movdir64b", + description: "MOVDIR64B instruction", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "enqcmd", + description: "Enqueue stores supported (ENQCMD{,S})", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + // SGX support is deprecated so we disable it unconditionally for CPU profiles + ValueDefinition { + short: "sgx_lc", + description: "Intel SGX launch configuration", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pks", + description: "Protection keys for supervisor-mode pages", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // SGX is deprecated + ValueDefinition { + short: "sgx_keys", + description: "Intel SGX attestation services", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_4vnniw", + description: "AVX-512 neural network instructions (Intel Xeon Phi only)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_4fmaps", + description: "AVX-512 multiply accumulation single precision (Intel Xeon Phi only)", + bits_range: (3, 3), + 
policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fsrm", + description: "Fast short REP MOV", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "uintr", + description: "CPU supports user interrupts", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_vp2intersect", + description: "VP2INTERSECT{D,Q} instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "srdbs_ctrl", + description: "SRBDS mitigation MSR available: If 1, enumerates support for the IA32_MCU_OPT_CTRL MSR and indicates that its bit 0 (RNGDS_MITG_DIS) is also supported.", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "md_clear", + description: "VERW MD_CLEAR microcode support", + bits_range: (10, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "rtm_always_abort", + description: "XBEGIN (RTM transaction) always aborts", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tsx_force_abort", + description: "MSR TSX_FORCE_ABORT, RTM_ABORT bit, supported", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "serialize", + description: "SERIALIZE instruction", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "hybrid_cpu", + description: "The CPU is identified as a 'hybrid part'", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit, + }, + // TODO: This is TSX related which is riddled with CVEs. We could consider an additional profile enabling TSX in the future, but we leave it out for now. 
+ ValueDefinition { + short: "tsxldtrk", + description: "TSX suspend/resume load address tracking", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + // Might be relevant for confidential computing + ValueDefinition { + short: "pconfig", + description: "PCONFIG instruction", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "arch_lbr", + description: "Intel architectural LBRs", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ibt", + description: "CET indirect branch tracking", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_bf16", + description: "AMX-BF16: tile bfloat16 support", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_fp16", + description: "AVX-512 FP16 instructions", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tile", + description: "AMX-TILE: tile architecture support", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_int8", + description: "AMX-INT8: tile 8-bit integer support", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "spec_ctrl", + description: "Speculation Control (IBRS/IBPB: indirect branch restrictions)", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "intel_stibp", + description: "Single thread indirect branch predictors", + bits_range: (27, 27), + policy: ProfilePolicy::Passthrough, + }, + // MSR related + ValueDefinition { + short: "flush_l1d", + description: "FLUSH L1D cache: IA32_FLUSH_CMD MSR", + bits_range: (28, 28), + policy: ProfilePolicy::Passthrough, + }, + // MSR related + ValueDefinition { + short: "arch_capabilities", + description: "Intel IA32_ARCH_CAPABILITIES MSR", + bits_range: (29, 
29), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "core_capabilities", + description: "IA32_CORE_CAPABILITIES MSR", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "spec_ctrl_ssbd", + description: "Speculative store bypass disable", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Sub-Leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "sha512", + description: "SHA-512 extensions", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sm3", + description: "SM3 instructions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sm4", + description: "SM4 instructions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // RAO-INT is deprecated and removed from many compilers as far as we are aware. + // This policy can be changed if requested in the future. 
+ ValueDefinition { + short: "RAO-INT", + description: "RAO-INT instructions", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx_vnni", + description: "AVX-VNNI instructions", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_bf16", + description: "AVX-512 bfloat16 instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + /* + Not set in QEMU from what we can tell, but according seems to be fine to expose this to guests + if we understood https://www.phoronix.com/news/Intel-Linux-LASS-KVM correctly. It is also + our understanding that this feature can enable guests opting in to more security (possibly at the cost of some performance). + */ + ValueDefinition { + short: "lass", + description: "Linear address space separation", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cmpccxadd", + description: "CMPccXADD instructions", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "arch_perfmon_ext", + description: "ArchPerfmonExt: leaf 0x23 is supported", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fzrm", + description: "Fast zero-length REP MOVSB", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fsrs", + description: "Fast short REP STOSB", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fsrc", + description: "Fast Short REP CMPSB/SCASB", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fred", + description: "FRED: Flexible return and event delivery transitions", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lkgs", + description: "LKGS: Load 'kernel' (userspace) GS", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + 
short: "wrmsrns", + description: "WRMSRNS instruction (WRMSR-non-serializing)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "nmi_src", + description: "NMI-source reporting with FRED event data", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "amx_fp16", + description: "AMX-FP16: FP16 tile operations", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "hreset", + description: "History reset support", + bits_range: (22, 22), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx_ifma", + description: "Integer fused multiply add", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lam", + description: "Linear address masking", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rd_wr_msrlist", + description: "RDMSRLIST/WRMSRLIST instructions", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "invd_disable_post_bios_done", + description: "If 1, supports INVD execution prevention after BIOS Done", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movrs", + description: "MOVRS", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "intel_ppin", + description: "Protected processor inventory number (PPIN{,_CTL} MSRs)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pbndkb", + description: "PBNDKB instruction supported and enumerates the existence of the IA32_TSE_CAPABILITY MSR", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: 
RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "asymmetric-rdt-M", + description: "At least one logical processor supports Asymmetrical Intel RDT Monitoring Capability", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "asymmetric-rdt-A", + description: "At least one logical processor supports Asymmetrical Intel RDT Allocation Capability", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "MSR_IMM", + description: "Immediate forms of the RDMSR and WRMSRNS instructions are supported", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "avx_vnni_int8", + description: "AVX-VNNI-INT8 instructions", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_ne_convert", + description: "AVX-NE-CONVERT instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles as the value will be zeroed out if the user has not opted in for "amx" via CpuFeatures. 
+ ValueDefinition { + short: "amx_complex", + description: "AMX-COMPLEX instructions (starting from Granite Rapids)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_vnni_int16", + description: "AVX-VNNI-INT16 instructions", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "utmr", + description: "If 1, supports user-timer events", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "prefetchit_0_1", + description: "PREFETCHIT0/1 instructions", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "user_msr", + description: "If 1, supports the URDMSR and UWRMSR instructions", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "uiret_uif", + description: "If 1, UIRET sets UIF to the value of bit 1 of the RFLAGS image loaded from the stack", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cet_sss", + description: "CET supervisor shadow stacks safe to use", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx10", + description: "If 1, supports the Intel AVX10 instructions and indicates the presence of leaf 0x24", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "apx_f", + description: "If 1, the processor provides foundational support for Intel Advanced Performance Extensions", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mwait", + description: "If 1, MWAIT is supported even if (0x1 ECX bit 3 (monitor) is enumerated as 0)", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "slsm", + description: "If 1, indicates bit 0 of the IA32_INTEGRITY_STATUS MSR is supported. 
Bit 0 of this MSR indicates whether static lockstep is active on this logical processor", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Sub-Leaf 2 + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "intel_psfd", + description: "If 1, indicates bit 7 of the IA32_SPEC_CTRL_MSR is supported. Bit 7 of this MSR disables fast store forwarding predictor without disabling speculative store bypass", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "ipred_ctrl", + description: "MSR bits IA32_SPEC_CTRL.IPRED_DIS_{U,S}", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "rrsba_ctrl", + description: "MSR bits IA32_SPEC_CTRL.RRSBA_DIS_{U,S}", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "ddp_ctrl", + description: "MSR bit IA32_SPEC_CTRL.DDPD_U", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "bhi_ctrl", + description: "MSR bit IA32_SPEC_CTRL.BHI_DIS_S", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "mcdt_no", + description: "MCDT mitigation not needed", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "uclock_disable", + description: "UC-lock disable is supported", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // 
=================================================================================================================== + // Direct Cache Access Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x9, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "dca_cap_msr_value", + description: "Value of bits [31:0] of IA32_PLATFORM_DCA_CAP MSR (address 1f8H)", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring + // =================================================================================================================== + // We will just zero out everything to do with PMU for CPU profiles + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "pmu_version", + description: "Performance monitoring unit version ID", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_n_gcounters", + description: "Number of general PMU counters per logical CPU", + bits_range: (8, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_gcounters_nbits", + description: "Bitwidth of PMU general counters", + bits_range: (16, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_cpuid_ebx_bits", + description: "Length of leaf 0xa EBX bit vector", + bits_range: (24, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "no_core_cycle_evt", + description: 
"Core cycle event not available", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_insn_retired_evt", + description: "Instruction retired event not available", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_refcycle_evt", + description: "Reference cycles event not available", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_llc_ref_evt", + description: "LLC-reference event not available", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_llc_miss_evt", + description: "LLC-misses event not available", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_br_insn_ret_evt", + description: "Branch instruction retired event not available", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_br_mispredict_evt", + description: "Branch mispredict retired event not available", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_td_slots_evt", + description: "Topdown slots event not available", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_fcounters_bitmap", + description: "Fixed-function PMU counters support bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "pmu_n_fcounters", + description: "Number of fixed PMU counters", + bits_range: (0, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_fcounters_nbits", + description: "Bitwidth of PMU fixed counters", + 
bits_range: (5, 12), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "anythread_depr", + description: "AnyThread deprecation", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Extended Topology Enumeration + // =================================================================================================================== + + // Leaf 0xB must be set by CHV itself (and do all necessary checks) + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id_shift", + description: "Bit width of this level (previous levels inclusive)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "domain_lcpus_count", + description: "Logical CPUs count across all instances of this domain", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "domain_nr", + description: "This domain level (subleaf ID)", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "domain_type", + description: "This domain type", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id", + description: "x2APIC ID of 
current logical CPU", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Main Leaf + // =================================================================================================================== + // TODO: Implement CPUID compatibility checks in CHV for this leaf + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "xcr0_x87", + description: "XCR0.X87 (bit 0) supported", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_sse", + description: "XCR0.SEE (bit 1) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx", + description: "XCR0.AVX (bit 2) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // MPX is deprecated + ValueDefinition { + short: "xcr0_mpx_bndregs", + description: "XCR0.BNDREGS (bit 3) supported (MPX BND0-BND3 registers)", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + // MPX is deprecated + ValueDefinition { + short: "xcr0_mpx_bndcsr", + description: "XCR0.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS registers)", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_avx512_opmask", + description: "XCR0.OPMASK (bit 5) supported (AVX-512 k0-k7 registers)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx512_zmm_hi256", + description: "XCR0.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 registers)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx512_hi16_zmm", + description: "XCR0.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 registers)", + bits_range: (7, 7), 
+ policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "xcr0_ia32_xss", + description: "XCR0.IA32_XSS (bit 8) used for IA32_XSS", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_pkru", + description: "XCR0.PKRU (bit 9) supported (XSAVE PKRU registers)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_ia32_xss_bits", + description: "XCR0.IA32_XSS (bit 10 - 16) used for IA32_XSS", + bits_range: (10, 16), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. + ValueDefinition { + short: "xcr0_tileconfig", + description: "XCR0.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. 
+ ValueDefinition { + short: "xcr0_tiledata", + description: "XCR0.TILEDATA (bit 18) supported (AMX can manage TILEDATA)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + // This value can be changed by the OS and must thus be passthrough + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz_xcr0_enabled", + description: "XSAVE/XRSTOR area byte size, for XCR0 enabled features", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + // This may be passthrough because we restrict each individual state component + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz_max", + description: "XSAVE/XRSTOR area max byte size, all CPU features", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + // TODO: Do we know of any state components corresponding to the upper bits in XCR0? Perhaps it would be + // better to have `ProfilePolicy::Static(0)` here? 
+ ValueDefinitions::new(&[ValueDefinition { + short: "xcr0_upper_bits", + description: "Reports the valid bit fields of the upper 32 bits of the XCR0 register", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "xsaveopt", + description: "XSAVEOPT instruction", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xsavec", + description: "XSAVEC instruction", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xgetbv1", + description: "XGETBV instruction with ECX = 1", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // TODO: Can this have security implications in terms of supervisor state getting exposed? + ValueDefinition { + short: "xsaves", + description: "XSAVES/XRSTORS instructions (and XSS MSR)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd", + description: "Extended feature disable support", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + /*NOTE: This will depend on which CPU features (in CHV) are enabled and pre-computation can potentially lead to a combinatorial explosion. 
Luckily we can deal with each component (and its size) separately, hence we can just passthrough whatever we get from the host here.*/ + ValueDefinition { + short: "xsave_sz_xcr0_xmms_enabled", + description: "XSAVE area size, all XCR0 and IA32_XSS features enabled", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + /* Reports the supported bits of the lower IA32_XSS MSR. IA32_XSS[n] can be set to 1 only if ECX[n] = 1*/ + ValueDefinitions::new(&[ + ValueDefinition { + short: "xcr0_7bits", + description: "Used for XCR0", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_pt", + description: "PT state, supported", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_bit9", + description: "Used for XCR0", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_pasid", + description: "PASID state, supported", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_cet_u", + description: "CET user state, supported", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_cet_p", + description: "CET supervisor state, supported", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_hdc", + description: "HDC state, supported", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_uintr", + description: "UINTR state, supported", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_lbr", + description: "LBR state, supported", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_hwp", + description: "HWP state, supported", + bits_range: (16, 16), + policy: 
ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_bits", + description: "Used for XCR0", + bits_range: (17, 18), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + /* Reports the supported bits of the upper 32 bits of the IA32_XSS MSR. IA32_XSS[n + 32 ] can be set to 1 only if EDX[n] = 1*/ + ValueDefinitions::new(&[ValueDefinition { + short: "ia32_xss_upper", + description: " Reports the supported bits of the upper 32 bits of the IA32_XSS MSR. IA32_XSS[n + 32 ] can be set to 1 only if EDX[n] = 1", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Sub-leaves + // =================================================================================================================== + + /* LEAF 0xd sub-leaf n >=2 : + If ECX contains an invalid sub-leaf index, EAX/EBX/ECX/EDX return 0. Sub-leaf n (0 ≤ n ≤ 31) is + invalid if sub-leaf 0 returns 0 in EAX[n] and sub-leaf 1 returns 0 in ECX[n]. Sub-leaf n (32 ≤ n ≤ 63) + is invalid if sub-leaf 0 returns 0 in EDX[n-32] and sub-leaf 1 returns 0 in EDX[n-32]. 
+ */ + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // Intel MPX is deprecated hence we zero out these sub-leaves + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-eax-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-ebx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + 
policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-ecx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-edx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // NOTE: Sub-leaves 17 & 18 are AMX related and we will alter the adjustments corresponding to + // the policy declared here at runtime for those values. + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 63), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 63), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 63), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: 
ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Monitoring Enumeration + // =================================================================================================================== + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "core_rmid_max", + description: "RMID max, within this core, all types (0-based)", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3-cache-rdt-monitoring", + description: "Supports L3 Cache Intel RDT Monitoring if 1", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Monitoring Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "l3c_qm_bitwidth", + description: "L3 QoS-monitoring counter bitwidth (24-based)", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_overflow_bit", + description: "QM_CTR MSR bit 61 is an overflow bit", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_non_cpu_agent", + description: "If 1, 
indicates the presence of non-CPU agent Intel RDT CTM support", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_non_cpu_agent", + description: "If 1, indicates the presence of non-CPU agent Intel RDT MBM support", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3c_qm_conver_factor", + description: "QM_CTR MSR conversion factor to bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3c_qm_rmid_max", + description: "L3 QoS-monitoring max RMID", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cqm_occup_llc", + description: "L3 QoS occupancy monitoring supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cqm_mbm_total", + description: "L3 QoS total bandwidth monitoring supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cqm_mbm_local", + description: "L3 QoS local bandwidth monitoring supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + 
//TODO: These features may be good for increased performance. Perhaps there needs to be some mechanism to opt-in for non-host CPU profiles? + ValueDefinitions::new(&[ + ValueDefinition { + short: "cat_l3", + description: "L3 Cache Allocation Technology supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_l2", + description: "L2 Cache Allocation Technology supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mba", + description: "Memory Bandwidth Allocation supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 1) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cbm_len", + description: "L3_CAT capacity bitmask length, minus-one notation", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_units_bitmap", + description: "L3_CAT bitmap of allocation units", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + //TODO: These feature may be good for increased performance. Perhaps there needs to be some mechanism to opt-in for non-host CPU profiles? 
+ ValueDefinitions::new(&[ + ValueDefinition { + short: "l3_cat_non_cpu_agents", + description: "L3_CAT for non-CPU agent is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cdp_l3", + description: "L3/L2_CAT CDP (Code and Data Prioritization)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_sparse_1s", + description: "L3/L2_CAT non-contiguous 1s value supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + // TODO: We might need some way to opt in to use Intel cache allocation technology in guests with non-host CPU profiles. + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cos_max", + description: "Highest COS number supported for this ResID", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 2) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cbm_len", + description: "L2_CAT capacity bitmask length, minus-one notation", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_units_bitmap", + description: "L2_CAT bitmap of allocation units", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + 
register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cos_max", + description: "Highest COS number supported for this ResID", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::ECX, + }, + // TODO: We might need some way to opt in to use Intel cache allocation technology in guests with non-host CPU profiles. + ValueDefinitions::new(&[ + ValueDefinition { + short: "cdp_l2", + description: "L2_CAT CDP (Code and Data Prioritization)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_sparse_1s", + description: "L2_CAT non-contiguous 1s value supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 3) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + // TODO: We might need some way to opt in to use Intel MBA technology in guests with non-host CPU profiles. 
+ ValueDefinition { + short: "mba_max_delay", + description: "Max MBA throttling value; minus-one notation", + bits_range: (0, 11), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "per_thread_mba", + description: "Per-thread MBA controls are supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mba_delay_linear", + description: "Delay values are linear", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "mba_cos_max", + description: "MBA max Class of Service supported", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 5) + // =================================================================================================================== + // + // TODO: We may want to have some way to opt-in to use Intel RDT for guests with non-host CPU profiles. + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "core_max_throttle", + description: "Max Core throttling level supported by the corresponding ResID", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "core_scope", + description: "If 1, indicates the logical processor scope of the IA32_QoS_Core_BW_Thrtl_n MSRs. 
Other values are reserved", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cba_delay_linear", + description: "The response of the bandwidth control is approximately linear", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "core_cos_max", + description: "Core max Class of Service supported", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // SGX is already disabled and deprecated so we don't need to worry about leaf 0x12 and its subleaves + + // =================================================================================================================== + // Intel Processor Trace Enumeration Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pt_max_subleaf", + description: "Maximum leaf 0x14 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cr3_filtering", + description: "IA32_RTIT_CR3_MATCH is accessible", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_cyc", + description: "Configurable PSB and cycle-accurate mode", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ip_filtering", + description: "IP/TraceStop filtering; Warm-reset PT MSRs preservation", + 
bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mtc_timing", + description: "MTC timing packet; COFI-based packets suppression", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ptwrite", + description: "PTWRITE support", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "power_event_trace", + description: "Power Event Trace support", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_pmi_preserve", + description: "PSB and PMI preservation support", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "event_trace", + description: "Event Trace packet generation through IA32_RTIT_CTL.EventEn", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tnt_disable", + description: "TNT packet generation disable through IA32_RTIT_CTL.DisTNT", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "topa_output", + description: "ToPA output scheme support", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "topa_multiple_entries", + description: "ToPA tables can hold multiple entries", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "single_range_output", + description: "Single-range output scheme supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "trance_transport_output", + description: "Trace Transport subsystem output support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ip_payloads_lip", + description: "IP payloads have LIP values (CS base included)", + 
bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Processor Trace Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "num_address_ranges", + description: "Filtering number of configurable Address Ranges", + bits_range: (0, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mtc_periods_bmp", + description: "Bitmap of supported MTC period encodings", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cycle_thresholds_bmp", + description: "Bitmap of supported Cycle Threshold encodings", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_periods_bmp", + description: "Bitmap of supported Configurable PSB frequency encodings", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Time Stamp Counter and Core Crystal Clock Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tsc_denominator", + description: "Denominator of the TSC/'core crystal clock' ratio", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + 
Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tsc_numerator", + description: "Numerator of the TSC/'core crystal clock' ratio", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_crystal_hz", + description: "Core crystal clock nominal frequency, in Hz", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor Frequency Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_base_mhz", + description: "Processor base frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_max_mhz", + description: "Processor max frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "bus_mhz", + description: "Bus reference frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // System-On-Chip Vendor Attribute Enumeration Main Leaf + // 
=================================================================================================================== + + // System-On-Chip should probably not be supported for CPU profiles for the foreseeable feature. + ( + Parameters { + leaf: 0x17, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "soc_max_subleaf", + description: "Maximum leaf 0x17 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Deterministic Address Translation Parameters + // =================================================================================================================== + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tlb_max_subleaf", + description: "Maximum leaf 0x18 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tlb_4k_page", + description: "TLB 4KB-page entries supported", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_2m_page", + description: "TLB 2MB-page entries supported", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_4m_page", + description: "TLB 4MB-page entries supported", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_1g_page", + description: "TLB 1GB-page entries supported", + bits_range: (3, 3), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "hard_partitioning", + description: "(Hard/Soft) partitioning between logical CPUs sharing this structure", 
+ bits_range: (8, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "n_way_associative", + description: "Ways of associativity", + bits_range: (16, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "n_sets", + description: "Number of sets", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tlb_type", + description: "Translation cache type (TLB type)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_cache_level", + description: "Translation cache level (1-based)", + bits_range: (5, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "is_fully_associative", + description: "Fully-associative structure", + bits_range: (8, 8), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_max_addressable_ids", + description: "Max number of addressable IDs for logical CPUs sharing this TLB - 1", + bits_range: (14, 25), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // We don't support Key Locker for now (leaf 0x19), hence we zero out leaf 0x19 for CPU profiles. We zero leaf + // 0x1A (Native Model ID Enumeration) out for CPU profiles. Leaf 0x1B (PCONFIG) is zeroed out for CPU profiles + // for now. + + // =================================================================================================================== + // Last Branch Records Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, +
ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_depth_8", + description: "Max stack depth (number of LBR entries) = 8", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_16", + description: "Max stack depth (number of LBR entries) = 16", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_24", + description: "Max stack depth (number of LBR entries) = 24", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_32", + description: "Max stack depth (number of LBR entries) = 32", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_40", + description: "Max stack depth (number of LBR entries) = 40", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_48", + description: "Max stack depth (number of LBR entries) = 48", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_56", + description: "Max stack depth (number of LBR entries) = 56", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_64", + description: "Max stack depth (number of LBR entries) = 64", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_deep_c_reset", + description: "LBRs maybe cleared on MWAIT C-state > C1", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_ip_is_lip", + description: "LBR IP contain Last IP, otherwise effective IP", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_cpl", + description: "CPL filtering (non-zero IA32_LBR_CTL[2:1]) supported", + 
bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_branch_filter", + description: "Branch filtering (non-zero IA32_LBR_CTL[22:16]) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_call_stack", + description: "Call-stack mode (IA32_LBR_CTL[3] = 1) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_mispredict", + description: "Branch misprediction bit supported (IA32_LBR_x_INFO[63])", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_timed_lbr", + description: "Timed LBRs (CPU cycles since last LBR entry) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_branch_type", + description: "Branch type field (IA32_LBR_INFO_x[59:56]) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_events_gpc_bmp", + description: "LBR PMU-events logging support; bitmap for first 4 GP (general-purpose) Counters", + bits_range: (16, 19), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Tile Information Main Leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. 
+ ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "amx_max_palette", + description: "Highest palette ID / subleaf ID", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Tile Palette 1 Sub-leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_palette_size", + description: "AMX palette total tiles size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tile_size", + description: "AMX single tile's size, in bytes", + bits_range: (16, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_tile_row_size", + description: "AMX tile single row's size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_palette_nr_tiles", + description: "AMX palette number of tiles", + bits_range: (16, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "amx_tile_nr_rows", + description: "AMX tile max number of rows", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }]), + 
), + // =================================================================================================================== + // TMUL Information Main Leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tmul_info_max", + description: "Reports the maximum number of sub-leaves that are supported in leaf 0x1e", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tmul_maxk", + description: "TMUL unit maximum height, K (rows or columns)", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tmul_maxn", + description: "TMUL unit maximum SIMD dimension, N (column bytes)", + bits_range: (8, 23), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // TMUL Information Sub-leaf 1 + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. 
We still inherit this value for profiles as the relevant feature bits that userspace applications must check will be zeroed out if the user has not opted in for "amx" via CpuFeatures. + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_int8", + description: "If 1, the processor supports tile computational operations on 8-bit integers", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_bf16", + description: "If 1, the processor supports tile computational operations on bfloat16 numbers", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_complex", + description: "If 1, the processor supports the AMX-COMPLEX instructions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_fp16", + description: "If 1, the processor supports tile computational operations on FP16 numbers", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_fp8", + description: "If 1, the processor supports tile computational operations on FP8 numbers", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_transpose", + description: "If 1, the processor supports the AMX-TRANSPOSE instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tf32", + description: "If 1, the processor supports the AMX-TF32 (FP19) instructions", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_avx512", + description: "If 1, the processor supports the AMX-AVX512 instructions", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_movrs", + description: "If 1, the processor supports the AMX-MOVRS instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // 
=================================================================================================================== + // V2 Extended Topology Enumeration + // =================================================================================================================== + + // The values in leaf 0x1f must be set by CHV itself. + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id_shift", + description: "Bit width of this level (previous levels inclusive)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "domain_lcpus_count", + description: "Logical CPUs count across all instances of this domain", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "domain_level", + description: "This domain level (subleaf ID)", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "domain_type", + description: "This domain type", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id", + description: "x2APIC ID of current logical CPU", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor History Reset + // 
=================================================================================================================== + ( + Parameters { + leaf: 0x20, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hreset_nr_subleaves", + description: "CPUID 0x20 max subleaf + 1", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x20, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hreset_thread_director", + description: "HRESET of Intel thread director is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // TDX + // =================================================================================================================== + + // TDX is not supported by CPU profiles for now. We just zero out this leaf for CPU profiles for the time being. 
+ ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_0", + description: "TDX vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_2", + description: "CPU vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_1", + description: "CPU vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "subleaf_0", + description: "If 1, subleaf 0 exists", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_1", + description: "If 1, subleaf 1 exists", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_2", + description: "If 1, subleaf 2 exists", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_3", + description: "If 1, subleaf 3 exists", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_4", + description: "If 1, subleaf 4 exists", + 
bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_5", + description: "If 1, subleaf 5 exists. The processor supports Architectural PEBS. The IA32_PEBS_BASE and IA32_PEBS_INDEX MSRs exist", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "unitmask2", + description: "IA32_PERFEVTSELx MSRs UnitMask2 is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "eq_bit", + description: "equal flag in the IA32_PERFEVTSELx MSR is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "RDPMC_USR_DISABLE", + description: "RDPMC_USR_DISABLE", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "num_slots_per_cycle", + description: "Number of slots per cycle. 
This number can be multiplied by the number of cycles (from CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.CORE or IA32_FIXED_CTR1) to determine the total number of slots", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_gp_counters_bitmap", + description: "General-purpose PMU counters bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_f_counters_bitmap", + description: "Fixed PMU counters bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 2 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_acr_bitmap", + description: "Bitmap of Auto Counter Reload (ACR) general-purpose counters that can be reloaded", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 3 + // 
=================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "core_cycles_evt", + description: "Core cycles event supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "insn_retired_evt", + description: "Instructions retired event supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ref_cycles_evt", + description: "Reference cycles event supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "llc_refs_evt", + description: "Last-level cache references event supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "llc_misses_evt", + description: "Last-level cache misses event supported", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "br_insn_ret_evt", + description: "Branch instruction retired event supported", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "br_mispr_evt", + description: "Branch mispredict retired event supported", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_slots_evt", + description: "Topdown slots event supported", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_backend_bound_evt", + description: "Topdown backend bound event supported", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_bad_spec_evt", + description: "Topdown bad speculation event supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_frontend_bound_evt", + description: "Topdown frontend bound 
event supported", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_retiring_evt", + description: "Topdown retiring event support", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_inserts", + description: "LBR support", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 4 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(4, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "allow_in_record", + description: "If 1, indicates that the ALLOW_IN_RECORD bit is available in the IA32_PMC_GPn_CFG_C and IA32_PMC_FXm_CFG_C MSRs", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cntr", + description: "Counters group sub-groups general-purpose counters, fixed-function counters, and performance metrics are available", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr", + description: "LBR group and both bits [41:40] are available", + bits_range: (8, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xer", + description: "These bits correspond to XER group bits [55:49]", + bits_range: (17, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "grp", + description: "If 1, the GRP group is available", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aux", + description: "If 1, the AUX group is available", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: 
RangeInclusive::new(4, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "allow_in_record", + description: "If 1, indicates that the ALLOW_IN_RECORD bit is available in the IA32_PMC_GPn_CFG_C and IA32_PMC_FXm_CFG_C MSRs", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cntr", + description: "Counters group sub-groups general-purpose counters, fixed-function counters, and performance metrics are available", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr", + description: "LBR group and both bits [41:40] are available", + bits_range: (8, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xer", + description: "These bits correspond to XER group bits [55:49]", + bits_range: (17, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "grp", + description: "If 1, the GRP group is available", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aux", + description: "If 1, the AUX group is available", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 5 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "architectural_pebs_counters", + description: "General-purpose counters support Architectural PEBS. 
Bit vector of general-purpose counters for which the Architectural PEBS mechanism is available", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_pdist_counters", + description: "General-purpose counters for which PEBS support PDIST", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_fixed_function_counters", + description: "Fixed-function counters support Architectural PEBS. Bit vector of fixed-function counters for which the Architectural PEBS mechanism is available. If ECX[x] == 1, then the IA32_PMC_FXm_CFG_C MSR is available, and PEBS is supported", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_fixed_function_pdist_counters", + description: "Fixed-function counters for which PEBS supports PDIST", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Converged Vector ISA Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x24, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "converged_vector_isa_max_sub_leaves", + description: "Reports the maximum number of sub-leaves that are supported in leaf 0x24", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 
0x24, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "avx_10_version", + description: "Reports the intel AVX10 Converged Vector ISA version", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_10_lengths", + description: "Reserved at 111", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // Hypervisor reserved CPUID leaves are set elsewhere + + // =================================================================================================================== + // Extended Function CPUID Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_ext_leaf", + description: "Maximum extended CPUID leaf supported", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_0", + description: "Vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_2", + description: "Vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_1", + description: "Vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // 
0x80000001.EAX and EBX are both Reserved on Intel hence we just zero them out + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lahf_lm", + description: "LAHF and SAHF in 64-bit mode", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lzcnt", + description: "LZCNT advanced bit manipulation", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "prefetchw", + description: "3DNow PREFETCH/PREFETCHW support", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "syscall", + description: "SYSCALL and SYSRET instructions", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "nx", + description: "Execute Disable Bit available", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pdpe1gb", + description: "1-GB large page support", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "rdtscp", + description: "RDTSCP instruction and IA32_TSC_AUX are available", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lm", + description: "Long mode (x86-64, 64-bit support)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // The profile generation tool will actually modify the brand id string before + // acting on the policy set here. 
+ ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_0", + description: "CPU brand ID string, bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_1", + description: "CPU brand ID string, bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_2", + description: "CPU brand ID string, bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_3", + description: "CPU brand ID string, bytes 12 - 15", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_4", + description: "CPU brand ID string bytes, 16 - 19", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_5", + description: "CPU brand ID string bytes, 20 - 23", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_6", + 
description: "CPU brand ID string bytes, 24 - 27", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_7", + description: "CPU brand ID string bytes, 28 - 31", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_8", + description: "CPU brand ID string, bytes 32 - 35", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_9", + description: "CPU brand ID string, bytes 36 - 39", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_10", + description: "CPU brand ID string, bytes 40 - 43", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_11", + description: "CPU brand ID string, bytes 44 - 47", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000006, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "l2_line_size", + description: "L2 cache line size, in bytes", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_nlines", + description: "L2 cache number 
of lines per tag", + bits_range: (8, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_assoc", + description: "L2 cache associativity", + bits_range: (12, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_size_kb", + description: "L2 cache size, in KB", + bits_range: (16, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // EAX, EBX and ECX of 0x8000_0007 are all reserved (=0) on Intel + ( + Parameters { + leaf: 0x80000007, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // TODO: We may want some mechanism to let users opt-in to using an invariant TSC provided by the hardware (when available). + // TODO: Probably unconditionally set by CHV + ValueDefinition { + short: "constant_tsc", + description: "TSC ticks at constant rate across all P and C states", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000008, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "phys_addr_bits", + description: "Max physical address bits", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "virt_addr_bits", + description: "Max virtual address bits", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "guest_phys_addr_bits", + description: "Max nested-paging guest physical address bits", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000008, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "wbnoinvd", + description: "WBNOINVD supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }]), + ), + ]) +}; diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs index 
61700298bf..dad213cc0f 100644 --- a/arch/src/x86_64/cpuid_definitions/mod.rs +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -10,6 +10,8 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::x86_64::CpuidReg; +pub mod intel; + pub(in crate::x86_64) fn serialize_as_hex( input: &u32, serializer: S, From fe5512e707f2e1b9bdfba92901d6cf863c592773 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 10 Dec 2025 11:13:01 +0100 Subject: [PATCH 125/178] arch: KVM CPUID definitions We introduce CPUID definitions defined for the KVM hypervisor. These definitions will later be utilized by the upcoming CPU profile generation tool. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/kvm.rs | 209 +++++++++++++++++++++++ arch/src/x86_64/cpuid_definitions/mod.rs | 2 + 2 files changed, 211 insertions(+) create mode 100644 arch/src/x86_64/cpuid_definitions/kvm.rs diff --git a/arch/src/x86_64/cpuid_definitions/kvm.rs b/arch/src/x86_64/cpuid_definitions/kvm.rs new file mode 100644 index 0000000000..3282e04222 --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/kvm.rs @@ -0,0 +1,209 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module contains CPUID definitions for the KVM hypervisor. + +use std::ops::RangeInclusive; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::{ + CpuidDefinitions, Parameters, ProfilePolicy, ValueDefinition, ValueDefinitions, +}; + +/// CPUID features defined for the KVM hypervisor. 
+/// +/// See https://www.kernel.org/doc/html/latest/virt/kvm/x86/cpuid.html +pub const KVM_CPUID_DEFINITIONS: CpuidDefinitions<6> = const { + CpuidDefinitions([ + //===================================================================== + // KVM CPUID Signature + // =================================================================== + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_hypervisor_leaf", + description: "The maximum valid leaf between 0x4000_0000 and 0x4FFF_FFF", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_ebx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_ecx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_edx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + //===================================================================== + // KVM CPUID Features + // =================================================================== + ( + Parameters { + leaf: 0x4000_0001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "kvm_feature_clocksource", + description: "kvmclock available 
at MSRs 0x11 and 0x12", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_nop_io_delay", + description: "Not necessary to perform delays on PIO operations", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_mmu_op", + description: "Deprecated", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_clocksource2", + description: "kvmclock available at MSRs 0x4b564d00 and 0x4b564d01", + bits_range: (3, 3), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf", + description: "async pf can be enabled by writing to MSR 0x4b564d02", + bits_range: (4, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_steal_time", + description: "steal time can be enabled by writing to msr 0x4b564d03", + bits_range: (5, 5), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_eoi", + description: "paravirtualized end of interrupt handler can be enabled by writing to msr 0x4b564d04", + bits_range: (6, 6), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_unhalt", + description: "guest checks this feature bit before enabling paravirtualized spinlock support", + bits_range: (7, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_tlb_flush", + description: "guest checks this feature bit before enabling paravirtualized tlb flush", + bits_range: (9, 9), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf_vmexit", + description: "paravirtualized async PF VM EXIT can be enabled by setting bit 2 when writing to msr 0x4b564d02", + bits_range: (10, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_send_ipi", + description: "guest checks this feature bit before 
enabling paravirtualized send IPIs", + bits_range: (11, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_poll_control", + description: "host-side polling on HLT can be disabled by writing to msr 0x4b564d05.", + bits_range: (12, 12), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_sched_yield", + description: "guest checks this feature bit before using paravirtualized sched yield.", + bits_range: (13, 13), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf_int", + description: "guest checks this feature bit before using the second async pf control msr 0x4b564d06 and async pf acknowledgment msr 0x4b564d07.", + bits_range: (14, 14), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_msi_ext_dest_id", + description: "guest checks this feature bit before using extended destination ID bits in MSI address bits 11-5.", + bits_range: (15, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_hc_map_gpa_range", + description: "guest checks this feature bit before using the map gpa range hypercall to notify the page state change", + bits_range: (16, 16), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_migration_control", + description: "guest checks this feature bit before using MSR_KVM_MIGRATION_CONTROL", + bits_range: (17, 17), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_clocksource_stable_bit", + description: "host will warn if no guest-side per-cpu warps are expected in kvmclock", + bits_range: (24, 24), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4000_0001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "kvm_hints_realtime", + description: "guest checks this feature bit to determine that 
vCPUs are never preempted for an unlimited time allowing optimizations", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }]), + ), + ]) +}; diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs index dad213cc0f..9a1246ce6b 100644 --- a/arch/src/x86_64/cpuid_definitions/mod.rs +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -11,6 +11,8 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::x86_64::CpuidReg; pub mod intel; +#[cfg(feature = "kvm")] +pub mod kvm; pub(in crate::x86_64) fn serialize_as_hex( input: &u32, From 329d56ce9445fde7dd619837b127ff48b8715127 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 10 Dec 2025 12:42:00 +0100 Subject: [PATCH 126/178] arch: Improve CPUID incompatibility logging We use the Intel CPUID definitions to provide more information when CPUID compatibility checks fail (when both the source and destination VM run on Intel CPUs). Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/mod.rs | 62 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index f81aaeedec..0152db35a1 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -537,8 +537,62 @@ impl CpuidFeatureEntry { let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list); let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list); - // Loop on feature bit and check if the 'source vm' feature is a subset - // of those of the 'destination vm' feature + // If both processors are Intel then we can use the existing Intel CPUID definitions to log more + // precise information about potential errors + let both_intel = { + // Check if the vendor string is "GenuineIntel". This assumes that `leaf_0` is the entry + // corresponding to CPUID leaf 0. 
+ let is_intel = |leaf_0: &CpuIdEntry| { + leaf_0.ebx == 0x756e_6547 && leaf_0.ecx == 0x6c65_746e && leaf_0.edx == 0x4965_6e69 + }; + let src_0 = src_vm_cpuid + .iter() + .find(|entry| (entry.function == 0x0) & (entry.index == 0x0)); + let dest_0 = dest_vm_cpuid + .iter() + .find(|entry| (entry.function == 0x0) & (entry.index == 0x0)); + src_0 + .zip(dest_0) + .is_some_and(|(src, dest)| is_intel(src) & is_intel(dest)) + }; + let extra_reporting = |entry: &CpuidFeatureEntry, src_reg: u32, dest_reg: u32| { + if let Some((_, defs)) = cpuid_definitions::intel::INTEL_CPUID_DEFINITIONS + .as_slice() + .iter() + .find(|(param, _)| { + (param.leaf == entry.function) + && (param.sub_leaf.contains(&entry.index) + && (param.register == entry.feature_reg)) + }) + { + for def in defs.as_slice() { + let mask = (def.bits_range.0..=def.bits_range.1) + .fold(0, |acc, next| acc | (1 << next)); + + let src_val = src_reg & mask; + let dest_val = dest_reg & mask; + + let is_compatible = match entry.compatible_check { + CpuidCompatibleCheck::BitwiseSubset => (src_val & (!dest_val)) == 0, + CpuidCompatibleCheck::NumNotGreater => src_val <= dest_val, + CpuidCompatibleCheck::Equal => src_val == dest_val, + }; + if !is_compatible { + info!( + "CPUID incompatibility for value definition='{:?}' detected in leaf={:#02x}, sub-leaf={:#02x}, register={:?}, compatibility_check={:?}, source VM value='{:#04x}' destination VM value='{:#04x}'", + def, + entry.function, + entry.index, + entry.feature_reg, + entry.compatible_check, + src_val, + dest_val + ); + } + } + } + }; + let mut compatible = true; for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features .iter() @@ -566,7 +620,9 @@ impl CpuidFeatureEntry { src_vm_feature, dest_vm_feature ); - + if both_intel { + extra_reporting(entry, *src_vm_feature, *dest_vm_feature); + } compatible = false; } } From 3c6e4c9892de527a72fc5ffee8459760c853f76f Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 10 Dec 2025 16:15:45 +0100 Subject: [PATCH 
127/178] arch: CPU profile generation CLI This commit introduces a CLI for generating a CPU profile closely matching the CPU of the machine the CLI is executed on. The idea is to have a simple way to add more CPU profiles corresponding to physical CPUs. Note however that with the current setup one still needs a little bit of manual work to integrate the generated CPU profile data into cloud hypervisor itself. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- Cargo.lock | 1 + arch/Cargo.toml | 9 + arch/src/bin/generate-cpu-profile.rs | 38 +++ arch/src/x86_64/cpu_profile_generation.rs | 278 ++++++++++++++++++++++ arch/src/x86_64/mod.rs | 2 + 5 files changed, 328 insertions(+) create mode 100644 arch/src/bin/generate-cpu-profile.rs create mode 100644 arch/src/x86_64/cpu_profile_generation.rs diff --git a/Cargo.lock b/Cargo.lock index 3fd222f5d5..759e0045ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -114,6 +114,7 @@ version = "0.1.0" dependencies = [ "anyhow", "byteorder", + "clap", "fdt", "hypervisor", "libc", diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 1a4d267966..8e1f5c0a2b 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -5,16 +5,25 @@ name = "arch" rust-version.workspace = true version = "0.1.0" +# TODO: Consider making this a binary of the main package instead +[[bin]] +name = "generate-cpu-profile" +path = "src/bin/generate-cpu-profile.rs" +required-features = ["cpu_profile_generation"] + [features] default = [] fw_cfg = [] kvm = ["hypervisor/kvm"] sev_snp = [] tdx = [] +# Currently cpu profiles can only be generated with KVM +cpu_profile_generation = ["dep:clap", "kvm"] [dependencies] anyhow = { workspace = true } byteorder = { workspace = true } +clap = { workspace = true, optional = true } hypervisor = { path = "../hypervisor" } libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } diff --git a/arch/src/bin/generate-cpu-profile.rs b/arch/src/bin/generate-cpu-profile.rs 
new file mode 100644 index 0000000000..ee367906f6 --- /dev/null +++ b/arch/src/bin/generate-cpu-profile.rs @@ -0,0 +1,38 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +#![cfg(all( + target_arch = "x86_64", + feature = "cpu_profile_generation", + feature = "kvm" +))] +use std::io::BufWriter; + +use anyhow::Context; +use clap::{Arg, Command}; + +fn main() -> anyhow::Result<()> { + let cmd_arg = Command::new("generate-cpu-profile") + .version(env!("CARGO_PKG_VERSION")) + .arg_required_else_help(true) + .arg( + Arg::new("name") + .help("The name to give the CPU profile") + .num_args(1) + .required(true), + ) + .get_matches(); + + let profile_name = cmd_arg.get_one::("name").unwrap(); + + let hypervisor = hypervisor::new().context("Could not obtain hypervisor")?; + // TODO: Consider letting the user provide a file path as a target instead of writing to stdout. + // The way it is now should be sufficient for a PoC however. + let writer = BufWriter::new(std::io::stdout().lock()); + arch::x86_64::cpu_profile_generation::generate_profile_data( + writer, + hypervisor.as_ref(), + profile_name, + ) +} diff --git a/arch/src/x86_64/cpu_profile_generation.rs b/arch/src/x86_64/cpu_profile_generation.rs new file mode 100644 index 0000000000..8a74a2bca7 --- /dev/null +++ b/arch/src/x86_64/cpu_profile_generation.rs @@ -0,0 +1,278 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::io::Write; +use std::ops::RangeInclusive; + +use anyhow::{Context, anyhow}; +use hypervisor::arch::x86::CpuIdEntry; +use hypervisor::{CpuVendor, Hypervisor, HypervisorError, HypervisorType}; + +use crate::x86_64::cpu_profile::CpuProfileData; +#[cfg(feature = "kvm")] +use crate::x86_64::cpuid_definitions::CpuidDefinitions; +use crate::x86_64::cpuid_definitions::intel::INTEL_CPUID_DEFINITIONS; +use crate::x86_64::cpuid_definitions::kvm::KVM_CPUID_DEFINITIONS; +use 
crate::x86_64::cpuid_definitions::{Parameters, ProfilePolicy}; +use crate::x86_64::{CpuidOutputRegisterAdjustments, CpuidReg}; + +/// Generate CPU profile data and convert it to a string, embeddable as Rust code, which is +/// written to the given `writer` (e.g. a File). +// +// NOTE: The MVP only works with KVM as the hypervisor and Intel CPUs. +#[cfg(feature = "kvm")] +pub fn generate_profile_data( + mut writer: impl Write, + hypervisor: &dyn Hypervisor, + profile_name: &str, +) -> anyhow::Result<()> { + let cpu_vendor = hypervisor.get_cpu_vendor(); + if cpu_vendor != CpuVendor::Intel { + unimplemented!("CPU profiles can only be generated for Intel CPUs at this point in time"); + } + + let hypervisor_type = hypervisor.hypervisor_type(); + // This is just a reality check. + if hypervisor_type != HypervisorType::Kvm { + unimplemented!( + "CPU profiles can only be generated when using KVM as the hypervisor at this point in time" + ); + } + + let brand_string_bytes = cpu_brand_string_bytes(cpu_vendor, profile_name)?; + let cpuid = supported_cpuid(hypervisor)?; + let cpuid = overwrite_brand_string(cpuid, brand_string_bytes); + let supported_cpuid_sorted = sort_entries(cpuid); + + generate_cpu_profile_data_with( + hypervisor_type, + cpu_vendor, + &supported_cpuid_sorted, + &INTEL_CPUID_DEFINITIONS, + &KVM_CPUID_DEFINITIONS, + &mut writer, + ) +} + +/// Prepare the bytes which the brand string should consist of +fn cpu_brand_string_bytes(cpu_vendor: CpuVendor, profile_name: &str) -> anyhow::Result<[u8; 48]> { + let cpu_vendor_str: String = serde_json::to_string(&cpu_vendor) + .expect("Should be possible to serialize CPU vendor to a string"); + let cpu_vendor_str = cpu_vendor_str.trim_start_matches('"').trim_end_matches('"'); + let mut brand_string_bytes = [0_u8; 4 * 3 * 4]; + if cpu_vendor_str.len() + 1 + profile_name.len() > brand_string_bytes.len() { + return Err(anyhow!( + "The profile name is too long. 
Try using a shorter name" + )); + } + for (b, brand_byte) in cpu_vendor_str + .as_bytes() + .iter() + .chain(std::iter::once(&b' ')) + .chain(profile_name.as_bytes()) + .zip(brand_string_bytes.iter_mut()) + { + *brand_byte = *b; + } + Ok(brand_string_bytes) +} +/// Computes [`CpuProfileData`] based on the given sorted vector of CPUID entries, hypervisor type, cpu_vendor +/// and cpuid_definitions. +/// +/// The computed [`CpuProfileData`] is then converted to a string representation, embeddable as Rust code, which is +/// then written by the given `writer`. +/// +// TODO: Consider making a snapshot test or two for this function. +fn generate_cpu_profile_data_with( + hypervisor_type: HypervisorType, + cpu_vendor: CpuVendor, + supported_cpuid_sorted: &[CpuIdEntry], + processor_cpuid_definitions: &CpuidDefinitions, + hypervisor_cpuid_definitions: &CpuidDefinitions, + mut writer: &mut impl Write, +) -> anyhow::Result<()> { + let mut adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)> = Vec::new(); + + for (parameter, values) in processor_cpuid_definitions + .as_slice() + .iter() + .chain(hypervisor_cpuid_definitions.as_slice().iter()) + { + for (sub_leaf_range, maybe_matching_register_output_value) in + extract_parameter_matches(parameter, supported_cpuid_sorted) + { + // If the compatibility target (current host) has multiple sub-leaves matching the parameter's range + // then we want to specialize: + let mut mask: u32 = 0; + let mut replacements: u32 = 0; + for value in values.as_slice() { + // Reality check on the bit range listed in `value` + { + assert!(value.bits_range.0 <= value.bits_range.1); + assert!(value.bits_range.1 < 32); + } + + match value.policy { + ProfilePolicy::Passthrough => { + // The profile should take whatever we get from the host, hence there is no adjustment, but our + // mask needs to retain all bits in the range of bits corresponding to this value + let (first_bit_pos, last_bit_pos) = value.bits_range; + mask |= 
bit_range_mask(first_bit_pos, last_bit_pos); + } + ProfilePolicy::Static(overwrite_value) => { + replacements |= overwrite_value << value.bits_range.0; + } + ProfilePolicy::Inherit => { + // The value is supposed to be obtained from the compatibility target if it exists + let (first_bit_pos, last_bit_pos) = value.bits_range; + if let Some(matching_register_value) = maybe_matching_register_output_value + { + let extraction_mask = bit_range_mask(first_bit_pos, last_bit_pos); + let value = matching_register_value & extraction_mask; + replacements |= value; + } + } + } + } + adjustments.push(( + Parameters { + leaf: parameter.leaf, + sub_leaf: sub_leaf_range, + register: parameter.register, + }, + CpuidOutputRegisterAdjustments { mask, replacements }, + )); + } + } + + let profile_data = CpuProfileData { + hypervisor: hypervisor_type, + cpu_vendor, + adjustments, + }; + + serde_json::to_writer_pretty(&mut writer, &profile_data) + .context("failed to serialize the generated profile data to the given writer")?; + writer + .flush() + .context("CPU profile generation failed: Unable to flush cpu profile data") +} + +/// Get as many of the supported CPUID entries from the hypervisor as possible. +fn supported_cpuid(hypervisor: &dyn Hypervisor) -> anyhow::Result> { + // Check for AMX compatibility. If this is supported we need to call arch_prctl before requesting the supported + // CPUID entries from the hypervisor. We simply call the enable_amx_state_components method on the hypervisor and + // ignore any AMX not supported error to achieve this. + match hypervisor.enable_amx_state_components() { + Ok(()) => {} + Err(HypervisorError::CouldNotEnableAmxStateComponents(amx_err)) => { + if matches!( + amx_err, + hypervisor::arch::x86::AmxGuestSupportError::AmxGuestTileRequest { .. 
} + ) { + return Err(amx_err).context("Unable to enable AMX state tiles for guests"); + } + } + Err(_) => unreachable!("Unexpected error when checking AMX support"), + } + + hypervisor + .get_supported_cpuid() + .context("CPU profile data generation failed") +} + +/// Overwrite the Processor brand string with the given `brand_string_bytes` +fn overwrite_brand_string( + mut cpuid: Vec, + brand_string_bytes: [u8; 48], +) -> Vec { + let mut iter = brand_string_bytes + .as_chunks::<4>() + .0 + .iter() + .map(|c| u32::from_le_bytes(*c)); + let mut overwrite = |leaf: u32| CpuIdEntry { + function: leaf, + index: 0, + flags: 0, + eax: iter.next().unwrap_or(0), + ebx: iter.next().unwrap_or(0), + ecx: iter.next().unwrap_or(0), + edx: iter.next().unwrap_or(0), + }; + for leaf in [0x80000002, 0x80000003, 0x80000004] { + if let Some(entry) = cpuid + .iter_mut() + .find(|entry| (entry.function == leaf) && (entry.index == 0)) + { + *entry = overwrite(leaf); + } else { + cpuid.push(overwrite(leaf)); + } + } + cpuid +} + +/// Sort the CPUID entries by function and index +fn sort_entries(mut cpuid: Vec) -> Vec { + cpuid.sort_unstable_by(|entry, other_entry| { + let fn_cmp = entry.function.cmp(&other_entry.function); + if fn_cmp == core::cmp::Ordering::Equal { + entry.index.cmp(&other_entry.index) + } else { + fn_cmp + } + }); + cpuid +} +/// Returns a `u32` where each bit between `first_bit_pos` and `last_bit_pos` is set (including both ends) and all other bits are 0. +fn bit_range_mask(first_bit_pos: u8, last_bit_pos: u8) -> u32 { + (first_bit_pos..=last_bit_pos).fold(0, |acc, next| acc | (1 << next)) +} + +/// Returns a vector of exact parameter matches ((sub_leaf ..= sub_leaf), register_value) interleaved by +/// the sub_leaf ranges specified by `param` that did not match any cpuid entry. 
+fn extract_parameter_matches( + param: &Parameters, + supported_cpuid_sorted: &[CpuIdEntry], +) -> Vec<(RangeInclusive, Option)> { + let register_value = |entry: &CpuIdEntry| -> u32 { + match param.register { + CpuidReg::EAX => entry.eax, + CpuidReg::EBX => entry.ebx, + CpuidReg::ECX => entry.ecx, + CpuidReg::EDX => entry.edx, + } + }; + let mut out = Vec::new(); + let param_range = param.sub_leaf.clone(); + let mut range_for_consideration = param_range.clone(); + let range_end = *range_for_consideration.end(); + for sub_leaf_entry in supported_cpuid_sorted + .iter() + .filter(|entry| entry.function == param.leaf && param_range.contains(&entry.index)) + { + let matching_subleaf = sub_leaf_entry.index; + + // If we are in the middle of the range, it means there is no entry matching the first few sub-leaves within the range + let current_range_start = *range_for_consideration.start(); + if current_range_start < matching_subleaf { + let range_not_matching = RangeInclusive::new(current_range_start, matching_subleaf - 1); + out.push((range_not_matching, None)); + } + + out.push(( + RangeInclusive::new(matching_subleaf, matching_subleaf), + Some(register_value(sub_leaf_entry)), + )); + if matching_subleaf == range_end { + return out; + } + // Update range_for_consideration: Note that we must have index + 1 <= range_end + range_for_consideration = RangeInclusive::new(matching_subleaf + 1, range_end); + } + // We did not find the last entry within the range hence we push the final range for consideration together with no matching register value + out.push((range_for_consideration, None)); + out +} diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 0152db35a1..709ee7e57c 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. 
pub mod cpu_profile; +#[cfg(feature = "cpu_profile_generation")] +pub mod cpu_profile_generation; pub mod cpuid_definitions; pub mod interrupts; pub mod layout; From 92a944224260833c160817c098f87a4977bf8cb2 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Mon, 9 Feb 2026 18:01:15 +0100 Subject: [PATCH 128/178] arch: Change numeric serialization for CPU profiles From working with serialized CPU profile data we noticed the following: 1. It is easier to read shorter hex strings when looking up CPUID leaves in the serialized CPU profile data. 2. It is preferable to also have sub-leaf ranges hex encoded. 3. We still want to keep serializing the adjustments to 10 character hex strings. Some of the convenience functions introduced here will also be utilized in the upcoming MSR adjustments PR in order to serialize register addresses. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/lib.rs | 40 ++++++- arch/src/x86_64/cpu_profile.rs | 82 +++++++++++++- arch/src/x86_64/cpuid_definitions/mod.rs | 137 +++++++++-------------- 3 files changed, 169 insertions(+), 90 deletions(-) diff --git a/arch/src/lib.rs b/arch/src/lib.rs index 362ae79e45..28c095fff6 100644 --- a/arch/src/lib.rs +++ b/arch/src/lib.rs @@ -9,12 +9,13 @@ //! Supported platforms: x86_64, aarch64, riscv64. use std::collections::BTreeMap; +use std::io::Write; use std::str::FromStr; use std::sync::Arc; use std::{fmt, result}; use serde::de::IntoDeserializer; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; use thiserror::Error; #[cfg(target_arch = "x86_64")] @@ -83,6 +84,43 @@ impl FromStr for CpuProfile { } } +// We introduce some utilities for serializing u32 values as hex. +// These are only necessary for (de-)serializing CPU profile data. 
+ +/// Serializes the given `input` as a hex string (starting with "0x") +fn serialize_u32_hex<S: Serializer>( + input: &u32, + serializer: S, +) -> std::result::Result<S::Ok, S::Error> { + eval_u32_hex(*input, |hex| serializer.serialize_str(hex)) +} + +/// Converts `input` into a hex string representation (starting with "0x", but the length may vary) and +/// applies the given `callback` to it. +fn eval_u32_hex<F, T>(input: u32, callback: F) -> T +where + F: FnOnce(&str) -> T, +{ + // two bytes for "0x" prefix and at most eight for the hex encoded number + let mut buffer = [0_u8; 10]; + let mut write_slice = &mut buffer[..]; + write!(write_slice, "{input:#x}").expect("This write should be infallible"); + let len = 10 - write_slice.len(); + let str = core::str::from_utf8(&buffer[..len]) + .expect("the buffer should be filled with valid UTF-8 bytes"); + callback(str) +} + +/// Deserializes a u32 from a hex string representation +fn deserialize_u32_hex<'de, D: Deserializer<'de>>( + deserializer: D, +) -> std::result::Result<u32, D::Error> { + let hex = <&'de str as Deserialize>::deserialize(deserializer)?; + u32::from_str_radix(hex.strip_prefix("0x").unwrap_or(""), 16).map_err(|_| { + <D::Error as serde::de::Error>::custom(format!("{hex} is not a hex encoded 32 bit integer")) + }) +} + /// Type for memory region types.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)] pub enum RegionType { diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs index 4b922adba7..70b60b0caa 100644 --- a/arch/src/x86_64/cpu_profile.rs +++ b/arch/src/x86_64/cpu_profile.rs @@ -3,14 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::io::Write; + use hypervisor::arch::x86::CpuIdEntry; use hypervisor::{CpuVendor, HypervisorType}; use log::error; +use serde::ser::SerializeStruct; use serde::{Deserialize, Serialize}; use thiserror::Error; +use crate::deserialize_u32_hex; use crate::x86_64::CpuidReg; -use crate::x86_64::cpuid_definitions::{Parameters, deserialize_from_hex, serialize_as_hex}; +use crate::x86_64::cpuid_definitions::Parameters; #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] #[serde(rename_all = "kebab-case")] @@ -121,16 +125,39 @@ be taken care of in a follow up MR. */ /// Used for adjusting an entire cpuid output register (EAX, EBX, ECX or EDX) -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] pub(super) struct CpuidOutputRegisterAdjustments { - #[serde(serialize_with = "serialize_as_hex")] - #[serde(deserialize_with = "deserialize_from_hex")] + #[serde(deserialize_with = "deserialize_u32_hex")] pub(in crate::x86_64) replacements: u32, /// Used to zero out the area `replacements` occupy. This mask is not necessarily !replacements, as replacements may pack values of different types (i.e. it is wrong to think of it as a bitset conceptually speaking). - #[serde(serialize_with = "serialize_as_hex")] - #[serde(deserialize_with = "deserialize_from_hex")] + #[serde(deserialize_with = "deserialize_u32_hex")] pub(in crate::x86_64) mask: u32, } + +/* +We want to serialize the values as 10 bytes, starting with 0x, +regardless of the value. This makes it easier for humans to compare different serialized values. 
+*/ +impl Serialize for CpuidOutputRegisterAdjustments { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = serializer.serialize_struct("CpuidOutputRegisterAdjustments", 2)?; + let mut serialize_field = |key, value| { + // two bytes for "0x" prefix and eight for the hex encoded number + let mut buffer = [0_u8; 10]; + write!(&mut buffer[..], "{value:#010x}").expect("This write should be infallible"); + let str = core::str::from_utf8(&buffer[..]) + .expect("the buffer should be filled with valid UTF-8 bytes"); + s.serialize_field(key, str) + }; + serialize_field("replacements", self.replacements)?; + serialize_field("mask", self.mask)?; + s.end() + } +} + impl CpuidOutputRegisterAdjustments { pub(in crate::x86_64) fn adjust(self, cpuid_output_register: &mut u32) { let temp_register_copy = *cpuid_output_register; @@ -252,3 +279,46 @@ impl CpuidOutputRegisterAdjustments { #[derive(Debug, Error)] #[error("Required CPUID entries not found")] pub struct MissingCpuidEntriesError; + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + + use super::CpuidOutputRegisterAdjustments; + + // Check that serializing and then deserializing `CpuidOutputResiterAdjustments` results in the same value we started with. + // + // Also check that the serialized numeric values satisfy our expectations: They are 10-byte hex encoded strings + proptest! { + #[test] + fn cpuid_output_register_adjustments_serialization_works(replacements in any::(), mask in any::()) { + // Randomly generate these values. Several of the generated values will not represent anything that may be + // produced in practice, but (de-)serialization does not take such domain knowledge into account (if that changes + // then this test will need to be updated). 
+ let adjustments = CpuidOutputRegisterAdjustments { + replacements, + mask + }; + let serialized = serde_json::to_string(&adjustments).unwrap(); + let deserialized: CpuidOutputRegisterAdjustments = serde_json::from_str(&serialized).unwrap(); + prop_assert_eq!(&deserialized, &adjustments); + let json = serde_json::to_value(adjustments).unwrap(); + let replacements_str = json.get("replacements").unwrap().as_str().unwrap(); + let mask_str = json.get("mask").unwrap().as_str().unwrap(); + let check_str_invariants = |value: &str| { + prop_assert!(value.starts_with("0x")); + prop_assert_eq!(value.len(),10); + prop_assert!(value.as_bytes().iter().all(|byte| byte.is_ascii())); + let is_hex_digit = |byte: &u8| -> bool { + byte.is_ascii_digit() | (*byte == b'a') | (*byte == b'b') | (*byte == b'c') | (*byte == b'd') | (*byte == b'e') | (*byte == b'f') + }; + prop_assert!( + value.as_bytes()[2..].iter().all(is_hex_digit) + ); + Ok(()) + }; + check_str_invariants(replacements_str)?; + check_str_invariants(mask_str)?; + } + } +} diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs index 9a1246ce6b..c959061f22 100644 --- a/arch/src/x86_64/cpuid_definitions/mod.rs +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -3,51 +3,70 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::Write; use std::ops::RangeInclusive; -use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde::{Deserialize, Serialize}; use crate::x86_64::CpuidReg; +use crate::{deserialize_u32_hex, serialize_u32_hex}; pub mod intel; #[cfg(feature = "kvm")] pub mod kvm; -pub(in crate::x86_64) fn serialize_as_hex( - input: &u32, - serializer: S, -) -> Result { - // two bytes for "0x" prefix and eight for the hex encoded number - let mut buffer = [0_u8; 10]; - let _ = write!(&mut buffer[..], "{input:#010x}"); - let str = core::str::from_utf8(&buffer[..]) - .expect("the buffer should be filled with valid UTF-8 bytes"); - serializer.serialize_str(str) -} - 
-pub(in crate::x86_64) fn deserialize_from_hex<'de, D: Deserializer<'de>>( - deserializer: D, -) -> Result { - let hex = <&'de str as Deserialize>::deserialize(deserializer)?; - u32::from_str_radix(hex.strip_prefix("0x").unwrap_or(""), 16).map_err(|_| { - ::custom(format!("{hex} is not a hex encoded 32 bit integer")) - }) -} - /// Parameters for inspecting CPUID definitions. #[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] pub struct Parameters { // The leaf (EAX) parameter used with the CPUID instruction - #[serde(serialize_with = "serialize_as_hex")] - #[serde(deserialize_with = "deserialize_from_hex")] + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] pub leaf: u32, // The sub-leaf (ECX) parameter used with the CPUID instruction + #[serde( + serialize_with = "serialize_range_hex", + deserialize_with = "deserialize_range_hex" + )] pub sub_leaf: RangeInclusive, // The register we are interested in inspecting which gets filled by the CPUID instruction pub register: CpuidReg, } +// Only used for (de-)serialization +#[derive(Debug, Serialize, Deserialize)] +struct ProvisionalRangeInclusive { + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] + start: u32, + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] + end: u32, +} + +fn serialize_range_hex( + input: &RangeInclusive, + serializer: S, +) -> Result { + let provisional = ProvisionalRangeInclusive { + start: *input.start(), + end: *input.end(), + }; + provisional.serialize(serializer) +} + +fn deserialize_range_hex<'de, D: serde::Deserializer<'de>>( + deserializer: D, +) -> Result, D::Error> { + let ProvisionalRangeInclusive { start, end } = + ProvisionalRangeInclusive::deserialize(deserializer)?; + Ok(start..=end) +} + /// Describes a policy for how the corresponding CPUID data should be considered when building /// a CPU profile. 
/// @@ -125,59 +144,17 @@ impl CpuidDefinitions { #[cfg(test)] mod tests { use proptest::prelude::*; - use serde::Deserialize; - use super::{Parameters, deserialize_from_hex, serialize_as_hex}; + use super::Parameters; use crate::x86_64::CpuidReg; - /* - Check that the leaves get the string representation we expect. - This does not really matter from a functionality point of view, but we want - to read it in the expected format when manually viewing the generated CPU - profile files. - - Also assert that deserialization gives the original value back - */ - #[test] - fn hex_serialization() { - for (leaf, expected) in [ - 0x0_u32, 0x7, 0xd, 0x1e, 0x40000000, 0x4fffffff, 0x80000000, 0x8fffffff, - ] - .into_iter() - .zip([ - "0x00000000", - "0x00000007", - "0x0000000d", - "0x0000001e", - "0x40000000", - "0x4fffffff", - "0x80000000", - "0x8fffffff", - ]) { - let mut v = Vec::new(); - let mut serializer = serde_json::Serializer::new(&mut v); - serialize_as_hex(&leaf, &mut serializer).unwrap(); - let serialized = str::from_utf8(&v[..]).unwrap(); - // JSON Strings have surrounding "" hence we trim that - let serialized_trimmed = serialized - .strip_prefix('"') - .unwrap() - .strip_suffix('"') - .unwrap(); - dbg!(serialized_trimmed); - assert_eq!(serialized_trimmed, expected); - // Also check that we can deserialize this back to the original value - let mut deserializer = serde_json::Deserializer::from_str(serialized); - let deserialized = deserialize_from_hex(&mut deserializer).unwrap(); - assert_eq!(deserialized, leaf); - } - } - // Check that serializing and then deserializing a value of type `Parameter` results in the // same value we started with. + // + // Also check that the serialized numeric values are hex strings proptest! 
{ #[test] - fn parameter_serialization_roundtrip_works(leaf in 0u32..u32::MAX, x1 in 0u32..100, x2 in 0u32..100, reg in 0..4) { + fn parameter_serialization_roundtrip_works(leaf in any::(), x1 in 0u32..100, x2 in 0u32..100, reg in 0..4) { let sub_leaf_range_start = std::cmp::min(x1, x2); let sub_leaf_range_end = std::cmp::max(x1,x2); let sub_leaf = sub_leaf_range_start..=sub_leaf_range_end; @@ -196,19 +173,13 @@ mod tests { let serialized = serde_json::to_string(&cpuid_parameters).unwrap(); let deserialized: Parameters = serde_json::from_str(&serialized).unwrap(); prop_assert_eq!(&deserialized, &cpuid_parameters); - } - } - // Check that `deserialize_from_hex` does not succeed if the stringified u32 does not start with 0x - proptest! { - #[test] - fn hex_deserialization_requires_prefix(leaf in any::().prop_map(|leaf| std::iter::once('"').chain(leaf.to_string().chars()).chain(std::iter::once('"')).collect::())) { - let mut deserializer = serde_json::Deserializer::from_str(leaf.as_str()); - // Check that standard deserialization works - let result = ::deserialize(&mut deserializer); - prop_assert!(result.is_ok()); - let mut deserializer = serde_json::Deserializer::from_str(leaf.as_str()); - prop_assert!(deserialize_from_hex(&mut deserializer).is_err()); + // Check that all numeric values are hex strings when serialized to json + let params_json = serde_json::to_value(cpuid_parameters).unwrap(); + prop_assert!(params_json.get("leaf").unwrap().as_str().unwrap().starts_with("0x")); + let sub_leaf_map = params_json.get("sub_leaf").unwrap().as_object().unwrap(); + prop_assert!(sub_leaf_map.get("start").unwrap().as_str().unwrap().starts_with("0x")); + prop_assert!(sub_leaf_map.get("end").unwrap().as_str().unwrap().starts_with("0x")); } } } From 7bbbd02373b056cb48483f8cb1c5c3e39cad3a4b Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 6 Feb 2026 16:19:40 +0100 Subject: [PATCH 129/178] arch: Passthrough policies for mutable CPUID fields CPUID leaf 0x1 ECX[27] and 0x7 
ECX[4] can change at runtime, hence it does not make sense to set their values in the CPU profile. Note that Section 21.1.5 of the Intel SDM Vol.1 mentions a few other mutable CPUID fields as well, but we deem it safe to leave them as they are. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 96969c1f96..d3d1e1fa24 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -350,7 +350,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { short: "osxsave", description: "XSAVE (and related instructions) are enabled by OS", bits_range: (27, 27), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Passthrough, }, ValueDefinition { short: "avx", @@ -1428,7 +1428,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { short: "ospke", description: "OS protection keys enable", bits_range: (4, 4), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Passthrough, }, ValueDefinition { short: "waitpkg", From 1f77620a36fbe4288559c0762d0b1f27965d0ff3 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 6 Feb 2026 16:31:56 +0100 Subject: [PATCH 130/178] arch: Update CPU profiles (passthrough mutable fields) We regenerate the CPU profiles to take the policy changes from the previous commit into account. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpu_profiles/sapphire-rapids.json | 4 ++-- arch/src/x86_64/cpu_profiles/skylake.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json index aacb85a747..0d20cea878 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json @@ -97,7 +97,7 @@ }, { "replacements": "0x76fa3223", - "mask": "0x80000000" + "mask": "0x88000000" } ], [ @@ -657,7 +657,7 @@ }, { "replacements": "0x1b415f6e", - "mask": "0x00000000" + "mask": "0x00000010" } ], [ diff --git a/arch/src/x86_64/cpu_profiles/skylake.json b/arch/src/x86_64/cpu_profiles/skylake.json index df5e9b24a2..d6a641e8fc 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.json +++ b/arch/src/x86_64/cpu_profiles/skylake.json @@ -97,7 +97,7 @@ }, { "replacements": "0x76fa3223", - "mask": "0x80000000" + "mask": "0x88000000" } ], [ @@ -657,7 +657,7 @@ }, { "replacements": "0x0000000c", - "mask": "0x00000000" + "mask": "0x00000010" } ], [ From e4ae34a5c9b02dde148ec0c5ca80f9bbc2b51bd0 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 13 Feb 2026 18:59:30 +0100 Subject: [PATCH 131/178] arch: Regenerate CPU profiles after updating the serialization format After updating the serialization format we were no longer able to start CHV with a CPU profile. This is because deserialization now expects the sub-leaf ranges to be hex encoded strings. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../x86_64/cpu_profiles/sapphire-rapids.json | 1228 ++++++++--------- arch/src/x86_64/cpu_profiles/skylake.json | 1156 ++++++++-------- 2 files changed, 1192 insertions(+), 1192 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json index 0d20cea878..ecbf7bc28c 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json @@ -4,10 +4,10 @@ "adjustments": [ [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -18,10 +18,10 @@ ], [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -32,10 +32,10 @@ ], [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -46,10 +46,10 @@ ], [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -60,10 +60,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -74,10 +74,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -88,10 +88,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -102,10 +102,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -116,10 +116,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" 
}, "register": "EAX" }, @@ -130,10 +130,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -144,10 +144,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -158,10 +158,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -172,10 +172,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -186,10 +186,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -200,10 +200,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -214,10 +214,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EAX" }, @@ -228,10 +228,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EAX" }, @@ -242,10 +242,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "EAX" }, @@ -256,10 +256,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -270,10 +270,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -284,10 +284,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + 
"end": "0x2" }, "register": "EBX" }, @@ -298,10 +298,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EBX" }, @@ -312,10 +312,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EBX" }, @@ -326,10 +326,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "EBX" }, @@ -340,10 +340,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -354,10 +354,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -368,10 +368,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "ECX" }, @@ -382,10 +382,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "ECX" }, @@ -396,10 +396,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "ECX" }, @@ -410,10 +410,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "ECX" }, @@ -424,10 +424,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -438,10 +438,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -452,10 +452,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - 
"end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EDX" }, @@ -466,10 +466,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EDX" }, @@ -480,10 +480,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EDX" }, @@ -494,10 +494,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "EDX" }, @@ -508,10 +508,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -522,10 +522,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -536,10 +536,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -550,10 +550,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -564,10 +564,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -578,10 +578,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -592,10 +592,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -606,10 +606,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -620,10 +620,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - 
"start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -634,10 +634,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -648,10 +648,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -662,10 +662,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -676,10 +676,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -690,10 +690,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -704,10 +704,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -718,10 +718,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -732,10 +732,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EDX" }, @@ -746,10 +746,10 @@ ], [ { - "leaf": "0x00000009", + "leaf": "0x9", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -760,10 +760,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -774,10 +774,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -788,10 +788,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - 
"start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -802,10 +802,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -816,10 +816,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -830,10 +830,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EAX" }, @@ -844,10 +844,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -858,10 +858,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EBX" }, @@ -872,10 +872,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -886,10 +886,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "ECX" }, @@ -900,10 +900,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -914,10 +914,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EDX" }, @@ -928,10 +928,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -942,10 +942,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -956,10 +956,10 @@ ], [ { 
- "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -970,10 +970,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -984,10 +984,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -998,10 +998,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1012,10 +1012,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -1026,10 +1026,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -1040,10 +1040,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -1054,10 +1054,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EBX" }, @@ -1068,10 +1068,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "ECX" }, @@ -1082,10 +1082,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 4 + "start": "0x3", + "end": "0x4" }, "register": "EAX" }, @@ -1096,10 +1096,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 4 + "start": "0x3", + "end": "0x4" }, "register": "EBX" }, @@ -1110,10 +1110,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 4 + "start": "0x3", + "end": "0x4" }, "register": "ECX" }, @@ -1124,10 +1124,10 
@@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 4 + "start": "0x3", + "end": "0x4" }, "register": "EDX" }, @@ -1138,10 +1138,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EAX" }, @@ -1152,10 +1152,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 6, - "end": 6 + "start": "0x6", + "end": "0x6" }, "register": "EAX" }, @@ -1166,10 +1166,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 7, - "end": 7 + "start": "0x7", + "end": "0x7" }, "register": "EAX" }, @@ -1180,10 +1180,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 8, - "end": 8 + "start": "0x8", + "end": "0x8" }, "register": "EAX" }, @@ -1194,10 +1194,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 9, - "end": 9 + "start": "0x9", + "end": "0x9" }, "register": "EAX" }, @@ -1208,10 +1208,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 10, - "end": 16 + "start": "0xa", + "end": "0x10" }, "register": "EAX" }, @@ -1222,10 +1222,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 17, - "end": 17 + "start": "0x11", + "end": "0x11" }, "register": "EAX" }, @@ -1236,10 +1236,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 18, - "end": 18 + "start": "0x12", + "end": "0x12" }, "register": "EAX" }, @@ -1250,10 +1250,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 19, - "end": 63 + "start": "0x13", + "end": "0x3f" }, "register": "EAX" }, @@ -1264,10 +1264,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EBX" }, @@ -1278,10 +1278,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 6, - "end": 6 + "start": "0x6", + "end": "0x6" }, 
"register": "EBX" }, @@ -1292,10 +1292,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 7, - "end": 7 + "start": "0x7", + "end": "0x7" }, "register": "EBX" }, @@ -1306,10 +1306,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 8, - "end": 8 + "start": "0x8", + "end": "0x8" }, "register": "EBX" }, @@ -1320,10 +1320,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 9, - "end": 9 + "start": "0x9", + "end": "0x9" }, "register": "EBX" }, @@ -1334,10 +1334,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 10, - "end": 16 + "start": "0xa", + "end": "0x10" }, "register": "EBX" }, @@ -1348,10 +1348,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 17, - "end": 17 + "start": "0x11", + "end": "0x11" }, "register": "EBX" }, @@ -1362,10 +1362,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 18, - "end": 18 + "start": "0x12", + "end": "0x12" }, "register": "EBX" }, @@ -1376,10 +1376,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 19, - "end": 63 + "start": "0x13", + "end": "0x3f" }, "register": "EBX" }, @@ -1390,10 +1390,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "ECX" }, @@ -1404,10 +1404,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 6, - "end": 6 + "start": "0x6", + "end": "0x6" }, "register": "ECX" }, @@ -1418,10 +1418,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 7, - "end": 7 + "start": "0x7", + "end": "0x7" }, "register": "ECX" }, @@ -1432,10 +1432,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 8, - "end": 8 + "start": "0x8", + "end": "0x8" }, "register": "ECX" }, @@ -1446,10 +1446,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 9, - "end": 9 + 
"start": "0x9", + "end": "0x9" }, "register": "ECX" }, @@ -1460,10 +1460,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 10, - "end": 16 + "start": "0xa", + "end": "0x10" }, "register": "ECX" }, @@ -1474,10 +1474,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 17, - "end": 17 + "start": "0x11", + "end": "0x11" }, "register": "ECX" }, @@ -1488,10 +1488,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 18, - "end": 18 + "start": "0x12", + "end": "0x12" }, "register": "ECX" }, @@ -1502,10 +1502,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 19, - "end": 63 + "start": "0x13", + "end": "0x3f" }, "register": "ECX" }, @@ -1516,10 +1516,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1530,10 +1530,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -1544,10 +1544,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -1558,10 +1558,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1572,10 +1572,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -1586,10 +1586,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -1600,10 +1600,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1614,10 +1614,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", 
"sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -1628,10 +1628,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1642,10 +1642,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -1656,10 +1656,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -1670,10 +1670,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -1684,10 +1684,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EBX" }, @@ -1698,10 +1698,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EDX" }, @@ -1712,10 +1712,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "ECX" }, @@ -1726,10 +1726,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EAX" }, @@ -1740,10 +1740,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "ECX" }, @@ -1754,10 +1754,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EDX" }, @@ -1768,10 +1768,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EAX" }, @@ -1782,10 +1782,10 @@ ], [ { - "leaf": 
"0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "ECX" }, @@ -1796,10 +1796,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EDX" }, @@ -1810,10 +1810,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1824,10 +1824,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1838,10 +1838,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -1852,10 +1852,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -1866,10 +1866,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1880,10 +1880,10 @@ ], [ { - "leaf": "0x00000015", + "leaf": "0x15", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1894,10 +1894,10 @@ ], [ { - "leaf": "0x00000015", + "leaf": "0x15", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1908,10 +1908,10 @@ ], [ { - "leaf": "0x00000015", + "leaf": "0x15", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -1922,10 +1922,10 @@ ], [ { - "leaf": "0x00000016", + "leaf": "0x16", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1936,10 +1936,10 @@ ], [ { - "leaf": "0x00000016", + "leaf": "0x16", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1950,10 
+1950,10 @@ ], [ { - "leaf": "0x00000016", + "leaf": "0x16", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -1964,10 +1964,10 @@ ], [ { - "leaf": "0x00000017", + "leaf": "0x17", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1978,10 +1978,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1992,10 +1992,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2006,10 +2006,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EBX" }, @@ -2020,10 +2020,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2034,10 +2034,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "ECX" }, @@ -2048,10 +2048,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2062,10 +2062,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EDX" }, @@ -2076,10 +2076,10 @@ ], [ { - "leaf": "0x0000001c", + "leaf": "0x1c", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2090,10 +2090,10 @@ ], [ { - "leaf": "0x0000001c", + "leaf": "0x1c", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2104,10 +2104,10 @@ ], [ { - "leaf": "0x0000001c", + "leaf": "0x1c", "sub_leaf": { - "start": 0, - 
"end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2118,10 +2118,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2132,10 +2132,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -2146,10 +2146,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -2160,10 +2160,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -2174,10 +2174,10 @@ ], [ { - "leaf": "0x0000001e", + "leaf": "0x1e", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2188,10 +2188,10 @@ ], [ { - "leaf": "0x0000001e", + "leaf": "0x1e", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2202,10 +2202,10 @@ ], [ { - "leaf": "0x0000001e", + "leaf": "0x1e", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -2216,10 +2216,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2230,10 +2230,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EAX" }, @@ -2244,10 +2244,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2258,10 +2258,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EBX" }, @@ -2272,10 +2272,10 @@ ], [ { - "leaf": 
"0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2286,10 +2286,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "ECX" }, @@ -2300,10 +2300,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2314,10 +2314,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EDX" }, @@ -2328,10 +2328,10 @@ ], [ { - "leaf": "0x00000020", + "leaf": "0x20", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2342,10 +2342,10 @@ ], [ { - "leaf": "0x00000020", + "leaf": "0x20", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2356,10 +2356,10 @@ ], [ { - "leaf": "0x00000021", + "leaf": "0x21", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2370,10 +2370,10 @@ ], [ { - "leaf": "0x00000021", + "leaf": "0x21", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2384,10 +2384,10 @@ ], [ { - "leaf": "0x00000021", + "leaf": "0x21", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2398,10 +2398,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2412,10 +2412,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2426,10 +2426,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, 
"register": "ECX" }, @@ -2440,10 +2440,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -2454,10 +2454,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -2468,10 +2468,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -2482,10 +2482,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EAX" }, @@ -2496,10 +2496,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EBX" }, @@ -2510,10 +2510,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EBX" }, @@ -2524,10 +2524,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EAX" }, @@ -2538,10 +2538,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EBX" }, @@ -2552,10 +2552,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "ECX" }, @@ -2566,10 +2566,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EDX" }, @@ -2580,10 +2580,10 @@ ], [ { - "leaf": "0x00000024", + "leaf": "0x24", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2594,10 +2594,10 @@ ], [ { - "leaf": "0x00000024", + "leaf": "0x24", "sub_leaf": { - "start": 0, - "end": 0 + 
"start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2610,8 +2610,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2624,8 +2624,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2638,8 +2638,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2652,8 +2652,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2666,8 +2666,8 @@ { "leaf": "0x80000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2680,8 +2680,8 @@ { "leaf": "0x80000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2694,8 +2694,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2708,8 +2708,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2722,8 +2722,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2736,8 +2736,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2750,8 +2750,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2764,8 +2764,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2778,8 +2778,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2792,8 +2792,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": 
"0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2806,8 +2806,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2820,8 +2820,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2834,8 +2834,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2848,8 +2848,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2862,8 +2862,8 @@ { "leaf": "0x80000006", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2876,8 +2876,8 @@ { "leaf": "0x80000007", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2890,8 +2890,8 @@ { "leaf": "0x80000008", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2904,8 +2904,8 @@ { "leaf": "0x80000008", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2918,8 +2918,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2932,8 +2932,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2946,8 +2946,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2960,8 +2960,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2974,8 +2974,8 @@ { "leaf": "0x40000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2988,8 +2988,8 @@ { "leaf": "0x40000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + 
"end": "0x0" }, "register": "EDX" }, diff --git a/arch/src/x86_64/cpu_profiles/skylake.json b/arch/src/x86_64/cpu_profiles/skylake.json index d6a641e8fc..ffd77fd00f 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.json +++ b/arch/src/x86_64/cpu_profiles/skylake.json @@ -4,10 +4,10 @@ "adjustments": [ [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -18,10 +18,10 @@ ], [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -32,10 +32,10 @@ ], [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -46,10 +46,10 @@ ], [ { - "leaf": "0x00000000", + "leaf": "0x0", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -60,10 +60,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -74,10 +74,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -88,10 +88,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -102,10 +102,10 @@ ], [ { - "leaf": "0x00000001", + "leaf": "0x1", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -116,10 +116,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -130,10 +130,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -144,10 +144,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", 
"sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -158,10 +158,10 @@ ], [ { - "leaf": "0x00000002", + "leaf": "0x2", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -172,10 +172,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -186,10 +186,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -200,10 +200,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -214,10 +214,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EAX" }, @@ -228,10 +228,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EAX" }, @@ -242,10 +242,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "EAX" }, @@ -256,10 +256,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -270,10 +270,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -284,10 +284,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EBX" }, @@ -298,10 +298,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EBX" }, @@ -312,10 +312,10 @@ ], [ { - "leaf": "0x00000004", + 
"leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EBX" }, @@ -326,10 +326,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "EBX" }, @@ -340,10 +340,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -354,10 +354,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -368,10 +368,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "ECX" }, @@ -382,10 +382,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "ECX" }, @@ -396,10 +396,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "ECX" }, @@ -410,10 +410,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "ECX" }, @@ -424,10 +424,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -438,10 +438,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -452,10 +452,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EDX" }, @@ -466,10 +466,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EDX" }, @@ -480,10 +480,10 @@ ], [ { - 
"leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EDX" }, @@ -494,10 +494,10 @@ ], [ { - "leaf": "0x00000004", + "leaf": "0x4", "sub_leaf": { - "start": 5, - "end": 4294967295 + "start": "0x5", + "end": "0xffffffff" }, "register": "EDX" }, @@ -508,10 +508,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -522,10 +522,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -536,10 +536,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -550,10 +550,10 @@ ], [ { - "leaf": "0x00000005", + "leaf": "0x5", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -564,10 +564,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -578,10 +578,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -592,10 +592,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -606,10 +606,10 @@ ], [ { - "leaf": "0x00000006", + "leaf": "0x6", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -620,10 +620,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -634,10 +634,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -648,10 +648,10 @@ 
], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -662,10 +662,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -676,10 +676,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -690,10 +690,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -704,10 +704,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -718,10 +718,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -732,10 +732,10 @@ ], [ { - "leaf": "0x00000007", + "leaf": "0x7", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EDX" }, @@ -746,10 +746,10 @@ ], [ { - "leaf": "0x00000009", + "leaf": "0x9", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -760,10 +760,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -774,10 +774,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -788,10 +788,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -802,10 +802,10 @@ ], [ { - "leaf": "0x0000000a", + "leaf": "0xa", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -816,10 +816,10 @@ ], [ { 
- "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -830,10 +830,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EAX" }, @@ -844,10 +844,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -858,10 +858,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EBX" }, @@ -872,10 +872,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -886,10 +886,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "ECX" }, @@ -900,10 +900,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -914,10 +914,10 @@ ], [ { - "leaf": "0x0000000b", + "leaf": "0xb", "sub_leaf": { - "start": 1, - "end": 4294967295 + "start": "0x1", + "end": "0xffffffff" }, "register": "EDX" }, @@ -928,10 +928,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -942,10 +942,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -956,10 +956,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -970,10 +970,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": 
"0x0" }, "register": "EDX" }, @@ -984,10 +984,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -998,10 +998,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1012,10 +1012,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -1026,10 +1026,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -1040,10 +1040,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -1054,10 +1054,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EBX" }, @@ -1068,10 +1068,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "ECX" }, @@ -1082,10 +1082,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EAX" }, @@ -1096,10 +1096,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EAX" }, @@ -1110,10 +1110,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EBX" }, @@ -1124,10 +1124,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EBX" }, @@ -1138,10 +1138,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 3 + "start": 
"0x3", + "end": "0x3" }, "register": "ECX" }, @@ -1152,10 +1152,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "ECX" }, @@ -1166,10 +1166,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EDX" }, @@ -1180,10 +1180,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EDX" }, @@ -1194,10 +1194,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EAX" }, @@ -1208,10 +1208,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 6, - "end": 6 + "start": "0x6", + "end": "0x6" }, "register": "EAX" }, @@ -1222,10 +1222,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 7, - "end": 7 + "start": "0x7", + "end": "0x7" }, "register": "EAX" }, @@ -1236,10 +1236,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 8, - "end": 8 + "start": "0x8", + "end": "0x8" }, "register": "EAX" }, @@ -1250,10 +1250,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 9, - "end": 9 + "start": "0x9", + "end": "0x9" }, "register": "EAX" }, @@ -1264,10 +1264,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 10, - "end": 63 + "start": "0xa", + "end": "0x3f" }, "register": "EAX" }, @@ -1278,10 +1278,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EBX" }, @@ -1292,10 +1292,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 6, - "end": 6 + "start": "0x6", + "end": "0x6" }, "register": "EBX" }, @@ -1306,10 +1306,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 7, - 
"end": 7 + "start": "0x7", + "end": "0x7" }, "register": "EBX" }, @@ -1320,10 +1320,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 8, - "end": 8 + "start": "0x8", + "end": "0x8" }, "register": "EBX" }, @@ -1334,10 +1334,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 9, - "end": 9 + "start": "0x9", + "end": "0x9" }, "register": "EBX" }, @@ -1348,10 +1348,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 10, - "end": 63 + "start": "0xa", + "end": "0x3f" }, "register": "EBX" }, @@ -1362,10 +1362,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "ECX" }, @@ -1376,10 +1376,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 6, - "end": 6 + "start": "0x6", + "end": "0x6" }, "register": "ECX" }, @@ -1390,10 +1390,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 7, - "end": 7 + "start": "0x7", + "end": "0x7" }, "register": "ECX" }, @@ -1404,10 +1404,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 8, - "end": 8 + "start": "0x8", + "end": "0x8" }, "register": "ECX" }, @@ -1418,10 +1418,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 9, - "end": 9 + "start": "0x9", + "end": "0x9" }, "register": "ECX" }, @@ -1432,10 +1432,10 @@ ], [ { - "leaf": "0x0000000d", + "leaf": "0xd", "sub_leaf": { - "start": 10, - "end": 63 + "start": "0xa", + "end": "0x3f" }, "register": "ECX" }, @@ -1446,10 +1446,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1460,10 +1460,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -1474,10 +1474,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", 
"sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -1488,10 +1488,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1502,10 +1502,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -1516,10 +1516,10 @@ ], [ { - "leaf": "0x0000000f", + "leaf": "0xf", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -1530,10 +1530,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1544,10 +1544,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -1558,10 +1558,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1572,10 +1572,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -1586,10 +1586,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EDX" }, @@ -1600,10 +1600,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -1614,10 +1614,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EBX" }, @@ -1628,10 +1628,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EDX" }, @@ -1642,10 +1642,10 @@ ], [ { - "leaf": 
"0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "ECX" }, @@ -1656,10 +1656,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EAX" }, @@ -1670,10 +1670,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "ECX" }, @@ -1684,10 +1684,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EDX" }, @@ -1698,10 +1698,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EAX" }, @@ -1712,10 +1712,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "ECX" }, @@ -1726,10 +1726,10 @@ ], [ { - "leaf": "0x00000010", + "leaf": "0x10", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EDX" }, @@ -1740,10 +1740,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1754,10 +1754,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1768,10 +1768,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -1782,10 +1782,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -1796,10 +1796,10 @@ ], [ { - "leaf": "0x00000014", + "leaf": "0x14", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -1810,10 
+1810,10 @@ ], [ { - "leaf": "0x00000015", + "leaf": "0x15", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1824,10 +1824,10 @@ ], [ { - "leaf": "0x00000015", + "leaf": "0x15", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1838,10 +1838,10 @@ ], [ { - "leaf": "0x00000015", + "leaf": "0x15", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -1852,10 +1852,10 @@ ], [ { - "leaf": "0x00000016", + "leaf": "0x16", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1866,10 +1866,10 @@ ], [ { - "leaf": "0x00000016", + "leaf": "0x16", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1880,10 +1880,10 @@ ], [ { - "leaf": "0x00000016", + "leaf": "0x16", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -1894,10 +1894,10 @@ ], [ { - "leaf": "0x00000017", + "leaf": "0x17", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1908,10 +1908,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1922,10 +1922,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 4294967295 + "start": "0x0", + "end": "0xffffffff" }, "register": "EBX" }, @@ -1936,10 +1936,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 4294967295 + "start": "0x0", + "end": "0xffffffff" }, "register": "ECX" }, @@ -1950,10 +1950,10 @@ ], [ { - "leaf": "0x00000018", + "leaf": "0x18", "sub_leaf": { - "start": 0, - "end": 4294967295 + "start": "0x0", + "end": "0xffffffff" }, "register": "EDX" }, @@ -1964,10 +1964,10 @@ ], [ { - "leaf": "0x0000001c", + "leaf": "0x1c", "sub_leaf": { - "start": 0, - 
"end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -1978,10 +1978,10 @@ ], [ { - "leaf": "0x0000001c", + "leaf": "0x1c", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -1992,10 +1992,10 @@ ], [ { - "leaf": "0x0000001c", + "leaf": "0x1c", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2006,10 +2006,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2020,10 +2020,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -2034,10 +2034,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EBX" }, @@ -2048,10 +2048,10 @@ ], [ { - "leaf": "0x0000001d", + "leaf": "0x1d", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "ECX" }, @@ -2062,10 +2062,10 @@ ], [ { - "leaf": "0x0000001e", + "leaf": "0x1e", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2076,10 +2076,10 @@ ], [ { - "leaf": "0x0000001e", + "leaf": "0x1e", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2090,10 +2090,10 @@ ], [ { - "leaf": "0x0000001e", + "leaf": "0x1e", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -2104,10 +2104,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 4294967295 + "start": "0x0", + "end": "0xffffffff" }, "register": "EAX" }, @@ -2118,10 +2118,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 4294967295 + "start": "0x0", + "end": "0xffffffff" }, "register": "EBX" }, @@ -2132,10 +2132,10 @@ ], [ { - "leaf": 
"0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 4294967295 + "start": "0x0", + "end": "0xffffffff" }, "register": "ECX" }, @@ -2146,10 +2146,10 @@ ], [ { - "leaf": "0x0000001f", + "leaf": "0x1f", "sub_leaf": { - "start": 0, - "end": 4294967295 + "start": "0x0", + "end": "0xffffffff" }, "register": "EDX" }, @@ -2160,10 +2160,10 @@ ], [ { - "leaf": "0x00000020", + "leaf": "0x20", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2174,10 +2174,10 @@ ], [ { - "leaf": "0x00000020", + "leaf": "0x20", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2188,10 +2188,10 @@ ], [ { - "leaf": "0x00000021", + "leaf": "0x21", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2202,10 +2202,10 @@ ], [ { - "leaf": "0x00000021", + "leaf": "0x21", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2216,10 +2216,10 @@ ], [ { - "leaf": "0x00000021", + "leaf": "0x21", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2230,10 +2230,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2244,10 +2244,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2258,10 +2258,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2272,10 +2272,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, "register": "EAX" }, @@ -2286,10 +2286,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 1, - "end": 1 + "start": "0x1", + "end": "0x1" }, 
"register": "EBX" }, @@ -2300,10 +2300,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 2, - "end": 2 + "start": "0x2", + "end": "0x2" }, "register": "EAX" }, @@ -2314,10 +2314,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 3, - "end": 3 + "start": "0x3", + "end": "0x3" }, "register": "EAX" }, @@ -2328,10 +2328,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EBX" }, @@ -2342,10 +2342,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 4, - "end": 4 + "start": "0x4", + "end": "0x4" }, "register": "EBX" }, @@ -2356,10 +2356,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EAX" }, @@ -2370,10 +2370,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EBX" }, @@ -2384,10 +2384,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "ECX" }, @@ -2398,10 +2398,10 @@ ], [ { - "leaf": "0x00000023", + "leaf": "0x23", "sub_leaf": { - "start": 5, - "end": 5 + "start": "0x5", + "end": "0x5" }, "register": "EDX" }, @@ -2412,10 +2412,10 @@ ], [ { - "leaf": "0x00000024", + "leaf": "0x24", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2426,10 +2426,10 @@ ], [ { - "leaf": "0x00000024", + "leaf": "0x24", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2442,8 +2442,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2456,8 +2456,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ 
-2470,8 +2470,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2484,8 +2484,8 @@ { "leaf": "0x80000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2498,8 +2498,8 @@ { "leaf": "0x80000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2512,8 +2512,8 @@ { "leaf": "0x80000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2526,8 +2526,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2540,8 +2540,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2554,8 +2554,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2568,8 +2568,8 @@ { "leaf": "0x80000002", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2582,8 +2582,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2596,8 +2596,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2610,8 +2610,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2624,8 +2624,8 @@ { "leaf": "0x80000003", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2638,8 +2638,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2652,8 +2652,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2666,8 
+2666,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2680,8 +2680,8 @@ { "leaf": "0x80000004", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2694,8 +2694,8 @@ { "leaf": "0x80000006", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2708,8 +2708,8 @@ { "leaf": "0x80000007", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2722,8 +2722,8 @@ { "leaf": "0x80000008", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2736,8 +2736,8 @@ { "leaf": "0x80000008", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2750,8 +2750,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2764,8 +2764,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EBX" }, @@ -2778,8 +2778,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "ECX" }, @@ -2792,8 +2792,8 @@ { "leaf": "0x40000000", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, @@ -2806,8 +2806,8 @@ { "leaf": "0x40000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EAX" }, @@ -2820,8 +2820,8 @@ { "leaf": "0x40000001", "sub_leaf": { - "start": 0, - "end": 0 + "start": "0x0", + "end": "0x0" }, "register": "EDX" }, From dd47149ed35ab145d12b1aad67a08ad04518b627 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 18 Feb 2026 11:53:10 +0100 Subject: [PATCH 132/178] misc: move MSR retrieval to Hypervisor trait (KVM) This refactors MSR handling for KVM by moving the retrieval logic into the Hypervisor trait. 
It streamlines the integration with CpuId entries and simplifies the implementation of CpuProfiles (follow-up work). The MSRs vector acts as a buffer: supported MSRs are written during VM snapshot and later restored by KVM (for example, after live migration). With CPU profiles, some of these MSRs may become invalid, so we need a mechanism to update the MSR buffer accordingly. Since the hypervisor crate (where KvmVm resides) has no awareness of CPU profiles, it is more appropriate for the buffer to live in CpuManager, allowing dynamic updates based on the active CPU profile. Currently, this change only addresses KVM for rapid progress in our fork. Upstreaming will require proper consideration for other hypervisors, e.g., MSHV. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- hypervisor/src/hypervisor.rs | 5 +++++ hypervisor/src/kvm/mod.rs | 29 ++++++++++++++++++++++------- hypervisor/src/mshv/mod.rs | 5 +++++ hypervisor/src/vm.rs | 8 +++++++- vmm/src/cpu.rs | 22 +++++++++++++++------- vmm/src/vm.rs | 4 ++-- 6 files changed, 56 insertions(+), 17 deletions(-) diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index 05852a230f..ee50f0cefd 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -16,6 +16,8 @@ use thiserror::Error; #[cfg(target_arch = "x86_64")] use crate::arch::x86::CpuIdEntry; #[cfg(target_arch = "x86_64")] +use crate::arch::x86::MsrEntry; +#[cfg(target_arch = "x86_64")] use crate::cpu::CpuVendor; #[cfg(feature = "tdx")] use crate::kvm::TdxCapabilities; @@ -128,6 +130,9 @@ pub trait Hypervisor: Send + Sync { /// Get the supported CpuID /// fn get_supported_cpuid(&self) -> Result>; + /// Get the supported MSRs. 
+ #[cfg(target_arch = "x86_64")] + fn get_supported_msrs(&self) -> Result>; /// /// Check particular extensions if any /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 76147e37bd..85b095ea61 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -567,8 +567,6 @@ struct KvmMemorySlot { /// Wrapper over KVM VM ioctls. pub struct KvmVm { fd: Arc, - #[cfg(target_arch = "x86_64")] - msrs: Vec, #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] sev_fd: Option, #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] @@ -825,6 +823,7 @@ impl vm::Vm for KvmVm { &self, id: u32, vm_ops: Option>, + #[cfg(target_arch = "x86_64")] msrs: Vec, ) -> vm::Result> { let fd = self .fd @@ -844,7 +843,7 @@ impl vm::Vm for KvmVm { let vcpu = KvmVcpu { fd, #[cfg(target_arch = "x86_64")] - msrs: self.msrs.clone(), + msrs, vm_ops, #[cfg(target_arch = "x86_64")] hyperv_synic: AtomicBool::new(false), @@ -1632,7 +1631,6 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(Arc::new(KvmVm { fd: Arc::new(fd), - msrs, dirty_log_slots: RwLock::new(HashMap::new()), #[cfg(feature = "sev_snp")] sev_fd, @@ -1672,6 +1670,23 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(v) } + #[cfg(target_arch = "x86_64")] + fn get_supported_msrs(&self) -> hypervisor::Result> { + let msr_list = self.get_msr_list()?; + let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; + let mut msrs: Vec = vec![ + MsrEntry { + ..Default::default() + }; + num_msrs + ]; + let indices = msr_list.as_slice(); + for (pos, index) in indices.iter().enumerate() { + msrs[pos].index = *index; + } + Ok(msrs) + } + #[cfg(target_arch = "aarch64")] /// /// Retrieve AArch64 host maximum IPA size supported by KVM. 
@@ -1792,7 +1807,7 @@ impl KvmVcpu { /// let kvm = KvmHypervisor::new().unwrap(); /// let hypervisor = Arc::new(kvm); /// let vm = hypervisor.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); -/// let vcpu = vm.create_vcpu(0, None).unwrap(); +/// let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); /// ``` impl cpu::Vcpu for KvmVcpu { /// @@ -2860,7 +2875,7 @@ impl cpu::Vcpu for KvmVcpu { /// let hv = Arc::new(kvm); /// let vm = hv.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// vm.enable_split_irq().unwrap(); - /// let vcpu = vm.create_vcpu(0, None).unwrap(); + /// let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); /// let state = vcpu.state().unwrap(); /// ``` fn state(&self) -> cpu::Result { @@ -3099,7 +3114,7 @@ impl cpu::Vcpu for KvmVcpu { /// let hv = Arc::new(kvm); /// let vm = hv.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// vm.enable_split_irq().unwrap(); - /// let vcpu = vm.create_vcpu(0, None).unwrap(); + /// let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); /// let state = vcpu.state().unwrap(); /// vcpu.set_state(&state).unwrap(); /// ``` diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 644bc32368..888bd0738b 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -395,6 +395,10 @@ impl hypervisor::Hypervisor for MshvHypervisor { Ok(cpuid) } + fn get_supported_msrs(&self) -> hypervisor::Result> { + todo!() + } + /// Get maximum number of vCPUs fn get_max_vcpus(&self) -> u32 { // TODO: Using HV_MAXIMUM_PROCESSORS would be better @@ -1938,6 +1942,7 @@ impl vm::Vm for MshvVm { &self, id: u32, vm_ops: Option>, + #[cfg(target_arch = "x86_64")] _msrs: Vec, ) -> vm::Result> { let id: u8 = id.try_into().unwrap(); let vcpu_fd = self diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index f671204c0c..e0d0e3b057 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -30,6 +30,7 @@ use 
crate::arch::aarch64::gic::{Vgic, VgicConfig}; use crate::arch::riscv64::aia::{Vaia, VaiaConfig}; #[cfg(feature = "tdx")] use crate::arch::x86::CpuIdEntry; +use crate::arch::x86::MsrEntry; use crate::cpu::Vcpu; use crate::{IoEventAddress, IrqRoutingEntry}; @@ -324,7 +325,12 @@ pub trait Vm: Send + Sync + Any { /// Unregister an event that will, when signaled, trigger the `gsi` IRQ. fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<()>; /// Creates a new KVM vCPU file descriptor and maps the memory corresponding - fn create_vcpu(&self, id: u32, vm_ops: Option>) -> Result>; + fn create_vcpu( + &self, + id: u32, + vm_ops: Option>, + #[cfg(target_arch = "x86_64")] msrs: Vec, + ) -> Result>; #[cfg(target_arch = "aarch64")] fn create_vgic(&self, config: &VgicConfig) -> Result>>; #[cfg(target_arch = "riscv64")] diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 02e984c6b0..b7bbd1ef5e 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -47,7 +47,7 @@ use hypervisor::arch::aarch64::gic::Vgic; use hypervisor::arch::aarch64::regs::{ID_AA64MMFR0_EL1, TCR_EL1, TTBR1_EL1}; #[cfg(target_arch = "x86_64")] use hypervisor::arch::x86::CpuIdEntry; -#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] +#[cfg(target_arch = "x86_64")] use hypervisor::arch::x86::MsrEntry; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use hypervisor::arch::x86::SpecialRegisters; @@ -531,9 +531,10 @@ impl Vcpu { vm: &dyn hypervisor::Vm, vm_ops: Option>, #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, + #[cfg(target_arch = "x86_64")] msrs: Vec, ) -> Result { let vcpu = vm - .create_vcpu(apic_id, vm_ops) + .create_vcpu(apic_id, vm_ops, msrs) .map_err(|e| Error::VcpuCreate(e.into()))?; // Initially the cpuid per vCPU is the one supported by this VM. 
Ok(Vcpu { @@ -718,6 +719,8 @@ pub struct CpuManager { interrupt_controller: Option>>, #[cfg(target_arch = "x86_64")] cpuid: Vec, + #[cfg(target_arch = "x86_64")] + msrs: Vec, #[cfg_attr(target_arch = "aarch64", allow(dead_code))] vm: Arc, vcpus_kill_signalled: Arc, @@ -951,6 +954,9 @@ impl CpuManager { interrupt_controller: None, #[cfg(target_arch = "x86_64")] cpuid: Vec::new(), + msrs: hypervisor + .get_supported_msrs() + .map_err(|e| Error::VcpuCreate(e.into()))?, vm, vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), @@ -1024,6 +1030,8 @@ impl CpuManager { Some(self.vm_ops.clone()), #[cfg(target_arch = "x86_64")] self.hypervisor.get_cpu_vendor(), + #[cfg(target_arch = "x86_64")] + self.msrs.clone(), )?; if let Some(snapshot) = snapshot { @@ -3397,7 +3405,7 @@ mod unit_tests { hv.check_required_extensions().unwrap(); // Calling get_lapic will fail if there is no irqchip before hand. vm.create_irq_chip().unwrap(); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); let klapic_before: LapicState = vcpu.get_lapic().unwrap(); // Compute the value that is expected to represent LVT0 and LVT1. @@ -3422,7 +3430,7 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); setup_fpu(vcpu.as_ref()).unwrap(); let expected_fpu: FpuState = FpuState { @@ -3448,7 +3456,7 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); setup_msrs(vcpu.as_ref()).unwrap(); // This test will check against the last MSR entry configured (the tenth one). 
@@ -3476,7 +3484,7 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); expected_regs.set_rflags(0x0000000000000002u64); @@ -3502,7 +3510,7 @@ mod unit_tests { let vm = hv .create_vm(HypervisorVmConfig::default()) .expect("new VM fd creation failed"); - let vcpu = vm.create_vcpu(0, None).unwrap(); + let vcpu = vm.create_vcpu(0, None, vec![]).unwrap(); let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); expected_regs.set_rflags(0x0000000000000002u64); diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index e2ed4225fa..ecc7fc6aa7 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -4021,7 +4021,7 @@ mod unit_tests { mem.write_slice(&code, load_addr) .expect("Writing code to memory failed"); - let mut vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); + let mut vcpu = vm.create_vcpu(0, None, vec![]).expect("new Vcpu failed"); let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); vcpu_sregs.cs.base = 0; @@ -4158,7 +4158,7 @@ pub fn test_vm() { mem.write_slice(&code, load_addr) .expect("Writing code to memory failed"); - let mut vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); + let mut vcpu = vm.create_vcpu(0, None, vec![]).expect("new Vcpu failed"); let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); vcpu_sregs.cs.base = 0; From 27b843af1dfc07f0664048ab90789504b32f7a3a Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Sun, 25 Jan 2026 16:47:35 +0100 Subject: [PATCH 133/178] hypervisor: Add get_msr_based_features method In order to apply CPU profiles we need to modify the MSR-based features according to the profile. The first step is to obtain these MSR-based features from the hypervisor as introduced in this commit. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- hypervisor/src/hypervisor.rs | 11 ++++++++++- hypervisor/src/kvm/mod.rs | 32 ++++++++++++++++++++++++++++++++ hypervisor/src/kvm/x86_64/mod.rs | 1 + hypervisor/src/mshv/mod.rs | 5 +++++ vmm/src/seccomp_filters.rs | 7 +++++++ 5 files changed, 55 insertions(+), 1 deletion(-) diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index ee50f0cefd..5888d76181 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -62,6 +62,10 @@ pub enum HypervisorError { #[error("Failed to get the list of supported MSRs")] GetMsrList(#[source] anyhow::Error), /// + /// Failed to get MSRs from the hypervisor. + #[error("Failed to get MSRs")] + GetMsr(#[source] anyhow::Error), + /// /// API version is not compatible /// #[error("Incompatible API version")] @@ -130,9 +134,14 @@ pub trait Hypervisor: Send + Sync { /// Get the supported CpuID /// fn get_supported_cpuid(&self) -> Result>; - /// Get the supported MSRs. #[cfg(target_arch = "x86_64")] + /// Get the supported MSRs. 
fn get_supported_msrs(&self) -> Result>; + #[cfg(target_arch = "x86_64")] + /// + /// Get the MSR-based features supported by the hardware and hypervisor + /// + fn get_msr_based_features(&self) -> Result>; /// /// Check particular extensions if any /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 85b095ea61..64f1124d41 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -34,6 +34,7 @@ use kvm_bindings::kvm_create_guest_memfd; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; #[cfg(feature = "sev_snp")] use log::debug; +use log::trace; #[cfg(target_arch = "x86_64")] use log::warn; use vmm_sys_util::errno; @@ -1687,6 +1688,37 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(msrs) } + #[cfg(target_arch = "x86_64")] + fn get_msr_based_features(&self) -> hypervisor::Result> { + let list = self + .kvm + .get_msr_feature_index_list() + .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))?; + let list_len = list.as_fam_struct_ref().nmsrs; + trace!("number of MSR-based feature register addresses:={list_len}"); + let kvm_msrs: Vec = list + .as_slice() + .iter() + .copied() + .map(|index| kvm_msr_entry { + index, + ..Default::default() + }) + .collect(); + let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap(); + let num_writes = self + .kvm + .get_msrs(&mut kvm_msrs) + .map_err(|e| hypervisor::HypervisorError::GetMsr(e.into()))?; + trace!("number of MSR-based feature MSRs written to by KVM:={num_writes}"); + Ok(kvm_msrs + .as_slice() + .iter() + .copied() + .map(MsrEntry::from) + .collect()) + } + #[cfg(target_arch = "aarch64")] /// /// Retrieve AArch64 host maximum IPA size supported by KVM. 
diff --git a/hypervisor/src/kvm/x86_64/mod.rs b/hypervisor/src/kvm/x86_64/mod.rs index 62185fd84e..47f52d2be2 100644 --- a/hypervisor/src/kvm/x86_64/mod.rs +++ b/hypervisor/src/kvm/x86_64/mod.rs @@ -66,6 +66,7 @@ pub fn check_required_kvm_extensions(kvm: &Kvm) -> KvmResult<()> { check_extension!(Cap::VcpuEvents); check_extension!(Cap::Xcrs); check_extension!(Cap::Xsave); + check_extension!(Cap::GetMsrFeatures); Ok(()) } diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 888bd0738b..0d0293dddb 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -395,9 +395,14 @@ impl hypervisor::Hypervisor for MshvHypervisor { Ok(cpuid) } + #[cfg(target_arch = "x86_64")] fn get_supported_msrs(&self) -> hypervisor::Result> { todo!() } + #[cfg(target_arch = "x86_64")] + fn get_msr_based_features(&self) -> hypervisor::Result> { + unimplemented!() + } /// Get maximum number of vCPUs fn get_max_vcpus(&self) -> u32 { diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index bd85b0776e..797a7b2cc7 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -443,6 +443,7 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> const KVM_GET_FPU: u64 = 0x81a0_ae8c; const KVM_GET_LAPIC: u64 = 0x8400_ae8e; const KVM_GET_MSR_INDEX_LIST: u64 = 0xc004_ae02; + const KVM_GET_MSR_FEATURE_INDEX_LIST: u64 = 0xc004_ae0a; const KVM_GET_MSRS: u64 = 0xc008_ae88; const KVM_GET_SREGS: u64 = 0x8138_ae83; const KVM_GET_TSC_KHZ: u64 = 0xaea3; @@ -472,6 +473,12 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_FPU)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_LAPIC)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_MSR_INDEX_LIST)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_GET_MSR_FEATURE_INDEX_LIST + )?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_MSRS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_SREGS)?], and![Cond::new(1, ArgLen::Dword, 
Eq, KVM_GET_TSC_KHZ)?], From 846d221c55d215ceb1a44a44dba4b5500cc2e52f Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 18 Feb 2026 11:25:47 +0100 Subject: [PATCH 134/178] hypervisor: Add get_msr_index_list method to the hypervisor trait We want to deny MSRs that are not available, or should not be available for a given CPU profile, but in order to do that we must also have some means to see which MSRs are supported by the host. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- hypervisor/src/hypervisor.rs | 6 ++++++ hypervisor/src/kvm/mod.rs | 13 +++++++++++++ hypervisor/src/mshv/mod.rs | 7 +++++++ 3 files changed, 26 insertions(+) diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index 5888d76181..4a70fb7ae3 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -142,6 +142,12 @@ pub trait Hypervisor: Send + Sync { /// Get the MSR-based features supported by the hardware and hypervisor /// fn get_msr_based_features(&self) -> Result>; + + /// + /// Get the MSR indices supported by the hardware and hypervisor + /// + #[cfg(target_arch = "x86_64")] + fn get_msr_index_list(&self) -> Result>; /// /// Check particular extensions if any /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 64f1124d41..b7b9a9c9e8 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -1719,6 +1719,19 @@ impl hypervisor::Hypervisor for KvmHypervisor { .collect()) } + #[cfg(target_arch = "x86_64")] + fn get_msr_index_list(&self) -> hypervisor::Result> { + let list = self.get_msr_list()?; + let num_msrs = list.as_fam_struct_ref().nmsrs; + let actual_num_msrs = list.as_slice().len(); + assert_eq!( + actual_num_msrs, num_msrs as usize, + "BUG: the length of the MSR Index LIST FAM wrapper does not coincide with + the nmrs field value " + ); + Ok(list.as_slice().to_vec()) + } + #[cfg(target_arch = "aarch64")] /// /// Retrieve AArch64 host maximum IPA size supported by 
KVM. diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 0d0293dddb..f92c647b83 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -399,6 +399,13 @@ impl hypervisor::Hypervisor for MshvHypervisor { fn get_supported_msrs(&self) -> hypervisor::Result> { todo!() } + + #[cfg(target_arch = "x86_64")] + fn get_msr_index_list(&self) -> hypervisor::Result> { + // TODO: We should probably implement this before upstreaming + unimplemented!() + } + #[cfg(target_arch = "x86_64")] fn get_msr_based_features(&self) -> hypervisor::Result> { unimplemented!() From 96aa3bf3fb30ca9753b8760d9843e6fd4bac6b8e Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 20 Feb 2026 11:00:52 +0100 Subject: [PATCH 135/178] misc: Don't expose MSR buffer directly through the hypervisor We realized that we only need a method for obtaining supported MSR indices and the buffer can be built from that in the `CpuManager`'s constructor. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- hypervisor/src/hypervisor.rs | 3 --- hypervisor/src/kvm/mod.rs | 17 ----------------- hypervisor/src/mshv/mod.rs | 7 +------ hypervisor/src/vm.rs | 7 ++++++- vmm/src/cpu.rs | 29 ++++++++++++++++++++++------- 5 files changed, 29 insertions(+), 34 deletions(-) diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index 4a70fb7ae3..629306c3cf 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -135,9 +135,6 @@ pub trait Hypervisor: Send + Sync { /// fn get_supported_cpuid(&self) -> Result>; #[cfg(target_arch = "x86_64")] - /// Get the supported MSRs. 
- fn get_supported_msrs(&self) -> Result>; - #[cfg(target_arch = "x86_64")] /// /// Get the MSR-based features supported by the hardware and hypervisor /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index b7b9a9c9e8..b34d6224be 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -1671,23 +1671,6 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(v) } - #[cfg(target_arch = "x86_64")] - fn get_supported_msrs(&self) -> hypervisor::Result> { - let msr_list = self.get_msr_list()?; - let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; - let mut msrs: Vec = vec![ - MsrEntry { - ..Default::default() - }; - num_msrs - ]; - let indices = msr_list.as_slice(); - for (pos, index) in indices.iter().enumerate() { - msrs[pos].index = *index; - } - Ok(msrs) - } - #[cfg(target_arch = "x86_64")] fn get_msr_based_features(&self) -> hypervisor::Result> { let list = self diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index f92c647b83..72eb00eff0 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -395,14 +395,9 @@ impl hypervisor::Hypervisor for MshvHypervisor { Ok(cpuid) } - #[cfg(target_arch = "x86_64")] - fn get_supported_msrs(&self) -> hypervisor::Result> { - todo!() - } - #[cfg(target_arch = "x86_64")] fn get_msr_index_list(&self) -> hypervisor::Result> { - // TODO: We should probably implement this before upstreaming + // TODO: We need to implement this before upstreaming unimplemented!() } diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index e0d0e3b057..e567f2276e 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -30,6 +30,7 @@ use crate::arch::aarch64::gic::{Vgic, VgicConfig}; use crate::arch::riscv64::aia::{Vaia, VaiaConfig}; #[cfg(feature = "tdx")] use crate::arch::x86::CpuIdEntry; +#[cfg(target_arch = "x86_64")] use crate::arch::x86::MsrEntry; use crate::cpu::Vcpu; use crate::{IoEventAddress, IrqRoutingEntry}; @@ -325,11 +326,15 @@ pub trait Vm: Send 
+ Sync + Any { /// Unregister an event that will, when signaled, trigger the `gsi` IRQ. fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<()>; /// Creates a new KVM vCPU file descriptor and maps the memory corresponding + /// + /// The `msr_buffer` is used to store MSR state. The entries given here are + /// expected to hold indices/register addresses supported by both the host's + /// hardware and the hypervisor. fn create_vcpu( &self, id: u32, vm_ops: Option>, - #[cfg(target_arch = "x86_64")] msrs: Vec, + #[cfg(target_arch = "x86_64")] msr_buffer: Vec, ) -> Result>; #[cfg(target_arch = "aarch64")] fn create_vgic(&self, config: &VgicConfig) -> Result>>; diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index b7bbd1ef5e..49002d47dc 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -525,16 +525,17 @@ impl Vcpu { /// * `vm` - The virtual machine this vcpu will get attached to. /// * `vm_ops` - Optional object for exit handling. /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) + /// * `msr_buffer`(x86_64 only) - A buffer for supported MSRs. pub fn new( id: u32, apic_id: u32, vm: &dyn hypervisor::Vm, vm_ops: Option>, #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, - #[cfg(target_arch = "x86_64")] msrs: Vec, + #[cfg(target_arch = "x86_64")] msr_buffer: Vec, ) -> Result { let vcpu = vm - .create_vcpu(apic_id, vm_ops, msrs) + .create_vcpu(apic_id, vm_ops, msr_buffer) .map_err(|e| Error::VcpuCreate(e.into()))?; // Initially the cpuid per vCPU is the one supported by this VM. 
Ok(Vcpu { @@ -720,7 +721,8 @@ pub struct CpuManager { #[cfg(target_arch = "x86_64")] cpuid: Vec, #[cfg(target_arch = "x86_64")] - msrs: Vec, + /// A buffer for MSRs supported by the hardware and hypervisor + msr_buffer: Vec, #[cfg_attr(target_arch = "aarch64", allow(dead_code))] vm: Arc, vcpus_kill_signalled: Arc, @@ -954,9 +956,8 @@ impl CpuManager { interrupt_controller: None, #[cfg(target_arch = "x86_64")] cpuid: Vec::new(), - msrs: hypervisor - .get_supported_msrs() - .map_err(|e| Error::VcpuCreate(e.into()))?, + #[cfg(target_arch = "x86_64")] + msr_buffer: Self::construct_msr_buffer(hypervisor.as_ref())?, vm, vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), @@ -984,6 +985,20 @@ impl CpuManager { }))) } + #[cfg(target_arch = "x86_64")] + fn construct_msr_buffer(hypervisor: &dyn hypervisor::Hypervisor) -> Result> { + let msr_indices = hypervisor + .get_msr_index_list() + .map_err(|e| Error::VcpuCreate(e.into()))?; + Ok(msr_indices + .into_iter() + .map(|index| MsrEntry { + index, + ..Default::default() + }) + .collect()) + } + #[cfg(target_arch = "x86_64")] pub fn populate_cpuid( &mut self, @@ -1031,7 +1046,7 @@ impl CpuManager { #[cfg(target_arch = "x86_64")] self.hypervisor.get_cpu_vendor(), #[cfg(target_arch = "x86_64")] - self.msrs.clone(), + self.msr_buffer.clone(), )?; if let Some(snapshot) = snapshot { From a0d6833bd97c972b40f3b303f37b8abae66274a8 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 20 Feb 2026 15:04:02 +0100 Subject: [PATCH 136/178] hypervisor: msr_filter method on Vm trait In order to ensure that MSRs that are not compatible with a given CPU profile do not get accessed by the guests we need to introduce functionality to deny such MSRs via filters. The implementation introduced here is mostly a temporary workaround until https://github.com/rust-vmm/kvm/pull/359 is integrated in CHV.
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- hypervisor/src/kvm/mod.rs | 69 ++++++++++++++++++++++++++++++++++++-- hypervisor/src/lib.rs | 23 +++++++++++++ hypervisor/src/mshv/mod.rs | 9 +++++ hypervisor/src/vm.rs | 31 +++++++++++++++-- vmm/src/seccomp_filters.rs | 2 ++ 5 files changed, 130 insertions(+), 4 deletions(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index b34d6224be..a8c516c51f 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -73,13 +73,13 @@ use x86_64::check_required_kvm_extensions; #[cfg(target_arch = "x86_64")] pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState}; -#[cfg(target_arch = "x86_64")] -use crate::ClockData; #[cfg(target_arch = "x86_64")] use crate::arch::x86::{ CpuIdEntry, FpuState, LapicState, MTRR_MSR_INDICES, MsrEntry, NUM_IOAPIC_PINS, SpecialRegisters, XsaveState, }; +#[cfg(target_arch = "x86_64")] +use crate::{ClockData, MsrFilterRange}; use crate::{ CpuState, HypervisorType, HypervisorVmConfig, InterruptSourceConfig, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, USER_MEMORY_REGION_GUEST_MEMFD, @@ -212,6 +212,8 @@ const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004; const TDG_VP_VMCALL_SUCCESS: u64 = 0; #[cfg(feature = "tdx")] const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000; +/// Maximum number of MSR ranges that KVM can filter +pub const KVM_MSR_FILTER_MAX_RANGES: usize = 16; #[cfg(feature = "tdx")] ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong); @@ -692,6 +694,69 @@ impl KvmVm { /// let vm = hypervisor.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// ``` impl vm::Vm for KvmVm { + #[cfg(target_arch = "x86_64")] + fn msr_filter<'a>(&self, filter: &[MsrFilterRange<'a>], default_deny: bool) -> vm::Result<()> { + // Found here https://github.com/torvalds/linux/blob/master/include/uapi/linux/kvm.h#L929C9-L929C31 + const 
KVM_CAP_MSR_FILTER: u64 = 189; + // Can be computed from https://github.com/torvalds/linux/blob/master/include/uapi/linux/kvm.h#L1458 + const KVM_X86_SET_MSR_FILTER: u64 = 0x4188aec6; + + let cap_result = self.fd.check_extension_raw(KVM_CAP_MSR_FILTER); + if cap_result <= 0 { + return Err(vm::HypervisorVmError::MissingMsrFilterCapability { + error_code: cap_result, + }); + } + // Workaround until https://github.com/rust-vmm/kvm/pull/359 is merged + #[repr(C)] + #[derive(Clone, Copy, Default)] + struct KvmMsrFilterRange { + flags: u32, + nmrs: u32, + base: u32, + bitmap: *const u8, + } + + #[repr(C)] + struct KvmMsrFilter { + flags: u32, + ranges: [KvmMsrFilterRange; KVM_MSR_FILTER_MAX_RANGES], + } + + let mut kvm_filter = KvmMsrFilter { + flags: u32::from(default_deny), + ranges: [Default::default(); KVM_MSR_FILTER_MAX_RANGES], + }; + + let num_ranges = kvm_filter.ranges.len(); + if num_ranges > KVM_MSR_FILTER_MAX_RANGES { + return Err(vm::HypervisorVmError::TooManyMsrFilterRanges { + num_ranges, + num_permitted_ranges: KVM_MSR_FILTER_MAX_RANGES, + }); + } + + for (range, kvm_range) in filter.iter().zip(kvm_filter.ranges.iter_mut()) { + kvm_range.flags = range.flags; + kvm_range.nmrs = range.nmsrs; + kvm_range.base = range.base; + kvm_range.bitmap = range.bitmap.as_ptr(); + } + // SAFETY: SYSCALL with valid parameters. All raw pointers are derived from references that are valid for the duration of this entire method call. 
+ let result = unsafe { + libc::ioctl( + self.fd.as_raw_fd(), + KVM_X86_SET_MSR_FILTER, + (&raw const kvm_filter).cast::(), + ) + }; + if result == 0 { + Ok(()) + } else { + Err(vm::HypervisorVmError::MsrFilter { error_code: result }) + } + } + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] fn sev_snp_init(&self, guest_policy: igvm_defs::SnpPolicy) -> vm::Result<()> { self.sev_fd diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 3b76dc6add..0d5dc91d74 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -212,6 +212,29 @@ pub enum VcpuInit { Mshv(mshv_bindings::MshvVcpuInit), } +#[cfg(target_arch = "x86_64")] +/// Parameters for filtering read and/or write accesses to a range of MSRs. +#[derive(Debug, Clone, Copy, Default)] +pub struct MsrFilterRange<'a> { + /// The type of operation(s) to filter: `1 << 0`, `1 << 1`, `(1 << 0) | (1 << 1)` refer to read, write, read and write respectively. + // TODO: Consider using an enum here + pub flags: u32, + /// The number of MSRs in this filter range. + pub nmsrs: u32, + /// The first MSR index the bitmap starts at. + pub base: u32, + /// For bit position P (0 <= P < nmsrs), the operations in `flags` are allowed for MSR:= base + P if the bit is set, otherwise they are denied. + pub bitmap: &'a [u8], +} + +impl<'a> MsrFilterRange<'a> { + /// Modify the `flags` so that the ops in the bitmap refer to both reads and writes.
+ pub fn with_read_write_flags(mut self) -> Self { + self.flags = 1 | (1 << 1); + self + } +} + #[derive(Debug, Clone, PartialEq)] pub enum RegList { #[cfg(all(feature = "kvm", any(target_arch = "aarch64", target_arch = "riscv64")))] diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 72eb00eff0..de98c256e9 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -2620,4 +2620,13 @@ impl vm::Vm for MshvVm { } Ok(()) } + + #[cfg(target_arch = "x86_64")] + fn msr_filter<'a>( + &self, + _filter: &[crate::MsrFilterRange<'a>], + _default_deny: bool, + ) -> vm::Result<()> { + todo!() + } } diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index e567f2276e..41a7d27ba9 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -22,8 +22,6 @@ use igvm_defs::SnpPolicy; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; -#[cfg(target_arch = "x86_64")] -use crate::ClockData; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::{Vgic, VgicConfig}; #[cfg(target_arch = "riscv64")] @@ -33,6 +31,8 @@ use crate::arch::x86::CpuIdEntry; #[cfg(target_arch = "x86_64")] use crate::arch::x86::MsrEntry; use crate::cpu::Vcpu; +#[cfg(target_arch = "x86_64")] +use crate::{ClockData, MsrFilterRange}; use crate::{IoEventAddress, IrqRoutingEntry}; /// @@ -63,6 +63,22 @@ pub enum HypervisorVmError { #[error("Failed to create Vcpu")] CreateVcpu(#[source] anyhow::Error), /// + /// Could not filter the given MSRs because too many MSR filter ranges were provided. + /// + #[error( + "Too many separate MSR ranges to filter. Number of given ranges:={num_ranges}, but number of permitted ranges:={num_permitted_ranges}" + )] + TooManyMsrFilterRanges { + num_ranges: usize, + num_permitted_ranges: usize, + }, + #[error( + "Could not filter the given MSR ranges: Failed to confirm MSR filtering capability: error_code:={error_code}" + )] + MissingMsrFilterCapability { error_code: i32 }, + #[error("Could not filter the given MSR ranges. 
Error code:={error_code}")] + MsrFilter { error_code: i32 }, + /// /// Identity map address error /// #[error("Failed to set identity map address")] @@ -325,6 +341,17 @@ pub trait Vm: Send + Sync + Any { fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<()>; /// Unregister an event that will, when signaled, trigger the `gsi` IRQ. fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<()>; + #[cfg(target_arch = "x86_64")] + /// Filter the given ranges of MSRs. This can be used to specify certain MSRs + /// that guests may not access. + /// + /// If the `default_deny` flag is set, MSRs that do not match any of the given + /// ranges, will be automatically denied, otherwise they are allowed. + /// + /// # Important + /// + /// This method should be called once before creating any vCPUs and never again. + fn msr_filter<'a>(&self, filter: &[MsrFilterRange<'a>], default_deny: bool) -> Result<()>; /// Creates a new KVM vCPU file descriptor and maps the memory corresponding /// /// The `msr_buffer` is used to store MSR state. 
The entries given here are diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 797a7b2cc7..8b4996ccc5 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -84,6 +84,7 @@ mod kvm { pub const KVM_CHECK_EXTENSION: u64 = 0xae03; pub const KVM_GET_VCPU_MMAP_SIZE: u64 = 0xae04; pub const KVM_CREATE_VCPU: u64 = 0xae41; + pub const KVM_X86_SET_MSR_FILTER: u64 = 0x4188aec6; pub const KVM_CREATE_IRQCHIP: u64 = 0xae60; pub const KVM_RUN: u64 = 0xae80; pub const KVM_SET_MP_STATE: u64 = 0x4004_ae99; @@ -235,6 +236,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen Ok(or![ and![Cond::new(1, ArgLen::Dword, Eq, KVM_CHECK_EXTENSION)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_DEVICE,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_X86_SET_MSR_FILTER,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_IRQCHIP,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_VCPU)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_VM)?], From d40d0676266967150acff999320e49872f807939 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 25 Feb 2026 21:11:50 +0100 Subject: [PATCH 137/178] arch: Disable UINTR state components User Interrupts (UINTR) is a modern Intel CPU feature that allows interrupts to be sent between userspace tasks without going through the OS Kernel. To the best of our knowledge this feature is currently not supported by QEMU at all, and we expect there to be rather few projects intended to run in the cloud utilizing this feature. Thus in order to make it possible to run CHV with CPU profiles inside a QEMU VM, we decide to disable UINTR for non-host CPU profiles for now. If/when UINTR gains more adoption in the future, we can then reconsider this policy and potentially introduce new CPU profiles supporting this feature. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 132 +++++++++++++++++++-- 1 file changed, 125 insertions(+), 7 deletions(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index d3d1e1fa24..82999eb4fb 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -39,7 +39,7 @@ use super::{ /// a few of the short names and descriptions to be more inline with what is written in the /// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every /// single [`ValueDefinition`] and manually appended those. -pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { CpuidDefinitions([ // ========================================================================================= // Basic CPUID Information @@ -2392,8 +2392,20 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { }, ValueDefinition { short: "xcr0_ia32_xss_bits", - description: "XCR0.IA32_XSS (bit 10 - 16) used for IA32_XSS", - bits_range: (10, 16), + description: "XCR0.IA32_XSS (bit 10 - 13) used for IA32_XSS", + bits_range: (10, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_ia32_xss_UINTR", + description: "XCR0.IA32_XSS (bit 14) used for UINTR in IA32_XSS", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_ia32_xss_bits_15_16", + description: "XCR0.IA32_XSS (bit 15 - 16) used for IA32_XSS", + bits_range: (15, 16), policy: ProfilePolicy::Inherit, }, // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. 
@@ -2568,7 +2580,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { short: "xss_uintr", description: "UINTR state, supported", bits_range: (14, 14), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xss_lbr", @@ -2719,12 +2731,118 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { policy: ProfilePolicy::Static(0), }]), ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 13), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 13), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 13), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // We decided to disable UINTR for CPU profiles, hence we zero out these sub-leaves + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-eax-uintr-zero", + description: "This 
leaf has been zeroed out because UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ebx-uintr-zero", + description: "This leaf has been zeroed out because UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ecx-uintr-zero", + description: "This leaf has been zeroed out because UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(14, 14), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-edx-uintr-zero", + description: "This leaf has been zeroed out because UINTR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), // NOTE: Sub-leaves 17 & 18 are AMX related and we will alter the adjustments corresponding to // the policy declared here at runtime for those values. 
( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 63), + sub_leaf: RangeInclusive::new(15, 63), register: CpuidReg::EAX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2737,7 +2855,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 63), + sub_leaf: RangeInclusive::new(15, 63), register: CpuidReg::EBX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2750,7 +2868,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<153> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 63), + sub_leaf: RangeInclusive::new(15, 63), register: CpuidReg::ECX, }, ValueDefinitions::new(&[ From bdfd8c10c9698a2eb53bcb384c00bf2fa89b3d18 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Thu, 26 Feb 2026 09:31:09 +0100 Subject: [PATCH 138/178] arch: Disable CET in CPUID for CPU profiles Control-flow enforcement technology (CET) is currently not supported by QEMU. In order to make it possible to run CHV with CPU profiles inside a QEMU VM we decide to also disable this feature for CPU profiles. We might want to revisit this decission and possibly include it in a "V2" for some of our CPU profiles. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 140 +++++++++++++++++++-- 1 file changed, 129 insertions(+), 11 deletions(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 82999eb4fb..49822c1250 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -39,7 +39,7 @@ use super::{ /// a few of the short names and descriptions to be more inline with what is written in the /// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every /// single [`ValueDefinition`] and manually appended those. 
-pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { CpuidDefinitions([ // ========================================================================================= // Basic CPUID Information @@ -1446,7 +1446,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { short: "cet_ss", description: "CET shadow stack features", bits_range: (7, 7), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "gfni", @@ -1668,7 +1668,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { short: "ibt", description: "CET indirect branch tracking", bits_range: (20, 20), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "amx_bf16", @@ -2005,7 +2005,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { short: "cet_sss", description: "CET supervisor shadow stacks safe to use", bits_range: (18, 18), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "avx10", @@ -2392,8 +2392,20 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { }, ValueDefinition { short: "xcr0_ia32_xss_bits", - description: "XCR0.IA32_XSS (bit 10 - 13) used for IA32_XSS", - bits_range: (10, 13), + description: "XCR0.IA32_XSS (bit 10) used for IA32_XSS", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_ia32_xss_cet", + description: "XCR0.IA32_XSS (bits 11 - 12) used for IA32_XSS", + bits_range: (11, 12), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_ia32_xss_bits", + description: "XCR0.IA32_XSS (bit 13) used for IA32_XSS", + bits_range: (13, 13), policy: ProfilePolicy::Inherit, }, ValueDefinition { @@ -2562,13 +2574,13 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { short: "xss_cet_u", description: "CET user state, supported", 
bits_range: (11, 11), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xss_cet_p", description: "CET supervisor state, supported", bits_range: (12, 12), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xss_hdc", @@ -2734,7 +2746,113 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 13), + sub_leaf: RangeInclusive::new(5, 10), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 10), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 10), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // We leave CET out of CPU profiles for the time being + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-eax-cet-zero", + description: "This 
leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-ebx-cet-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-ecx-cet-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(11, 12), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-11-12-edx-cet-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(13, 13), register: CpuidReg::EAX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2747,7 +2865,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 13), + sub_leaf: RangeInclusive::new(13, 13), register: CpuidReg::EBX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2760,7 +2878,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<160> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 13), + sub_leaf: RangeInclusive::new(13, 13), register: CpuidReg::ECX, }, ValueDefinitions::new(&[ From d4bdb7793aa01fff8824e26dafb737cc7abb52fb Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 3 Mar 2026 16:14:38 +0100 Subject: [PATCH 139/178] arch: 
Change CPU profile policy for MCA Machine Check Architecture (MCA) By setting the profile policy to Static(0) for the MCA bit we indicate to guests that the MCG_CAP MSR and other machine check related MSRS are not available. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 49822c1250..3ea13e6233 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -469,7 +469,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { short: "mca", description: "Machine Check Architecture", bits_range: (14, 14), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "cmov", From 48815aab7af6da4e7b50f0b69299aa06ede21015 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 25 Feb 2026 16:38:46 +0100 Subject: [PATCH 140/178] arch: Change CPUID profile policy for WAITPKG We change the CPUID policy for WAITPKG because we encountered problems with it when testing CPU profiles with MSRs. This is also off by default for CPU models in QEMU, but we may still potentially want to revisit this decision in the future. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 3ea13e6233..665058501e 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -1430,11 +1430,12 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { bits_range: (4, 4), policy: ProfilePolicy::Passthrough, }, + // TODO: Revisit this decision. 
Setting this to 0 for now in order to be compatible with QEMU ValueDefinition { short: "waitpkg", description: "WAITPKG instructions", bits_range: (5, 5), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "avx512_vbmi2", From 1dcb7031daf4b3fe03ccac0b4776096f9ca9e825 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 11 Mar 2026 16:07:02 +0100 Subject: [PATCH 141/178] arch: More Inherit for CPUID leaf 0x7.EDX MD_CLEAR (bit 10), STIBP (bit 27) and L1D_FLUSH (bit 28) all advertise certain processor features. The passthrough policy is usually not appropriate in that case, because one no longer has a guarantee that if the CPU profile can be applied on both the source and destination then the live migration CPUID compatibility checks must succeed. We fix this issue by instead utilizing the Inherit policy for these bits. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 665058501e..fe9ebb8430 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -1618,7 +1618,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { short: "md_clear", description: "VERW MD_CLEAR microcode support", bits_range: (10, 10), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "rtm_always_abort", @@ -1707,14 +1707,14 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { short: "intel_stibp", description: "Single thread indirect branch predictors", bits_range: (27, 27), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, // MSR related ValueDefinition { short: "flush_l1d", description: "FLUSH L1D cache: IA32_FLUSH_CMD MSR", bits_range: (28, 28), - policy: 
ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, // MSR related ValueDefinition { From d7becd7b0b445768c69dc97a410a617d323c510d Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 13 Mar 2026 14:35:59 +0100 Subject: [PATCH 142/178] arch: Disable PKU and OSPKE for non-host CPU profiles Protection keys are not supported for CPU profiles and thus disabled Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index fe9ebb8430..e66e75fdb5 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -1417,18 +1417,19 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { bits_range: (2, 2), policy: ProfilePolicy::Inherit, }, - // Also set by QEMU for CPU models from what we can tell + // TODO: This is however set by QEMU for CPU models from what we can tell? ValueDefinition { short: "pku", description: "Protection keys for user-space", bits_range: (3, 3), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, + // NOTE: This field is mutable in principle and can be changed by the OS (TODO: Under which circumstances?) ValueDefinition { short: "ospke", description: "OS protection keys enable", bits_range: (4, 4), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Static(0), }, // TODO: Revisit this decision. Setting this to 0 for now in order to be compatible with QEMU ValueDefinition { From ce5aee524c862aaa08d64a304841f85fb14334d7 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Mon, 26 Jan 2026 10:31:27 +0100 Subject: [PATCH 143/178] arch: Change CPU profile generation tool In preparation for making CPU profiles MSR aware we prepare for having two pieces of data associated with a CPU profile: CPUID and MSR adjustments. 
We thus rename the pre-existing CpuProfileData struct to CpuIdProfileData and adapt the CPU profile generation tool accordingly. We also make the CPU profile generation tool write directly to file and automatically introduce the required license file as well. This makes the profile generation process more convenient. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/bin/generate-cpu-profile.rs | 10 +-- arch/src/x86_64/cpu_profile.rs | 25 ++---- arch/src/x86_64/cpu_profile_generation.rs | 94 ++++++++++++++++++++--- arch/src/x86_64/mod.rs | 2 +- 4 files changed, 93 insertions(+), 38 deletions(-) diff --git a/arch/src/bin/generate-cpu-profile.rs b/arch/src/bin/generate-cpu-profile.rs index ee367906f6..4710fd277e 100644 --- a/arch/src/bin/generate-cpu-profile.rs +++ b/arch/src/bin/generate-cpu-profile.rs @@ -7,7 +7,6 @@ feature = "cpu_profile_generation", feature = "kvm" ))] -use std::io::BufWriter; use anyhow::Context; use clap::{Arg, Command}; @@ -27,12 +26,5 @@ fn main() -> anyhow::Result<()> { let profile_name = cmd_arg.get_one::("name").unwrap(); let hypervisor = hypervisor::new().context("Could not obtain hypervisor")?; - // TODO: Consider letting the user provide a file path as a target instead of writing to stdout. - // The way it is now should be sufficient for a PoC however. - let writer = BufWriter::new(std::io::stdout().lock()); - arch::x86_64::cpu_profile_generation::generate_profile_data( - writer, - hypervisor.as_ref(), - profile_name, - ) + arch::x86_64::cpu_profile_generation::generate_profile_data(hypervisor.as_ref(), profile_name) } diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs index 70b60b0caa..d2084ee8bd 100644 --- a/arch/src/x86_64/cpu_profile.rs +++ b/arch/src/x86_64/cpu_profile.rs @@ -30,7 +30,7 @@ pub enum CpuProfile { } impl CpuProfile { - /// Loads pre-generated data associated with a CPU profile. + /// Loads pre-generated CPUID data associated with a CPU profile. 
/// /// If the `amx` flag is false then the AMX tile state components will be /// zeroed out from the associated profile data. This is necessary because @@ -39,8 +39,8 @@ impl CpuProfile { // // We can only generate CPU profiles for the KVM hypervisor for the time being. #[cfg(feature = "kvm")] - pub(in crate::x86_64) fn data(&self, amx: bool) -> Option { - let mut data: CpuProfileData = match self { + pub(in crate::x86_64) fn cpuid_data(&self, amx: bool) -> Option { + let mut data: CpuIdProfileData = match self { Self::Host => None, Self::Skylake => Some( serde_json::from_slice(include_bytes!("cpu_profiles/skylake.json")) @@ -84,7 +84,7 @@ impl CpuProfile { } #[cfg(not(feature = "kvm"))] - pub(in crate::x86_64) fn data(&self, _amx: bool) -> Option { + pub(in crate::x86_64) fn cpuid_data(&self, _amx: bool) -> Option { if matches!(*self, Self::Host) { return None; } @@ -94,7 +94,7 @@ impl CpuProfile { } } -/// Every [`CpuProfile`] different from `Host` has associated [`CpuProfileData`]. +/// Every [`CpuProfile`] different from `Host` has associated [`CpuIdProfileData`]. /// /// New constructors of this struct may only be generated through the CHV CLI (when built from source with /// the `cpu-profile-generation` feature) which other hosts may then attempt to load in order to @@ -102,7 +102,7 @@ impl CpuProfile { /// CPU profile. #[derive(Debug, Clone, Serialize, Deserialize)] #[allow(dead_code)] -pub struct CpuProfileData { +pub struct CpuIdProfileData { /// The hypervisor used when generating this CPU profile. pub(in crate::x86_64) hypervisor: HypervisorType, /// The vendor of the CPU belonging to the host that generated this CPU profile. @@ -111,19 +111,6 @@ pub struct CpuProfileData { pub(in crate::x86_64) adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)>, } -/* TODO: The [`CpuProfile`] struct will likely need a few more iterations. 
The following -section should explain why: - -# MSR restrictions - -CPU profiles also need to restrict which MSRs may be manipulated by the guest as various physical CPUs -can have differing supported MSRs. - -The CPU profile will thus necessarily need to contain some data related to MSR restrictions. That will -be taken care of in a follow up MR. - -*/ - /// Used for adjusting an entire cpuid output register (EAX, EBX, ECX or EDX) #[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] pub(super) struct CpuidOutputRegisterAdjustments { diff --git a/arch/src/x86_64/cpu_profile_generation.rs b/arch/src/x86_64/cpu_profile_generation.rs index 8a74a2bca7..8ef3fac900 100644 --- a/arch/src/x86_64/cpu_profile_generation.rs +++ b/arch/src/x86_64/cpu_profile_generation.rs @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 // +use std::fs::File; use std::io::Write; use std::ops::RangeInclusive; @@ -9,7 +10,7 @@ use anyhow::{Context, anyhow}; use hypervisor::arch::x86::CpuIdEntry; use hypervisor::{CpuVendor, Hypervisor, HypervisorError, HypervisorType}; -use crate::x86_64::cpu_profile::CpuProfileData; +use crate::x86_64::cpu_profile::CpuIdProfileData; #[cfg(feature = "kvm")] use crate::x86_64::cpuid_definitions::CpuidDefinitions; use crate::x86_64::cpuid_definitions::intel::INTEL_CPUID_DEFINITIONS; @@ -23,7 +24,6 @@ use crate::x86_64::{CpuidOutputRegisterAdjustments, CpuidReg}; // NOTE: The MVP only works with KVM as the hypervisor and Intel CPUs. 
#[cfg(feature = "kvm")] pub fn generate_profile_data( - mut writer: impl Write, hypervisor: &dyn Hypervisor, profile_name: &str, ) -> anyhow::Result<()> { @@ -45,16 +45,77 @@ pub fn generate_profile_data( let cpuid = overwrite_brand_string(cpuid, brand_string_bytes); let supported_cpuid_sorted = sort_entries(cpuid); + let Files { + cpuid_data_file, + cpuid_data_license_file, + } = create_files(profile_name)?; + generate_cpu_profile_data_with( hypervisor_type, cpu_vendor, &supported_cpuid_sorted, &INTEL_CPUID_DEFINITIONS, &KVM_CPUID_DEFINITIONS, - &mut writer, + cpuid_data_file, + cpuid_data_license_file, ) } +struct Files { + cpuid_data_file: File, + cpuid_data_license_file: File, +} +/// Create empty files with names derived from the name given to the CPU profile. +/// The name will be lowercase and spaces are replaced with "-". +fn create_files(profile_name: &str) -> anyhow::Result { + let profile_file_name = { + let mut name = String::new(); + for part in profile_name.split_whitespace().map(|s| s.to_lowercase()) { + if !name.is_empty() { + name.push('-'); + } + name.push_str(&part); + } + name + }; + + let cpuid_profile_file_name = { + let mut path = std::env::current_dir().context( + "CPU profile generation failed: Unable to get the current working directory", + )?; + path.push(format!( + "arch/src/x86_64/cpu_profiles/{profile_file_name}.cpuid.json" + )); + path + }; + + let cpuid_data_file = File::create(cpuid_profile_file_name.clone()).with_context(|| { + format!( + "CPU profile generation failed: Could not create file:={}", + cpuid_profile_file_name.to_string_lossy() + ) + })?; + + let cpuid_data_license_file_path = { + let mut path = cpuid_profile_file_name.clone(); + path.as_mut_os_string().push(".license"); + path + }; + + let cpuid_data_license_file = + File::create(cpuid_data_license_file_path.clone()).with_context(|| { + format!( + "CPU profile generation failed: Could not create file:={}", + cpuid_data_license_file_path.to_string_lossy() + ) + })?; + 
+ Ok(Files { + cpuid_data_file, + cpuid_data_license_file, + }) +} + /// Prepare the bytes which the brand string should consist of fn cpu_brand_string_bytes(cpu_vendor: CpuVendor, profile_name: &str) -> anyhow::Result<[u8; 48]> { let cpu_vendor_str: String = serde_json::to_string(&cpu_vendor) @@ -90,7 +151,8 @@ fn generate_cpu_profile_data_with( supported_cpuid_sorted: &[CpuIdEntry], processor_cpuid_definitions: &CpuidDefinitions, hypervisor_cpuid_definitions: &CpuidDefinitions, - mut writer: &mut impl Write, + mut cpuid_data_file: impl Write, + mut cpuid_license_file: impl Write, ) -> anyhow::Result<()> { let mut adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)> = Vec::new(); @@ -146,17 +208,31 @@ fn generate_cpu_profile_data_with( } } - let profile_data = CpuProfileData { + let cpuid_profile_data = CpuIdProfileData { hypervisor: hypervisor_type, cpu_vendor, adjustments, }; - serde_json::to_writer_pretty(&mut writer, &profile_data) - .context("failed to serialize the generated profile data to the given writer")?; - writer + serde_json::to_writer_pretty(&mut cpuid_data_file, &cpuid_profile_data) + .context("Cpu profile generation failed: Could not serialize the generated cpuid profile data to the given writer")?; + cpuid_data_file + .flush() + .context("CPU profile generation failed: Unable to flush cpuid profile data")?; + let license_text = { + r#"SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 +"# + }; + cpuid_license_file + .write_all(license_text.as_bytes()) + .context( + "CPU profile generation failed: Unable to write to cpuid profile data license file", + )?; + cpuid_license_file .flush() - .context("CPU profile generation failed: Unable to flush cpu profile data") + .context("CPU profile generation failed: Unable to flush cpuid profile data license file") } /// Get as many of the supported CPUID entries from the hypervisor as possible. 
diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 709ee7e57c..5053d34835 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -738,7 +738,7 @@ pub fn generate_common_cpuid( let (host_adjusted_to_profile, profile_cpu_vendor) = { config .profile - .data(config.amx) + .cpuid_data(config.amx) .map_or((Ok(None), None), |profile_data| { ( CpuidOutputRegisterAdjustments::adjust_cpuid_entries( From 4bd67c500f6cae3f15bf35b5c8a14180817b1e4d Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 17 Feb 2026 14:55:32 +0100 Subject: [PATCH 144/178] arch: Lookup methods on CPUID definitions Introduce convenience methods for looking up CPUID value definitions. We will later use these methods to assert certain policies at compile time in order to stay consistent with MSR policies we introduce. Due to the current limitations of const generics we unfortunately need to duplicate a little bit code. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 14 ++++++ arch/src/x86_64/cpuid_definitions/kvm.rs | 14 ++++++ arch/src/x86_64/cpuid_definitions/mod.rs | 52 +++++++++++++++++++++- 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index e66e75fdb5..f57785ff27 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -5015,3 +5015,17 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { ), ]) }; + +/// Compile time check that the given `BIT` in the CPUID output register specified by `params` is not +/// declared to be overwritten by `0` for non-host CPU profiles. 
+pub const fn assert_not_denied_cpuid_feature(params: &Parameters) { + if let Some(defs) = INTEL_CPUID_DEFINITIONS.get(params) + && let Some(def) = defs.find_bit::() + { + assert!(!matches!(def.policy, ProfilePolicy::Static(0))); + } else { + panic!("Unable to lookup CPUID value definition with the given parameters and feature bit"); + } +} + +// TODO: Also include assert_denied_cpuid_feature diff --git a/arch/src/x86_64/cpuid_definitions/kvm.rs b/arch/src/x86_64/cpuid_definitions/kvm.rs index 3282e04222..9523f4ffab 100644 --- a/arch/src/x86_64/cpuid_definitions/kvm.rs +++ b/arch/src/x86_64/cpuid_definitions/kvm.rs @@ -207,3 +207,17 @@ pub const KVM_CPUID_DEFINITIONS: CpuidDefinitions<6> = const { ), ]) }; + +/// Compile time check that the given `BIT` in the CPUID output register specified by `params` is not +/// declared to be overwritten by `0` for non-host CPU profiles. +pub const fn assert_not_denied_cpuid_feature(params: &Parameters) { + if let Some(defs) = KVM_CPUID_DEFINITIONS.get(params) + && let Some(def) = defs.find_bit::() + { + assert!(!matches!(def.policy, ProfilePolicy::Static(0))); + } else { + panic!("Unable to lookup CPUID value definition with the given parameters and feature bit"); + } +} + +// TODO: Also include assert_denied_cpuid_feature diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs index c959061f22..f45dc4a9e4 100644 --- a/arch/src/x86_64/cpuid_definitions/mod.rs +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -110,8 +110,6 @@ pub struct ValueDefinition { } /// Describes values within a register populated by the CPUID instruction with specific parameters. -/// -/// NOTE: The only way to interact with this value (beyond this crate) is via the const [`Self::as_slice()`](Self::as_slice) method. pub struct ValueDefinitions(&'static [ValueDefinition]); impl ValueDefinitions { /// Constructor permitting at most 32 entries. 
@@ -125,6 +123,22 @@ impl ValueDefinitions { pub const fn as_slice(&self) -> &'static [ValueDefinition] { self.0 } + + /// Lookup the [`ValueDefinition`] whose bits range contains the given `BIT`. + pub const fn find_bit(&self) -> Option<&ValueDefinition> { + let mut idx = 0; + let len = self.0.len(); + while idx < len { + let def = &self.0[idx]; + let start = def.bits_range.0; + let end = def.bits_range.1; + if (start <= BIT) & (end >= BIT) { + return Some(def); + } + idx += 1; + } + None + } } /// Describes multiple CPUID outputs. @@ -139,6 +153,40 @@ impl CpuidDefinitions { pub const fn as_slice(&self) -> &[(Parameters, ValueDefinitions); NUM_PARAMETERS] { &self.0 } + + /// Lookup the [`ValueDefinitions`] corresponding to the given `parameters`. + pub const fn get(&self, parameters: &Parameters) -> Option<&ValueDefinitions> { + let mut idx = 0; + let len = self.0.len(); + let leaf = parameters.leaf; + let sub_leaf_start = *parameters.sub_leaf.start(); + let sub_leaf_end = *parameters.sub_leaf.end(); + // Note that as of today const Rust is quite a bit more verbose than normal Rust. + // This is why the following implementation doesn't look so idiomatic.
+ let is_eax = matches!(parameters.register, CpuidReg::EAX); + let is_ebx = matches!(parameters.register, CpuidReg::EBX); + let is_ecx = matches!(parameters.register, CpuidReg::ECX); + let is_edx = matches!(parameters.register, CpuidReg::EDX); + while idx < len { + let (param, defs) = &self.0[idx]; + let matching_leaf = leaf == param.leaf; + let matching_sub_leaf = (sub_leaf_start >= *param.sub_leaf.start()) + & (sub_leaf_end <= *param.sub_leaf.end()); + let matching_reg = { + match param.register { + CpuidReg::EAX => is_eax, + CpuidReg::EBX => is_ebx, + CpuidReg::ECX => is_ecx, + CpuidReg::EDX => is_edx, + } + }; + if matching_leaf & matching_sub_leaf & matching_reg { + return Some(defs); + } + idx += 1; + } + None + } } #[cfg(test)] From 09df0a5607efc288f75a7f88f7ad15e333170e19 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 7 Jan 2026 09:32:09 +0100 Subject: [PATCH 145/178] arch: Add data structures for MSR definitions In order to generate CPU profiles we also need definitions and policies for MSR-based features, as some CPU features are exposed through MSRs rather than CPUID. This commit introduces the MSR analogues of the data structures we previously introduced for CPUID definitions. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/mod.rs | 1 + arch/src/x86_64/msr_definitions/mod.rs | 96 ++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 arch/src/x86_64/msr_definitions/mod.rs diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 5053d34835..762e9d39d3 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -12,6 +12,7 @@ pub mod cpu_profile_generation; pub mod cpuid_definitions; pub mod interrupts; pub mod layout; +pub mod msr_definitions; pub mod regs; #[cfg(feature = "tdx")] diff --git a/arch/src/x86_64/msr_definitions/mod.rs b/arch/src/x86_64/msr_definitions/mod.rs new file mode 100644 index 0000000000..577821a704 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/mod.rs @@ -0,0 +1,96 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use serde::{Deserialize, Serialize}; + +use crate::{deserialize_u32_hex, serialize_u32_hex}; +/// The register address of an MSR +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct RegisterAddress( + #[serde( + serialize_with = "serialize_u32_hex", + deserialize_with = "deserialize_u32_hex" + )] + pub u32, +); + +/// Describes a policy for how the corresponding MSR data should be considered when building +/// a CPU profile. +/// +/// This is the MSR analogue of [cpuid_definitions::ProfilePolicy](crate::x86_64::cpuid_definitions::ProfilePolicy) +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum ProfilePolicy { + /// Store the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will be set into the modified + /// MSR(s) + Inherit, + /// Ignore the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will then instead get + /// extracted from the host. 
+ /// + /// This variant is typically set for data that has no effect on migration compatibility, + /// but there may be some exceptions such as data which is necessary to run the VM at all, + /// but must coincide with whatever is on the host. + Passthrough, + /// Set the following hardcoded value in the CPU profile. + /// + /// This variant is typically used for features/values that don't work well with live migration (even when using the exact same physical CPU model). + Static(u64), + /// Deny read and write accesses to this MSR. + /// + /// This can only be applied to an MSR in its entirety and not to individual bit ranges + Deny, +} + +/// A description of a range of bits in an MSR. +/// +/// This is the MSR analogue of [cpuid_definitions::ValueDefinition](crate::x86_64::cpuid_definitions::ValueDefinition) +#[derive(Clone, Copy, Debug)] +pub struct ValueDefinition { + /// A short name for the value. + pub short: &'static str, + /// A description of the value. + pub description: &'static str, + /// The range of bits in the MSR corresponding to this feature or value. + /// + /// This is not a `RangeInclusive` because that type does unfortunately not implement `Copy`. + pub bits_range: (u8, u8), + /// The policy corresponding to this value when building CPU profiles. + pub policy: ProfilePolicy, +} + +/// Describes values within an MSR. +/// +/// NOTE: The only way to interact with this value (beyond this crate) is via the const [`Self::as_slice()`](Self::as_slice) method. +/// +/// This is the MSR analogue of [cpuid_definitions::ValueDefinitions](crate::x86_64::cpuid_definitions::ValueDefinitions) +#[derive(Clone, Copy, Debug)] +pub struct ValueDefinitions(&'static [ValueDefinition]); +impl ValueDefinitions { + /// Constructor permitting at most 64 entries. 
+ const fn new(msr_descriptions: &'static [ValueDefinition]) -> Self { + // Note that this function is only called within this module, at compile time, hence it is fine to have some + // additional sanity checks such as the following assert. + assert!(msr_descriptions.len() <= 64); + Self(msr_descriptions) + } + /// Converts this into a slice representation. This is the only way to read values of this type. + pub const fn as_slice(&self) -> &'static [ValueDefinition] { + self.0 + } +} + +/// Describes multiple MSRs. +/// +/// Each wrapped [`ValueDefinitions`] corresponds to the given [`RegisterAddress`] in the same tuple. +pub struct MsrDefinitions([(RegisterAddress, ValueDefinitions); NUM]); + +impl MsrDefinitions { + pub const fn as_slice(&self) -> &[(RegisterAddress, ValueDefinitions); NUM] { + &self.0 + } +} From d0a764a2d4f058815458cb13e643740a18d34887 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 7 Jan 2026 10:24:09 +0100 Subject: [PATCH 146/178] arch: INTEL MSR-based feature definitions We introduce MSR-based feature definitions for Intel CPUs that will be utilized by the upcoming CPU profile generation tool. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/msr_definitions/intel/mod.rs | 7 + .../intel/msr_based_features.rs | 3686 +++++++++++++++++ arch/src/x86_64/msr_definitions/mod.rs | 1 + 3 files changed, 3694 insertions(+) create mode 100644 arch/src/x86_64/msr_definitions/intel/mod.rs create mode 100644 arch/src/x86_64/msr_definitions/intel/msr_based_features.rs diff --git a/arch/src/x86_64/msr_definitions/intel/mod.rs b/arch/src/x86_64/msr_definitions/intel/mod.rs new file mode 100644 index 0000000000..84e2434535 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/mod.rs @@ -0,0 +1,7 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +mod msr_based_features; + +pub use msr_based_features::INTEL_MSR_FEATURE_DEFINITIONS; diff --git a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs new file mode 100644 index 0000000000..6fae341539 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -0,0 +1,3686 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use crate::x86_64::msr_definitions::{ + MsrDefinitions, ProfilePolicy, RegisterAddress, ValueDefinition, ValueDefinitions, +}; + +impl RegisterAddress { + pub const IA32_BIOS_SIGN_ID: Self = Self(0x8b); + pub const IA32_ARCH_CAPABILITIES: Self = Self(0x10a); + pub const IA32_PERF_CAPABILITIES: Self = Self(0x345); + pub const IA32_VMX_BASIC: Self = Self(0x480); + pub const IA32_VMX_PINBASED_CTLS: Self = Self(0x481); + pub const IA32_VMX_PROCBASED_CTLS: Self = Self(0x482); + pub const IA32_VMX_EXIT_CTLS: Self = Self(0x483); + pub const IA32_VMX_ENTRY_CTLS: Self = Self(0x484); + pub const IA32_VMX_MISC: Self = Self(0x485); + pub const IA32_VMX_CR0_FIXED0: Self = Self(0x486); + pub const IA32_VMX_CR0_FIXED1: Self = Self(0x487); + pub const IA32_VMX_CR4_FIXED0: Self = Self(0x488); + pub 
const IA32_VMX_CR4_FIXED1: Self = Self(0x489); + pub const IA32_VMX_VMCS_ENUM: Self = Self(0x48a); + pub const IA32_VMX_PROCBASED_CTLS2: Self = Self(0x48b); + pub const IA32_VMX_EPT_VPID_CAP: Self = Self(0x48c); + pub const IA32_VMX_TRUE_PINBASED_CTLS: Self = Self(0x48d); + pub const IA32_VMX_TRUE_PROCBASED_CTLS: Self = Self(0x48e); + pub const IA32_VMX_TRUE_EXIT_CTLS: Self = Self(0x48f); + pub const IA32_VMX_TRUE_ENTRY_CTLS: Self = Self(0x490); + pub const IA32_VMX_VMFUNC: Self = Self(0x491); + pub const IA32_VMX_PROCBASED_CTLS3: Self = Self(0x492); + pub const IA32_VMX_EXIT_CTLS2: Self = Self(0x493); + + // =============== Non-architectural MSRs ======== + + // KVM + Intel Skylake reports this as an MSR-based feature + pub const MSR_PLATFORM_INFO: Self = Self(0xce); +} + +/// This table contains descriptions of all the MSRs whose register addresses can be contained in +/// the list returned by `KVM_GET_MSR_FEATURE_INDEX_LIST` when executed on an Intel CPU. +/// +/// The values described here are based on the Intel 64 and IA-32 Architectures Software Developer's +/// Manual Combined Volumes: 1,2A, 2B, 2C, 2D, 3A, 3B, 3C, 3D, and 4 from October 2025. +/// +/// We try to use the same short descriptions as Intel, but in the cases where we could not find an +/// official name for the bit field(s) we invented our own based on the description. +/// +/// The descriptions written here are based on those found in the aforementioned manual, but often less +/// detailed. We recommend consulting the official Intel documentation whenever more information +/// is required. +/// +/// +/// ## Future-proofing +/// +/// Future processors and/or KVM versions may of course introduce more MSR-based features than those listed here at this time of writing. +/// In order to make sure that this is taken into account, the CPU profile generation tool will error when this is detected. 
The person +/// attempting to create a new CPU profile should then update this table accordingly and try again. +pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { + MsrDefinitions([ + ( + RegisterAddress::IA32_BIOS_SIGN_ID, + ValueDefinitions::new(&[ + ValueDefinition { + short: "PATCH_SIGN_ID", + description: "Any non-zero value is the microcode update signature patch signature ID", + bits_range: (32, 63), + policy: ProfilePolicy::Passthrough, + } + ]) + ), + + ( + RegisterAddress::IA32_ARCH_CAPABILITIES, + ValueDefinitions::new(&[ + ValueDefinition { + short: "RDCL_NO", + description: "The processor is not susceptible to Rogue Data Cache Load (RDCL)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "IBRS_ALL", + description: "The processor supports enhanced IBRS", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "RSBA", + description: "The processor supports RSB Alternate", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "SKIP_L1DFL_VMENTRY", + description: "A value of 1 indicates the hypervisor need not flush the L1D on VM entry", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "SSB_NO", + description: "Processor is not susceptible to Speculation Store Bypass", + bits_range: (4, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "MDS_NO", + description: "Processor is not susceptible to Microarchitectural Data Sampling (MDS)", + bits_range: (5, 5), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "IF_PSCHANGE_MC_NO", + description: "The processor is not susceptible to a machine check error due to modifying the size of a code page without TLB invalidation", + bits_range: (6, 6), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "TSX_CTRL", + description: "If 1, indicates presence of IA32_TSX_CTRL MSR", + 
bits_range: (7, 7), + // TSX is riddled with CVEs + // TODO: Check that this is indeed the right policy + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "TAA_NO", + description: "If 1, processor is not affected by TAA", + bits_range: (8, 8), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "MCU_CONTROL", + description: "If 1, the processor supports the IA32_MCU_CONTROL MSR", + bits_range: (9, 9), + // TODO: Check what the IA32_MCU_CONTROL MSR is + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "MISC_PACKAGE_CTLS", + description: "The processor supports IA32_MISC_PACKAGE_CTLS MSR", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ENERGY_FILTERING_CTL", + description: "The processor supports setting and reading the IA32_MISC_PACKAGE_CTLS[0] (ENERGY_FILTERING_ENABLE) bit", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "DOITM:", + description: "If 1, the processor supports Data Operand Independent Timing Mode", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "SBDR_SSDP_NO", + description: "The processor is not affected by either the Shared Buffers Data Read (SBDR) vulnerability or the Sideband Stale Data Propagator (SSDP)", + bits_range: (13, 13), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "FBSDP_NO", + description: "The processor is not affected by the Fill Buffer Stale Data Propagator (DBSDP)", + bits_range: (14, 14), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "PSDP_NO", + description: "The processor is not affected by vulnerabilities involving the Primary Stale Data Propagator (PSDP)", + bits_range: (15, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "MCU_ENUMERATION", + description: "If 1, the processor supportss the IA32_MCU_ENUMERATION and IA32_MCU_STATUS MSRs", + bits_range: 
(16, 16), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "FB_CLEAR", + description: "If 1, the processor supports overwrite of fill buffer values as part of MD_CLEAR operations with the VERW instruction. + On these processors L1D_FLUSH does not overwrite fill buffer values", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "FB_CLEAR_CTRL", + description: "If 1, the processor supports the IA32_MCU_OPT_CTRL MSR and allows software to set bit 3 of that MSR (FB_CLEAR_DIS)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "RRSBA", + description: "A value of 1 indicates the processor may have the RRSBA alternate prediction behavior, if not disabled by RRSBA_DIS_U or RRSBA_DIS_S", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "BHI_NO", + description: "A value of 1 indicates BHI_NO branch prediction behavior, regardless of the value of IA32_SPEC_CTRL[BHI_DIS_S] MSR bit", + bits_range: (20, 20), + policy: ProfilePolicy::Passthrough, + }, + + ValueDefinition { + short: "XAPIC_DISABLE_STATUS", + description: "Enumerates that the IA32_XAPIC_DISABLE_STATUS MSR exists, and that bit 0 specifies whether the legacy xAPIC is disabled and APIC state is locked to x2APIC", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "MCU_EXTENDED_SERVICE", + description: "If 1, the processor supports MCU extended servicing - IA32_MCU_EXT_SERVICE MSR", + bits_range: (22, 22), + // TODO: Check + policy: ProfilePolicy::Static(0), + }, + + ValueDefinition { + short: "OVERCLOCKING_STATUS", + description: "If set, the IA32_OVERCLOCKING_STATUS MSR exists", + bits_range: (23, 23), + // TODO: Check + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "PBRSB_NO", + description: "If 1, the processor is not affected by issues related to Post-Barrier Return Stack Buffer Predictions", + 
bits_range: (24, 24), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "GDS_CTRL", + description: "If 1, the processor supports the GDS_MITG_DIS and GDS_MITG_LOCK bits of the IA32_MCU_OPT_CTRL MSR", + bits_range: (25, 25), + // TODO: Check + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "GDS_NO", + description: "If 1, the processor is not affected by Gather Data Sampling", + bits_range: (26, 26), + policy: ProfilePolicy::Passthrough, + }, + + ValueDefinition { + short: "RFDS_NO", + description: "If 1, processor is not affected by Register File Data Sampling", + bits_range: (27, 27), + policy: ProfilePolicy::Passthrough, + }, + + ValueDefinition { + short: "RFDS_CLEAR", + description: "If 1, when VERW is executed the processor will clear stale data from register files affected by Register File Data Sampling", + bits_range: (28, 28), + policy: ProfilePolicy::Passthrough, + }, + + ValueDefinition { + short: "IGN_UMONITOR_SUPPORT", + description: "If 0, IA32_MCU_OPT_CTRL bit 6 (IGN_UMONITOR) is not supported. If 1, it indicates support of IA32_MCU_OPT_CTRL bit 6 (IGN_UMONITOR)", + bits_range: (29, 29), + // TODO: Check + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "MON_UMON_MITG_SUPPORT", + description: "If 1, indicates support for IA32_MCU_OPT_CTRL bit 7 (MON_UMON_MITG), otherwise it is not supported", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + + + ValueDefinition { + short: "PBOPT_SUPPORT", + description: "If 1, IA32_PBOPT_CTRL bit 0 (Prediction Barrier Option (PBOPT)) is supported, otherwise it is not", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "ITS_NO", + description: "If 0, the hypervisor indicates that the system is not affected by indirect Target Selection. 
If 1, then the hypervisor + indicates that the system may be affected by indirect Target Selection", + bits_range: (62, 62), + policy: ProfilePolicy::Passthrough, + + }, + + ]), + ), + + ( + RegisterAddress::IA32_PERF_CAPABILITIES, + ValueDefinitions::new(&[ + ValueDefinition { + short: "IA32_PERF_CAPABILITIES", + description: "Read Only MSR that enumerates the existence of performance monitoring features", + bits_range: (0, 63), + // This MSR is only valid if CPUID 0x1.ECX[15] is set, but that bit is always zeroed out for CPU profiles different from host + policy: ProfilePolicy::Deny + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_BASIC, + ValueDefinitions::new(&[ + ValueDefinition { + short: "VMCS_REV_ID", + description: "31-bit VMCS revision identifier. Processors that use the same VMCS revision identifier + use the same size for VMCS regions", + bits_range: (0,31), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "REGION_SIZE", + description: "Number of bytes that software should allocate for the VMXON region and any VMCS region. It is a value greater than + 0 and at most 4096", + bits_range: (32, 44), + policy: ProfilePolicy::Inherit, + }, + + ValueDefinition { + short: "DUAL_MON", + description: " If 1, the logical processor supports the dual-monitor treatment of system-management + interrupts and system-management mode. See Section 33.15 for details of this treatment", + bits_range: (49, 49), + // TODO: Should we have Static(0)? 
here (I think that might be equivalent to what QEMU does) + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "MEM_TYPE", + description: "The memory type that should be used for the VMCS, for data structures referenced by pointers + in the VMCS (I/O bitmaps, virtual-APIC page, MSR areas for VMX transitions), and for the MSEG header", + bits_range: (50, 53), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "VM_EXIT_INFO_INS_OUTS", + description: " If 1, the processor reports information in the VM-exit instruction-information field on VM exits + due to execution of the INS and OUTS instructions. + ", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "VMX_CTRLS_DEFAULT_MUT", + description: "Any VMX controls that default to 1 may be cleared to 0", + bits_range: (55,55), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "VM_ENTRY_HARDWARE_EXCEPTIONS", + description: "If 1, then software can use VM entry to deliver a hardware exception", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_PINBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short:"ALLOWED_ZERO_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_1_2", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (1, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: 
(4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_VMX_PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + + + ValueDefinition { + short: "ALLOWED_ZERO", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (8, 31), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short:"ALLOWED_ONE_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_1_2", + description: "VM entry allows control X to be 1 if bit X in this MSR is 1", + bits_range: (33, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4", + description: "VM entry allows control X to be 1 if bit X in this MSR is 1", + bits_range: (36, 36), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (37, 
37), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_ACTIVATE_VMX__PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (38, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_PROCBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TSC_OFFSETTING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4_6", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (4, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HLT_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_8", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MWAIT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDTSC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_STORE_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_18", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TPR_SHADOW", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MOV_DR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_TRAP_FLAG", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit 32 + X of this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TSC_OFFSETTING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4_6", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (36, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HLT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MWAIT_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDTSC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_18", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_CR8_LOAD_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TPR_SHADOW", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MOV_DR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26", + description: "Control X is allowed to be 1 if bit X of this MSR is 1", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_TRAP_FLAG", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + + ]) + ), + + ( + RegisterAddress::IA32_VMX_EXIT_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_10_11", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (10, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACKNOWLEDGE_INTERRUPT_O_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: 
"ALLOWED_ZERO_16_17", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (16, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (25, 25), + policy: 
ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + // TODO: Also determines whether SSP is loaded on VM exit (do we need that?) + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit 32 + X in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description: "Control X is
allowed to be 1 if bit 32 + X in this MSR is 1", + bits_range: (35, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_10_11", + description: "Control X is allowed to be 1 if bit 32 + X in this MSR is 1", + bits_range: (42, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description: "Control X is allowed to be 1 if bit 32 + X in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACKNOWLEDGE_INTERRUPT_O_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_16_17", + description: "Control X is allowed to be 1 if bit 32 + X in this MSR is 1", + bits_range: (48, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit
Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + // TODO: Also determines whether SSP is loaded on VM exit (do we need that?) 
+ ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + ( + RegisterAddress::IA32_VMX_ENTRY_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_IA_32E_MODE_GUES", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_DEACTIVATE_DUAL__MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_12", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + // TODO: Also determines whether SSP is loaded on VM entry (do we need that?) + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_23_24", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (23, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17.
(Definitions of VM-Entry Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26_31", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (26, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (35, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_IA_32E_MODE_GUES", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_DEACTIVATE_DUAL__MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_12", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (45, 45), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (46, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_23_24", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (55, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26_31", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (58, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_MISC, + ValueDefinitions::new(&[ + ValueDefinition { + short: "VMX_PREEMPTION_TSC_REL", + description: "specifies the relationship between the rate of the VMX-preemption timer and that of the timestamp counter (TSC)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough + }, + ValueDefinition { + short: "IA32_EFER.LMA_STORE", + description: "If 1, then VM exits store the value of IA32_EFER.LMA into the IA32-e mode guest VM-entry control", + bits_range: (5,5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "HLT_STATE", + description: "Activity state 1 (HLT) is supported", + bits_range: (6,6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "SHUTDOWN_STATE", + description: "Activity state 2 (shutdown) is supported", + bits_range: (7,7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "WAIT_FOR_SIPI__STATE", + description: "Activity state 3 (wait-for-SIPI) is supported", + bits_range: (8,8), + policy: ProfilePolicy::Static(0) + }, + 
ValueDefinition { + short: "VMX_INTEL_PT", + description: "If 1 then Intel Processor Trace can be used in VMX operation", + bits_range: (14,14), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "RDMSR_SMM", + description: "If 1 then the RDMSR instruction can be used in system management mode (SMM) to read the IA32_SMBASE MSR", + bits_range: (15,15), + // TODO: Is this a reasonable policy? + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "VMX_NUM_CR3", + description: "The number of CR3-target values supported by the processor", + bits_range: (16,24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "MAX_MSR_STORE_LISTS", + description: "If N then 512*(N +1) is the recommended maximum number of MSRs to be included each of the VM-exit MSR-store list, VM-exit-MSR-load-list, VM-entry MSR-load list", + bits_range: (25, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "SMM_MONITOR_CTL_BIT2", + description: "If set then bit 2 of the IA32_SMM_MONITOR_CTL can be set to 1", + // TODO: Check policy. Perhaps this should rather be Static(0) ? + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "VM_WRITE_EXIT_FIELDS", + description: "If 1 then software can use VMWRITE to write to any supported field in the VMCS", + bits_range: (29,29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "VM_ENTRY_INJECTION", + description: "If 1 then VM entry permits injection of the following: software interrupt, software exception, or privileged software exception with an instruction length of 0", + bits_range: (30,30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "MSEG_REV_ID", + description: "MSEG revision identifier used by the processor", + bits_range: (32,63), + // TODO: Should this be Passthrough? 
+ policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_CR0_FIXED0, + // NOTE 1: If any entry in IA32_VMX_CR0_FIXED1 has ProfilePolicy::Static(0) then the corresponding entry here must also have ProfilePolicy::Static(0) + // + // NOTE 2: We use the inherit policy for reserved fields. + ValueDefinitions::new(&[ + ValueDefinition { + short: "CR0.PE", + description: "If 0, then bit 0 (Protection Enable) of CR0 is allowed to be 0. bit 0 of CR0 enables real-address mode when clear.", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.MP", + description: "If 0, then bit 1 (Monitor Coprocessor) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + // We expect this to be 0 for all modern processors, but Inherit is fine. + ValueDefinition { + short: "CR0.EM", + description: "If 0, then bit 2 (Emulation) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.TS", + description: "If 0, then bit 3 (Task Switched) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.ET", + description: "If 0, then bit 4 (Extension Type) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NE", + description: "If 0, then bit 5 (Numeric Error) of CR0 is allowed to be 0.
Enables the PC-style x87 FPU error reporting mechanism when clear in CR0.", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_6_15", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (6, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.WP", + description: "If 0, then bit 16 (Write protect) of CR0 is allowed to be 0. If this bit is clear in CR0 then supervisor-level procedures are + allowed to write into read-only pages", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_17_17", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.AM", + description: "If 0, then bit 18 (Alignment Mask) of CR0 is allowed to be 0. If this bit is clear in CR0 then alignment checking is disabled.", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_19_28", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (19, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NW", + description: "If 0, then bit 29 (Not Write-through) of CR0 is allowed to be 0. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.CD", + description: "If 0, then bit 30 (Cache disable) of CR0 is allowed to be 0. If CR0 bits 30 and 29 are 0 then caching of memory locations + for the whole of physical memory in the processor's internal (and external) cache is enabled.", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + // TODO: Disabling paging sounds bad, should we force this to 1? + ValueDefinition { + short: "CR0.PG", + description: "If 0, then bit 31 (Paging) of CR0 is allowed to be 0.
If bit 31 of CR0 is cleared then paging is disabled (all linear addresses get treated as physical addresses).", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_32_63", + description: "Reports bits allowed to be 0 in CR0", + bits_range: (32, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + // NOTE: CR0_FIXED1 cannot be set by KVM, but this is OK, because its value is determined by CPUID anyway + ( + RegisterAddress::IA32_VMX_CR0_FIXED1, + ValueDefinitions::new(&[ + + ValueDefinition { + short: "CR0.PE", + description: "If 1, then bit 0 (Protection Enable) of CR0 is allowed to be 1. bit 0 of CR0 enables protected mode when set", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "CR0.MP", + description: "If 1, then bit 1 (Monitor Coprocessor) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + // We expect this to be 0 for all modern processors, but Inherit is fine. + ValueDefinition { + short: "CR0.EM", + description: "If 1, then bit 2 (Emulation) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.TS", + description: "If 1, then bit 3 (Task Switched) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.ET", + description: "If 1, then bit 4 (Extension Type) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NE", + description: "If 1, then bit 5 (Numeric Error) of CR0 is allowed to be 1. 
This bit enables the native (internal) mechanism for reporting x87 FPU errors when set in CR0.", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_6_15", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (6, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.WP", + description: "If 1, then bit 16 (Write protect) of CR0 is allowed to be 1. If this bit is set in CR0 then supervisor-level procedures are + inhibited from writing into read-only pages", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_17_17", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.AM", + description: "If 1, then bit 18 (Alignment Mask) of CR0 is allowed to be 1. If bit 18 of CR0 is set then automatic alignment checking is possible.", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_19_28", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (19, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.NW", + description: "If 1, then bit 29 (Not Write-through) of CR0 is allowed to be 1. See Intel SDM Vol. 3A Section 2.5 for more information", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.CD", + description: "If 1, then bit 30 (Cache disable) of CR0 is allowed to be 1. 
If CR0 bit 30 is 1 then caching is restricted", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR0.PG", + description: "If 1, then bit 31 (Paging) of CR0 is allowed to be 1 which enables paging", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR0_FIXED1_RESERVED_32_63", + description: "Reports bits allowed to be 1 in CR0", + bits_range: (32, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_CR4_FIXED0, + ValueDefinitions::new(&[ + ValueDefinition { + short: "CR4.VME", + description: "If 0, then bit 0 (Virtual-8086 Mode Extension) of CR4 is allowed to be 0. Bit 0 of CR4 disables the interrupt and exception-handling extensions in virtual-8086 mode when clear.", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PVI", + description: "If 0, then bit 1 (Protected-Mode Virtual Interrupts) of CR4 is allowed to be 0. Bit 1 of CR4 disables the virtual interrupt flag in protected mode when clear.", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.TSD", + description: "If 0, then bit 2 (Time Stamp Disable) of CR4 is allowed to be 0. Bit 2 of CR4 allows RDTSC instruction to be executed at any privilege level when clear.", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.DE", + description: "If 0, then bit 3 (Debugging extensions) of CR4 is allowed to be 0. When Bit 3 of CR4 is clear the processor aliases references to registers DR4 and DR5 for compatibility with legacy software", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PSE", + description: "If 0, then bit 4 (Page Size Extensions) of CR4 is allowed to be 0. 
Bit 4 of CR4 restricts 32-bit paging to pages of 4 KBytes when clear.", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PAE", + description: "If 0, then bit 5 (Physical Address Extension) of CR4 is allowed to be 0. Bit 5 of CR4 restricts physical addresses to 32 bits when clear", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + // TODO: Perhaps we should force this to 0? + ValueDefinition { + short: "CR4.MCE", + description: "If 0, then bit 6 (Machine-Check Enable) of CR4 is allowed to be 0. Bit 6 of CR4 disables the machine-check exception when clear", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PGE", + description: "If 0, then bit 7 (Page Global Enable) of CR4 is allowed to be 0. Bit 7 of CR4 disables the global page feature when clear", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PCE", + description: "If 0, then bit 8 (Performance-Monitoring Counter Enable) of CR4 is allowed to be 0. The RDPMC instruction can only be executed at protection level 0 when bit 8 of CR4 is clear", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSFXSR", + description: "If 0, then bit 9 (OS Support for FXSAVE and FXRSTOR) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXMMEXCPT", + description: "If 0, then bit 10 (OS Support for Unmaksed SIMD Floating-Point Exceptions) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.UMIP", + description: "If 0, then bit 11 (User-Mode instruction Prevention) of CR4 is allowed to be 0. 
See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + // Maybe this could even be passthrough? CHV is 64-bit only. + ValueDefinition { + short: "CR4.LA57", + description: "If 0, then bit 12 (57-bit linear addresses) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.VMXE", + description: "If 0, then bit 13 (VMX-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMXE", + description: "If 0, then bit 14 (SMX-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_15", + description: "If 0, then bit 15 (RESERVED) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.FSGSBASE", + description: "If 0, then bit 16 (FSGSBASE-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + // Probably irrelevant? + ValueDefinition { + short: "CR4.PCIDE", + description: "If 0, then bit 17 (PCID-Enable) of CR4 is allowed to be 0. See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXSAVE", + description: "If 0, then bit 18 (XSAVE and Processor Extended States-Enable) of CR4 is allowed to be 0. 
See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + // CPU Profiles do not support Key locker features for now + ValueDefinition { + short: "CR4.KL", + description: "If 0, then bit 19 (Key-Locker-Enable) of CR4 is allowed to be 0. When bit 19 of CR4 is set, the LOADIWKEY instruction is enabled and CPUID.0x19.EBX[0] is set if support for AES key locker instructions has been activated by system firmware", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.SMEP", + description: "If 0, then bit 20 (SMEP-Enable) of CR4 is allowed to be 0. See Intel SDM Vol 3.A Section 2.5 for more information", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMAP", + description: "If 0, then bit 21 (SMAP-Enable) of CR4 is allowed to be 0. See Intel SDM Vol 3.A Section 2.5 for more information", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PKE", + description: "If 0, then bit 22 (Enable protection keys for user-mode pages) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "CR4.CET", + description: "If 0, then bit 23 (Control-flow Enforcement Technology) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.PKS", + description: "If 0, then bit 24 (Enable protection keys for supervisor-mode pages) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.UINTR", + description: "If 0, then bit 25 (User Interrupts Enable) of CR4 is allowed to be 0. See Intel SDM Vol. 
3.A Section 2.5 for more information.", + bits_range: (25, 25), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_26", + description: "If 0, then bit 26 (RESERVED) of CR4 is allowed to be 0. See Intel SDM Vol.3.A Section 2.5 for more information.", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LASS", + description: "If 0, then bit 27 (User Interrupts Enable) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LAM_SUP", + description: "If 0, then bit 28 (Supervisor LAM-enable) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 25 for more information.", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "IA32_VMX_CR4_FIXED0", + description: "Reports bits allowed to be 0 in CR4", + bits_range: (29, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + // NOTE: CR4_FIXED1 cannot be set by KVM, but this is OK, because its value is determined by CPUID anyway + ( + RegisterAddress::IA32_VMX_CR4_FIXED1, + ValueDefinitions::new(&[ + ValueDefinition { + short: "CR4.VME", + description: "If 1, then bit 1 (Virtual-8086 Mode Extension) of CR4 is allowed to be 1. Bit 0 of CR4 enables the interrupt and exception-handling extensions in virtual-8086 mode when set.", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PVI", + description: "If 1, then bit 1 (Protected-Mode Virtual Interrupts) of CR4 is allowed to be 1. Bit 1 of CR4 enables hardware support for a virtual interrupt flag in protected mode when set.", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.TSD", + description: "If 1, then bit 2 (Time Stamp Disable) of CR4 is allowed to be 1. 
Bit 2 of CR4 restricts the execution of the RDTS instruction to procedures running at privilege level 0 when set.", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.DE", + description: "If 1, then bit 3 (Debugging extensions) of CR4 is allowed to be 1. Bit 3 of CR4 make references to debug registers DR4 and DR5 cause an undefined opcode exception when set", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PSE", + description: "If 1, then bit 4 (Page Size Extensions) of CR4 is allowed to be 1. Bit 4 of CR4 enables 4-MByte pages with 32-bit paging when set", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PAE", + description: "If 1, then bit 5 (Physical Address Extension) of CR4 is allowed to be 1. Bit 5 of CR4 enables paging to produce physical addresses of more than 32 bits when set", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + // TODO: Perhaps we should force this to 0? + ValueDefinition { + short: "CR4.MCE", + description: "If 1, then bit 6 (Machine-Check Enable) of CR4 is allowed to be 1. Bit 6 of CR4 enables the machine-check exception when set", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PGE", + description: "If 1, then bit 7 (Page Global Enable) of CR4 is allowed to be 1. Bit 7 of CR4 enables the global page feature when set", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PCE", + description: "If 1, then bit 8 (Performance-Monitoring Counter Enable) of CR4 is allowed to be 1. The RDPMC instruction can be executed at any protection level when bit 8 of CR4 is set.", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSFXSR", + description: "If 1, then bit 9 (OS Support for FXSAVE and FXRSTOR) of CR4 is allowed to be 1. 
See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXMMEXCPT", + description: "If 1, then bit 10 (OS Support for Unmaksed SIMD Floating-Point Exceptions) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + // TODO: Is this always 0 for QEMU? + ValueDefinition { + short: "CR4.UMIP", + description: "If 1, then bit 11 (User-Mode instruction Prevention) of CR4 is allowed to be 1. If bit 11 of CR4 is set and CPL > 0 then the SGDT,SIDT,SLDT,SMSW and STR instructions cannot be executed.", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + // Maybe this could even be passthrough? CHV is 64-bit only. + ValueDefinition { + short: "CR4.LA57", + description: "If 1, then bit 12 (57-bit linear addresses) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.VMXE", + description: "If 1, then bit 13 (VMX-Enable) of CR4 is allowed to be 1. Bit 13 of CR4 enables VMX operation when set.", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMXE", + description: "If 1, then bit 14 (SMX-Enable) of CR4 is allowed to be 1. Bit 14 of CR4 enables SMX operation when set.", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_15", + description: "If 1, then bit 15 (RESERVED) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.FSGSBASE", + description: "If 1, then bit 16 (FSGSBASE-Enable) of CR4 is allowed to be 1. 
See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + // Probably irrelevant? + ValueDefinition { + short: "CR4.PCIDE", + description: "If 1, then bit 17 (PCID-Enable) of CR4 is allowed to be 1. Enables process-context identifiers (PCIDs) when bit 17 of CR4 is set. Applies only in IA-32e mode", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.OSXSAVE", + description: "If 1, then bit 18 (XSAVE and Processor Extended States-Enable) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + // CPU Profiles do not support Key locker features for now + ValueDefinition { + short: "CR4.KL", + description: "If 1, then bit 19 (Key-Locker-Enable) of CR4 is allowed to be 1. When bit 19 of CR4 is set, the LOADIWKEY instruction is enabled and CPUID.0x19.EBX[0] is set if support for AES key locker instructions has been activated by system firmware", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.SMEP", + description: "If 1, then bit 20 (SMEP-Enable) of CR4 is allowed to be 1. Bit 20 of CR4 enables supervisor-mode execution prevention when set", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.SMAP", + description: "If 1, then bit 21 (SMAP-Enable) of CR4 is allowed to be 1. Bit 21 of CR4 enables supervisor-mode access prevention when set", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.PKE", + description: "If 1, then bit 22 (Enable protection keys for user-mode pages) of CR4 is allowed to be 1. When bit 22 of CR4 is set, CPUID.0x7.ECX[4] is displayed as 1. See Intel SDM Vol. 
3.A Section 2.5 for more information.", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.CET", + description: "If 1, then bit 23 (Control-flow Enforcement Technology) of CR4 is allowed to be 1. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.PKS", + description: "If 1, then bit 24 (Enable protection keys for supervisor-mode pages) of CR4 is allowed to be 1. See Intel SDM Vol. 3.A Section 2.5 for more information.", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.UINTR", + description: "If 1, then bit 25 (User Interrupts Enable) of CR4 is allowed to be 1. Bit 25 of CR4 enables user interrupts when set.", + bits_range: (25, 25), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "CR4.RESERVED_26", + description: "If 1, then bit 26 (RESERVED) of CR4 is allowed to be 1. See Intel SDM Vol.3A Section 2.5 for more information.", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LASS", + description: "If 1, then bit 27 (User Interrupts Enable) of CR4 is allowed to be 1. Bit 27 of CR4 enables LASS (Linear-Address-Space Separation) when set.", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.LAM_SUP", + description: "If 1, then bit 28 (Supervisor LAM-enable) of CR4 is allowed to be 1. 
Bit 28 of CR4 enables LAM (linear-address masking) for supervisor pointers when set.", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "CR4.RESERVED_29_63", + description: "Reports bits allowed to be 1 in CR4", + bits_range: (29, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_VMCS_ENUM, + ValueDefinitions::new(&[ + ValueDefinition{ + short: "MAX_INDEX", + description: "highest index value used for any VCMS encoding", + bits_range: (1, 9), + policy: ProfilePolicy::Inherit + } + ]) + + ), + + ( + RegisterAddress::IA32_VMX_PROCBASED_CTLS2, + ValueDefinitions::new(&[ + // Intel SDM Vol.3D A.3.3 documents that the ALLOWED_ZERO bits are actually always 0 for this MSR. + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUALIZE_APIC_ACCESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_DESCRIPTOR_TABLE_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_RDTSCP", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUALIZE_X2APIC_MODE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_VPID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_WBINVD_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_UNRESTRICTED_GUEST", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_APIC_REGISTER_VIRTUALIZATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUAL_INTERRUPT_DELIVERY", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PAUSE_LOOP_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDRAND_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_INVPCID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_VM_FUNCTIONS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VMCS_SHADOWING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_ENCLS_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDSEED_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_PML", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_EPT_VIOLATION_#VE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_XSAVES/XRSTORS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PASID_TRANSLATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MODE_BASED_EXECUTE_CONTROL_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SUB_PAGE_WRITE_PERMISSIONS_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INTEL_PT_USES_GUEST_PHYSICAL_ADDRESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TSC_SCALING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_USER_WAIT_AND_PAUSE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENABLE_PCONFIG", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_28_29", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (28, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VMM_BUS_LOCK_DETECTION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INSTRUCTION_TIMEOU", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VIRTUALIZE_APIC_ACCESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (33, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_DESCRIPTOR_TABLE_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_RDTSCP", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VIRTUALIZE_X2APIC_MODE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (36, 36), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_VPID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (37, 37), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_WBINVD_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (38, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_UNRESTRICTED_GUEST", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_APIC_REGISTER_VIRTUALIZATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VIRTUAL_INTERRUPT_DELIVERY", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PAUSE_LOOP_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDRAND_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_INVPCID", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_VM_FUNCTIONS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (45, 45), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VMCS_SHADOWING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (46, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_ENCLS_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDSEED_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_PML", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_EPT_VIOLATION_#VE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_XSAVES/XRSTORS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PASID_TRANSLATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MODE_BASED_EXECUTE_CONTROL_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SUB_PAGE_WRITE_PERMISSIONS_FOR_EPT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INTEL_PT_USES_GUEST_PHYSICAL_ADDRESSES", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TSC_SCALING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_USER_WAIT_AND_PAUSE", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENABLE_PCONFIG", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_28_29", + description: "Control X is allowed to be 1 if bit X of this MSR is 1", + bits_range: (60, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_VMM_BUS_LOCK_DETECTION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. (Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INSTRUCTION_TIMEOUT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-7. 
(Definitions of Secondary Processor-Based VM-Execution Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + ( + RegisterAddress::IA32_VMX_EPT_VPID_CAP, + ValueDefinitions::new(&[ + ValueDefinition{ + short: "EPT_EXECUTE_ONLY", + description: "The processor supports execute-only translations by EPT", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "PAGE_WALK_LENGTH_4", + description: "Support for Page-walk length of 4", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "PAGE_WALK_LENGTH_5", + description: "Support for Page-walk length of 5", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_MEM_TYPE_UC", + description: "Software can configure the EPT paging structure to memory type to be uncacheable (UC)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_MEM_TYPE_WB", + description: "Software can configure the EPT paging structure to memory type to be write-back (WB)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_PDE_2M", + description: "Software can configure the EPT PDE to map a 2-Mbyte page", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "EPT_PDPTE_1G", + description: "Software can configure the EPT PDPTE to map a 1-Gbyte page", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "INVEPT", + description: "INVEPT instruction is supported", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "FLAGS_EPT", + description: "Accessed and dirty flags for EPT are supported", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "VM_EXIT_VIOLATIONS_INFO", + description: "If set, the processor reports advanced VM-exit information for EPT violations", + bits_range:
(22, 22), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "SHADOW_STACK_CTL", + description: "Supervisor shadow-stack control is supported", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "SINGLE_CONTEXT_INVEPT", + description: "The single-context INVEPT type is supported", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "ALL_CONTEXT_INVEPT", + description: "The all-context INVEPT type is supported", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "INVVPID", + description: "INVVPID instruction is supported", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "INDIVIDUAL_ADDRESS_INVVPID", + description: "The individual address INVVPID type is supported", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "SINGLE_CONTEXT_INVVPID", + description: "The single-context INVVPID type is supported", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "ALL_CONTEXT_INVVPID", + description: "The all-context INVVPID type is supported", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "SINGLE_CONTEXT_RETAINING_GLOBALS_INVVPID", + description: "The single-context-retaining-globals INVVPID type is supported", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short: "MAX_HLAT_PREFIX", + description: "Enumerates the maximum HLAT prefix size", + bits_range: (48, 53), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + + RegisterAddress::IA32_VMX_TRUE_PINBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short:"ALLOWED_ZERO_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, +
ValueDefinition { + short: "ALLOWED_ZERO_1_2", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (1, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_VMX_PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + + + ValueDefinition { + short: "ALLOWED_ZERO", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (8, 31), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition{ + short:"ALLOWED_ONE_EXTERNAL_INTERRUPT_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (32, 32), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_1_2", + description: "VM entry allows control X to be 1 if bit X in this MSR is 1", + bits_range: (33, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + 
short:"ALLOWED_ONE_NMI_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4", + description: "VM entry allows control X to be 1 if bit X in this MSR is 1", + bits_range: (36, 36), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_VIRTUAL_NMIS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (37, 37), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_ACTIVATE_VMX__PREEMPTION_TIMER", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (38, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition{ + short:"ALLOWED_ONE_PROCESS_POSTED_INTERRUPTS", + description: "See Intel SDM Vol.3C Section 26.6.1 Table 26-5 (Definitions of Pin-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + + ( + RegisterAddress::IA32_VMX_TRUE_PROCBASED_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TSC_OFFSETTING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_4_6", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (4, 6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HLT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_8", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MWAIT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_RDTSC_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR3_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_18", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_TPR_SHADOW", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MOV_DR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26", + description: "Control X is allowed to be 0 if bit X of this MSR is 0", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_TRAP_FLAG", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_MONITOR_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit 32 + X of this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INTERRUPT_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TSC_OFFSETTING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (35, 35), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4_6", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (36, 38), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HLT_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (39, 39), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (40, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_INVLPG_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MWAIT_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDPMC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_RDTSC_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR3_STORE_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_TERTIARY_CONTROLS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_18", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_CR8_LOAD_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CR8_STORE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_TPR_SHADOW", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_NMI_WINDOW_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MOV_DR_EXITING", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_UNCONDITIONAL_I/O_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_I/O_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26", + description: "Control X is allowed to be 1 if bit X of this MSR is 1", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_TRAP_FLAG", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_USE_MSR_BITMAPS", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_MONITOR_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_PAUSE_EXITING", + description: "See Intel SDM. Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM. 
Vol.3C Section 26.6.2 Table 26-6 (Definitions of Primary Processor-Based VM-Execution Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + + ]) + ), + + ( + RegisterAddress::IA32_VMX_TRUE_EXIT_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_10_11", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (10, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_13_14", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (13, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACKNOWLEDGE_INTERRUPT_O_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + 
short: "ALLOWED_ZERO_16_17", + description: "Control X is allowed to be 0 if bit X in this MSR is 0", + bits_range: (16, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (25, 25), + policy: 
ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (28, 28), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short:"ALLOWED_ONE_SAVE_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (35, 40), + policy: 
ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_HOST_ADDRESS_SPACE_SIZE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_10_11", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (42, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (44, 44), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short: "ALLOWED_ONE_13_14", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (45, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ACKNOWLEDGE_INTERRUPT_O_EXIT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_16_17", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (48, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Inherit + }, + 
ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_VMX_PREEMPTION_TIMER_VALUE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (55, 55), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (56, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (58, 58), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_CLEAR_UINV", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (59, 59), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (60, 60), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary 
VM-Exit Controls)", + bits_range: (61, 61), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_SAVE_IA32_PERF_GLOBAL_CTL", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (62, 62), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_ACTIVATE_SECONDARY_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", + bits_range: (63, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_TRUE_ENTRY_CTLS, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ZERO_0_1", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (0, 1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_3_8", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (3, 8), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_IA_32E_MODE_GUES", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_DEACTIVATE_DUAL__MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_12", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_23_24", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (23, 24), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ZERO_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ZERO_26_31", + description: "VM entry allows control X to be 0 if bit X in this MSR is zero", + bits_range: (26, 31), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_0_1", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (32, 33), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_DEBUG_CONTROLS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (34, 34), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_3_8", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (35, 40), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_IA_32E_MODE_GUES", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (41, 41), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ENTRY_TO_SMM", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (42, 42), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_DEACTIVATE_DUAL__MONITOR_TREATMENT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (43, 43), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_12", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (44, 44), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PERF_GLOBAL_CTRL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (45, 45), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_PAT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (46, 46), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_EFER", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (47, 47), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_BNDCFGS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (48, 48), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_CONCEAL_VMX_FROM_PT", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (49, 49), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_IA32_RTIT_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (50, 50), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_UINV", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (51, 51), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_CET_STATE", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (52, 52), + policy: ProfilePolicy::Static(0) + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_GUEST_IA32_LBR_CTL", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (53, 53), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_LOAD_PKRS", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", + bits_range: (54, 54), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_23_24", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (55, 56), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_ALLOW_SEAM_GUEST_TELEMETRY", + description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", + bits_range: (57, 57), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_26_31", + description:"VM entry allows control X to be 1 if bit X + 32 in this MSR is 1", + bits_range: (58, 63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + ( + RegisterAddress::IA32_VMX_VMFUNC, + ValueDefinitions::new(&[ + ValueDefinition { + short:"ALLOWED_ONE_EPTP_SWITCHING", + description: "See Intel SDM Vol.3C Section 26.6.14 Table 26-10. (Definitions of VM-Function Controls)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short:"ALLOWED_ONE_1_63", + description: "See Intel SDM Vol.3C Section 26.6.14 Table 26-10. (Definitions of VM-Function Controls)", + bits_range: (1, 63), + policy: ProfilePolicy::Inherit + }, + + ]) + ), + + // NOTE: This MSR is currently not supported by KVM. We keep the definition here regardless. (TODO: Maybe it would be better to remove it?) + ( + RegisterAddress::IA32_VMX_PROCBASED_CTLS3, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ONE_LOADIWKEY_EXITING", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (0,0), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_ENABLE_HLAT", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (1,1), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_EPT_PAGING_WRITE_CONTROL", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (2,2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_GUEST_PAGING_VERIFICATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + 
bits_range: (3,3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_IPI_VIRTUALIZATION", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (4,4), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_SEAM_GUEST_PHYSICAL_ADDRESS_WIDTH", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (5,5), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_ENABLE_MSR_LIST_INSTRUCTIONS", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (6,6), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_VIRTUALIZE_IA32_SPEC_CTRL", + description: "See Intel SDM Vol.3C Section 26.6.2 Table 26-8 (Definitions of Tertiary Processor-Based VM-Execution Controls)", + bits_range: (7,7), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_8_63", + description: "Control X is allowed to be 1 if bit X in this MSR is 1", + bits_range: (8,63), + policy: ProfilePolicy::Inherit + }, + ]) + ), + + // NOTE: This MSR is currently not supported by KVM. We keep the definition here regardless. (TODO: Maybe it would be better to remove it?) 
+ ( + RegisterAddress::IA32_VMX_EXIT_CTLS2, + ValueDefinitions::new(&[ + ValueDefinition { + short: "ALLOWED_ONE_0_2", + description:"VM entry allows control X to be 1 if bit X is 1", + bits_range: (0, 2), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_PREMATURELY_BUSY_SHADOW_STACK", + description:"See Intel SDM Vol.3C Section 26.7.1", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit + }, + ValueDefinition { + short: "ALLOWED_ONE_4_63", + description:"VM entry allows control X to be 1 if bit X is 1", + bits_range: (4, 63), + policy: ProfilePolicy::Inherit + } + ]) + ), + ( + RegisterAddress::MSR_PLATFORM_INFO, + ValueDefinitions::new(&[ + ValueDefinition { + short: "PLATFORM_INFORMATION", + description: "Contains power management and other model specific features enumeration. In reality bits 15:8 describe the maximum frequency that does not require turbo. All other bits are reserved", + bits_range: (0, 63), + policy: ProfilePolicy::Deny + } + ]) + ) + ]) +}; + +/// Convenience function to lookup value definitions corresponding to the given MSR register address (as a const parameter). 
+#[cold]
+#[inline(never)]
+pub(in crate::x86_64) const fn msr_definitions<const REG_ADDR: u32>() -> &'static [ValueDefinition]
+{
+    const { // inline const block: the lookup below is evaluated at compile time
+        let mut out = [].as_slice();
+        let intel_definitions = INTEL_MSR_FEATURE_DEFINITIONS.as_slice();
+        let mut i = 0; // index-based `while`: `for` loops are not usable in const evaluation
+        let length = intel_definitions.len();
+        while i < length {
+            let (addr, definitions) = intel_definitions[i];
+            if addr.0 == REG_ADDR {
+                out = definitions.as_slice();
+                break;
+            }
+            i += 1;
+        }
+        if out.is_empty() { // an empty slice doubles as the "not found" marker
+            panic!("MSR definition not found");
+        }
+        out
+    }
+}
diff --git a/arch/src/x86_64/msr_definitions/mod.rs b/arch/src/x86_64/msr_definitions/mod.rs index 577821a704..3a9860cf58 100644 --- a/arch/src/x86_64/msr_definitions/mod.rs +++ b/arch/src/x86_64/msr_definitions/mod.rs @@ -4,6 +4,7 @@ // use serde::{Deserialize, Serialize}; +pub mod intel; use crate::{deserialize_u32_hex, serialize_u32_hex}; /// The register address of an MSR From 56afe85bdfe5b840c37a1b1b6cce76b7814a83f2 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 11 Mar 2026 16:27:46 +0100 Subject: [PATCH 147/178] arch: I32_ARCH_CAPABILITIES changes according to PR review We apply changes suggested in the PR review to the IA32_ARCH_CAPABILITIES MSR policies. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../intel/msr_based_features.rs | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs index 6fae341539..c2c0748257 100644 --- a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -86,6 +86,8 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { bits_range: (1, 1), policy: ProfilePolicy::Inherit, }, + // Skylake has this bit set, but not Sapphire Rapids + // TODO: Is Inherit the right policy here?
(Will it still be possible to use the Skylake profile on a Sapphire Rapids machine?) ValueDefinition { short: "RSBA", description: "The processor supports RSB Alternate", @@ -102,19 +104,19 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "SSB_NO", description: "Processor is not susceptible to Speculation Store Bypass", bits_range: (4, 4), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "MDS_NO", description: "Processor is not susceptible to Microarchitectural Data Sampling (MDS)", bits_range: (5, 5), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "IF_PSCHANGE_MC_NO", description: "The processor is not susceptible to a machine check error due to modifying the size of a code page without TLB invalidation", bits_range: (6, 6), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "TSX_CTRL", @@ -128,7 +130,8 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "TAA_NO", description: "If 1, processor is not affected by TAA", bits_range: (8, 8), - policy: ProfilePolicy::Passthrough, + // This is TSX related which we disable anyway + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "MCU_CONTROL", @@ -153,25 +156,25 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "DOITM:", description: "If 1, the processor supports Data Operand Independent Timing Mode", bits_range: (12, 12), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "SBDR_SSDP_NO", description: "The processor is not affected by either the Shared Buffers Data Read (SBDR) vulnerability or the Sideband Stale Data Propagator (SSDP)", bits_range: (13, 13), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "FBSDP_NO", description: "The processor is not affected by the 
Fill Buffer Stale Data Propagator (DBSDP)", bits_range: (14, 14), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "PSDP_NO", description: "The processor is not affected by vulnerabilities involving the Primary Stale Data Propagator (PSDP)", bits_range: (15, 15), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "MCU_ENUMERATION", @@ -191,7 +194,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "FB_CLEAR_CTRL", description: "If 1, the processor supports the IA32_MCU_OPT_CTRL MSR and allows software to set bit 3 of that MSR (FB_CLEAR_DIS)", bits_range: (18, 18), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { @@ -205,14 +208,14 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "BHI_NO", description: "A value of 1 indicates BHI_NO branch prediction behavior, regardless of the value of IA32_SPEC_CTRL[BHI_DIS_S] MSR bit", bits_range: (20, 20), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "XAPIC_DISABLE_STATUS", description: "Enumerates that the IA32_XAPIC_DISABLE_STATUS MSR exists, and that bit 0 specifies whether the legacy xAPIC is disabled and APIC state is locked to x2APIC", bits_range: (21, 21), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { @@ -228,14 +231,14 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { description: "If set, the IA32_OVERCLOCKING_STATUS MSR exists", bits_range: (23, 23), // TODO: Check - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "PBRSB_NO", description: "If 1, the processor is not affected by issues related to Post-Barrier Return Stack Buffer Predictions", bits_range: (24, 24), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, 
ValueDefinition { short: "GDS_CTRL", @@ -249,39 +252,37 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "GDS_NO", description: "If 1, the processor is not affected by Gather Data Sampling", bits_range: (26, 26), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "RFDS_NO", description: "If 1, processor is not affected by Register File Data Sampling", bits_range: (27, 27), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "RFDS_CLEAR", description: "If 1, when VERW is executed the processor will clear stale data from register files affected by Register File Data Sampling", bits_range: (28, 28), - policy: ProfilePolicy::Passthrough, + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "IGN_UMONITOR_SUPPORT", description: "If 0, IA32_MCU_OPT_CTRL bit 6 (IGN_UMONITOR) is not supported. If 1, it indicates support of IA32_MCU_OPT_CTRL bit 6 (IGN_UMONITOR)", bits_range: (29, 29), - // TODO: Check - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "MON_UMON_MITG_SUPPORT", description: "If 1, indicates support for IA32_MCU_OPT_CTRL bit 7 (MON_UMON_MITG), otherwise it is not supported", bits_range: (30, 30), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, - ValueDefinition { short: "PBOPT_SUPPORT", description: "If 1, IA32_PBOPT_CTRL bit 0 (Prediction Barrier Option (PBOPT)) is supported, otherwise it is not", From fe5b3df65749f62c138a7570ec9b171861c5a7a1 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 13 Mar 2026 14:51:34 +0100 Subject: [PATCH 148/178] arch: Change policies for CR4_FIXED_I This is to be consistent with recent changes to CPUID policies Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../x86_64/msr_definitions/intel/msr_based_features.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git 
a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs index c2c0748257..170789928c 100644 --- a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -1689,7 +1689,6 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { bits_range: (5, 5), policy: ProfilePolicy::Inherit }, - // TODO: Perhaps we should force this to 0? ValueDefinition { short: "CR4.MCE", description: "If 0, then bit 6 (Machine-Check Enable) of CR4 is allowed to be 0. Bit 6 of CR4 disables the machine-check exception when clear", @@ -1706,7 +1705,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "CR4.PCE", description: "If 0, then bit 8 (Performance-Monitoring Counter Enable) of CR4 is allowed to be 0. The RDPMC instruction can only be executed at protection level 0 when bit 8 of CR4 is clear", bits_range: (8, 8), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short: "CR4.OSFXSR", @@ -1793,7 +1792,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "CR4.PKE", description: "If 0, then bit 22 (Enable protection keys for user-mode pages) of CR4 is allowed to be 0. See Intel SDM Vol. 3.A Section 2.5 for more information.", bits_range: (22, 22), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "CR4.CET", @@ -1880,7 +1879,6 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { bits_range: (5, 5), policy: ProfilePolicy::Inherit }, - // TODO: Perhaps we should force this to 0? ValueDefinition { short: "CR4.MCE", description: "If 1, then bit 6 (Machine-Check Enable) of CR4 is allowed to be 1. 
Bit 6 of CR4 enables the machine-check exception when set", @@ -1897,7 +1895,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "CR4.PCE", description: "If 1, then bit 8 (Performance-Monitoring Counter Enable) of CR4 is allowed to be 1. The RDPMC instruction can be executed at any protection level when bit 8 of CR4 is set.", bits_range: (8, 8), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short: "CR4.OSFXSR", @@ -1985,7 +1983,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short: "CR4.PKE", description: "If 1, then bit 22 (Enable protection keys for user-mode pages) of CR4 is allowed to be 1. When bit 22 of CR4 is set, CPUID.0x7.ECX[4] is displayed as 1. See Intel SDM Vol. 3.A Section 2.5 for more information.", bits_range: (22, 22), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short: "CR4.CET", From b249f20f6aecbd2860aef65e92b3333f742c5b32 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 16 Jan 2026 10:31:52 +0100 Subject: [PATCH 149/178] arch: MSR compatibility checks While KVM already has compatibility checks for most MSR-based features which run when these MSRs are set by userspace, we do not get very much useful information about exactly what the problem is when any of these checks fail. Hence to be on the safe side and also to ensure good UX for users running into errors when trying to apply a CPU profile we introduce our own compatibility checks for Intel CPUs that log at the error and debug levels. The error logs aim to provide the minimal amount of information required to investigate the problem further, while the debug logs provide (much) more convenience when debugging. We will incorporate these checks in the context of CPU profiles in a later commit. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/msr_definitions/intel/mod.rs | 1 + .../intel/msr_based_features.rs | 735 ++++++++++++++++++ 2 files changed, 736 insertions(+) diff --git a/arch/src/x86_64/msr_definitions/intel/mod.rs b/arch/src/x86_64/msr_definitions/intel/mod.rs index 84e2434535..9811363bab 100644 --- a/arch/src/x86_64/msr_definitions/intel/mod.rs +++ b/arch/src/x86_64/msr_definitions/intel/mod.rs @@ -5,3 +5,4 @@ mod msr_based_features; pub use msr_based_features::INTEL_MSR_FEATURE_DEFINITIONS; +pub(in crate::x86_64) use msr_based_features::check_feature_msr_compatibility; diff --git a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs index 170789928c..9b6bb40e90 100644 --- a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -3,6 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::collections::HashMap; + +use log::{debug, error, warn}; + use crate::x86_64::msr_definitions::{ MsrDefinitions, ProfilePolicy, RegisterAddress, ValueDefinition, ValueDefinitions, }; @@ -3683,3 +3687,734 @@ pub(in crate::x86_64) const fn msr_definitions() -> &'stati out } } + +/// Check that the `src_feature_msrs` are compatible with those given in `dest_feature_msrs`. +/// +/// If this check fails, then software that works under the `src_feature_msrs`, may no longer +/// behave correctly with `dest_feature_msrs`. +/// +/// The `src_id` and `dest_id` strings are only used for logging purposes to identify what +/// is being compared (e.g. CPU profile vs host where the profile should be applied, etc). +/// +/// NOTE: This function assumes CPUID compatibility. 
+/// +/// All register addresses/keys in [`INTEL_MSR_FEATURE_DEFINITIONS`] are checked, except for: +/// - IA32_BIOS_SIGN_ID, +/// - IA32_PERF_CAPABILITIES, +/// - MSR_PLATFORM_INFO +/// +/// IA32_PERF_CAPABILITIES are inherently incompatible between different VMs and we do not +/// think it makes much sense to compare IA32_BIOS_SIGN_ID or MSR_PLATFORM_INFO in this context. +/// +/// # Errors +/// +/// This function does not return early upon error, but rather attempts all MSR-based feature +/// checks while logging errors it encounters. If any of these checks fail an error is returned +/// at the end. +/// +/// We also just use the unit type as the error variant for now, as not much can be done to +/// recover from these errors at runtime and the logs should provide the user with enough +/// information to debug the problem. +/// +/// At this moment in time we prefer the aforementioned approach over designing a complex +/// error type capable of tracking everything that might fail. +pub(in crate::x86_64) fn check_feature_msr_compatibility( + src_feature_msrs: &HashMap, + dest_feature_msrs: &HashMap, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + // First check IA32_ARCH_CAPABILITIES + // Since we are assuming CPUID to be compatible we + // may assume that either both src and dest have this + // MSR or none of them do + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_ARCH_CAPABILITIES.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_ARCH_CAPABILITIES.0)) + { + is_err |= + check_arch_capabilities_compatibility(*src_val, *dest_val, src_id, dest_id).is_err(); + } + + // Next let us consider IA32_VMX_BASIC + let mut true_ctls_exist_src = false; + let mut true_ctls_exist_dest = false; + // Since we assume compatibility of CPUID we can again check that either both src and dest + // have the IA32_VMX_BASIC MSR or none of them do + if let Some((src_val, dest_val)) = src_feature_msrs + 
.get(&RegisterAddress::IA32_VMX_BASIC.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_BASIC.0)) + { + true_ctls_exist_src = (*src_val & (1 << 55)) != 0; + true_ctls_exist_dest = (*dest_val & (1 << 55)) != 0; + is_err |= check_vmx_basic_compatibility(*src_val, *dest_val, src_id, dest_id).is_err(); + } + // The following closure saves us some boiler plate when checking the various VMX CTLS that have a default1 class + let check_vmx_ctls_with_default1_class = |vmx_ctrl_reg_address: RegisterAddress, + vmx_true_ctrl_reg_address: RegisterAddress, + check_id: &str, + src_id: &str, + dest_id: &str| + -> Result<(), ()> { + let mut is_err = false; + let src_reg_address = { + conditional_select( + vmx_ctrl_reg_address.0, + vmx_true_ctrl_reg_address.0, + true_ctls_exist_src, + ) + }; + + let dest_reg_address = { + conditional_select( + vmx_ctrl_reg_address.0, + vmx_true_ctrl_reg_address.0, + true_ctls_exist_dest, + ) + }; + + let src_val = src_feature_msrs.get(&src_reg_address); + let dest_val = dest_feature_msrs.get(&dest_reg_address); + if src_val.is_some() && dest_val.is_none() { + error!( + "{check_id} compatibility check failed: unable to compare value of MSR {src_reg_address:#x} of {src_id} with value of MSR {dest_reg_address:#x} of {dest_id}, because the latter value was not found" + ); + is_err = true; + } + if let Some((src_val, dest_val)) = src_val.zip(dest_val) + && let Err(CtlsCheck { + bitset_only_zero_src_lo, + bitset_only_one_src_hi, + }) = check_negative_subset_lo_and_subset_hi(*src_val, *dest_val) + { + is_err = true; + if let Some(bitset) = bitset_only_zero_src_lo { + for_each_bitpos(bitset, |bit_pos| { + debug!( + "{check_id} compatibility check failed: bit {bit_pos} is 0 in MSR:={src_reg_address:#x} of {src_id}, but 1 in MSR:={dest_reg_address:#x} of {dest_id}" + ); + }); + } + + if let Some(bitset) = bitset_only_one_src_hi { + for_each_bitpos(bitset, |bit_pos| { + debug!( + "{check_id} compatibility check failed: bit {bit_pos} is 1 in 
MSR:={src_reg_address:#x} of {src_id}, but 0 in MSR:={dest_reg_address:#x} of {dest_id}" + ); + }); + } + } + + if is_err { + if let Some(src_val) = src_val + && let Some(dest_val) = dest_val + { + error!( + "{check_id} compatibility check failed: {src_id} register address:={src_reg_address:#x}, {src_id} value:={:#x}, {dest_id} register address:={dest_reg_address:#x}, {dest_id} value:={:#x}", + *src_val, *dest_val + ); + } + Err(()) + } else { + Ok(()) + } + }; + + // Now we consider IA32_VMX_PINBASED_CTLS and/or IA32_VMX_TRUE_BINBASED_CTLS + // (Intel SDM Vol.3D A.3.1) + is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_PINBASED_CTLS, + RegisterAddress::IA32_VMX_TRUE_PINBASED_CTLS, + "IA32_VMX_PINBASED_CTLS", + src_id, + dest_id, + ) + .is_err(); + + // Next up is IA32_VMX_PROCBASED_CTLS and/or IA32_VMX_TRUE_PROCBASED_CTLS + // (Intel SDM Vol.3D A.3.2.) + is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_PROCBASED_CTLS, + RegisterAddress::IA32_VMX_TRUE_PROCBASED_CTLS, + "IA32_PROCBASED_CTLS", + src_id, + dest_id, + ) + .is_err(); + // Check IA32_VMX_EXIT_CTLS and/or IA32_VMX_TRUE_EXIT_CTLS + // (Intel SDM Vol.3D A.4) + is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_EXIT_CTLS, + RegisterAddress::IA32_VMX_TRUE_EXIT_CTLS, + "IA32_VMX_EXIT_CTLS", + src_id, + dest_id, + ) + .is_err(); + // Check IA32_VMX_ENTRY_CTLS and/or IA32_VMX_TRUE_ENTRY_CTLS + // (Intel SDM Vol.3D A.5) + is_err |= check_vmx_ctls_with_default1_class( + RegisterAddress::IA32_VMX_ENTRY_CTLS, + RegisterAddress::IA32_VMX_TRUE_ENTRY_CTLS, + "IA32_VMX_ENTRY_CTLS", + src_id, + dest_id, + ) + .is_err(); + // Check IA32_VMX_MISC + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_MISC.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_MISC.0)) + { + is_err |= check_vmx_misc_msr(*src_val, *dest_val, src_id, dest_id).is_err(); + } + // Check IA32_VMX_CR0_FIXED0 + if let Some((src_fixed0, 
dest_fixed0)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_CR0_FIXED0.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_CR0_FIXED0.0)) + { + is_err |= + check_cr_i_compatibility::<0>(*src_fixed0, *dest_fixed0, src_id, dest_id).is_err(); + } + + // Check IA32_VMX_CR4_FIXED0 + if let Some((src_fixed0, dest_fixed0)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_CR4_FIXED0.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_CR4_FIXED0.0)) + { + is_err |= + check_cr_i_compatibility::<4>(*src_fixed0, *dest_fixed0, src_id, dest_id).is_err(); + } + + // Check IA32_VMX_VMCS_ENUM + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_VMCS_ENUM.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_VMCS_ENUM.0)) + { + is_err |= check_vmx_vmcs_enum_compatibility(*src_val, *dest_val, src_id, dest_id).is_err(); + } + + // Check IA32_VMX_PROCBASED_CTLS2 + // This MSR exists only if bit 63 of IA32_VMX_PROCBASED_CTLS is set + // (note that if it is set on src then our IA32_VMX_PROCBASED_CTLS check + // ensures that it is also set on dest) + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS2.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS2.0)) + { + let src_val = *src_val; + let dest_val = *dest_val; + // First verify that the first 32 bits are indeed 0 as documented by Intel, otherwise we have misunderstood the documentation + // and we should not continue. 
+ let lo_mask = u64::from(u32::MAX); + assert_eq!( + src_val & lo_mask, + 0, + "BUG: The 32-first bits of the IA32_VMX_PROCBASED_CTLS2 MSR were not zero for src" + ); + assert_eq!( + dest_val & lo_mask, + 0, + "BUG: The 32-first bits of the IA32_VMX_PROCBASED_CTLS2 MSR were not zero for dest" + ); + // Note that the 32-first bits are documented to always be 0 + if let Err(bits_only_in_src) = check_subset(src_val, dest_val) { + is_err = true; + error!( + "IA32_VMX_PROCBASED_CTLS2 compatibility check failed: {src_id} value:={src_val:#x}, {dest_id} value:={dest_val:#x}" + ); + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_PROCBASED_CTLS2 check failed: VM entry allows control X:={bit_pos} to be 1 for {src_id}, but not for {dest_id}" + ); + }); + } + } + + // Check IA32_VMX_PROCBASED_CTLS3 + // This MSR exists only if bit 49 of IA32_VMX_PROCBASED_CTLS is set + // (note that if it is set on src then our IA32_VMX_PROCBASED_CTLS check + // ensures that it is also set on dest) + + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS3.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_PROCBASED_CTLS3.0)) + && let Err(bits_only_in_src) = check_subset(*src_val, *dest_val) + { + is_err = true; + error!( + "IA32_VMX_PROCBASED_CTLS3 compatibility check failed: {src_id} value:= {:#x}, {dest_id} value:={:#x}", + *src_val, *dest_val + ); + + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_PROCBASED_CTLS3 compatibility check failed: VM entry allows control X:={bit_pos} for {src_id}, but not for {dest_id}" + ); + }); + } + + // Check IA32_VMX_EXIT_CTLS2 + // This MSR exists only if bit 63 of the IA32_VMX_EXIT_CTLS is set + // (note that if it is set on src then our IA32_VMX_EXIT_CTLS check + // ensures that it is also set on dest) + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_EXIT_CTLS2.0) + 
.zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_EXIT_CTLS2.0)) + && let Err(bits_only_in_src) = check_subset(*src_val, *dest_val) + { + is_err = true; + error!( + "IA32_VMX_EXIT_CTLS2 compatibility check failed: {src_id} value:={:#x}, {dest_id} value:={:#x}", + *src_val, *dest_val + ); + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_EXIT_CTLS2 compatibility check failed: bit {bit_pos} is set for {src_id}, but not for {dest_id}" + ); + }); + } + + // Check IA32_VMX_EPT_VPID_CAP (Intel SDM Vol.3D A.10) + // + // This MSR is only available on processors where bit 63 of IA32_VMX_PROCBASED_CTLS is 1 and that either + // have bit 33 of IA32_VMX_PROCBASED_CTLS2 set, or bit 37 of IA32_VMX_PROC_BASED_CTLS2 set. Since we + // already check for compatibility of those bits, we may assume that if this MSR is available for src, then + // it is also available for dest. + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_EPT_VPID_CAP.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_EPT_VPID_CAP.0)) + { + is_err |= check_vpid_and_ept_capabilities(*src_val, *dest_val, src_id, dest_id).is_err(); + } + + if let Some((src_val, dest_val)) = src_feature_msrs + .get(&RegisterAddress::IA32_VMX_VMFUNC.0) + .zip(dest_feature_msrs.get(&RegisterAddress::IA32_VMX_VMFUNC.0)) + && let Err(bits_only_in_src) = check_subset(*src_val, *dest_val) + { + is_err = true; + error!( + "IA32_VMX_VMFUNC compatibility check failed: {src_id} value:={:#x}, {dest_id} value:={:#x}", + *src_val, *dest_val + ); + for_each_bitpos(bits_only_in_src, |bit_pos| { + debug!( + "IA32_VMX_VMFUNC compatibility check failed: VM entry allows bit X:={bit_pos} of the VM-function controls to be 1 for {src_id}, but not for {dest_id}" + ); + }); + } + + if is_err { Err(()) } else { Ok(()) } +} + +/// `a` if `condition` else `b` +fn conditional_select(a: u32, b: u32, condition: bool) -> u32 { + let a_mask = u32::from(condition).wrapping_neg(); + let b_mask = 
!a_mask; + (a & a_mask) | (b & b_mask) +} + +/// Check that the values of MSR IA32_ARCH_CAPABILITIES are compatible. +/// +/// If this check fails then programs that work when the value is `src_val`, may possibly +/// no longer work if the value is `dest_val`. +/// +/// See: Ch.2 Table 2-2. IA-32 Architectural MSRs in Intel SDM Vol.4 +fn check_arch_capabilities_compatibility( + src_val: u64, + dest_val: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + // Make a mask out of + const TSX_CONTROL: u64 = 1 << 7; + const MCU_CONTROL: u64 = 1 << 9; + const MISC_PACKAGE_CTLS: u64 = 1 << 10; + const ENERGY_FILTERING_CTL: u64 = 1 << 11; + const DOITM: u64 = 1 << 12; + const MCU_ENUMERATION: u64 = 1 << 16; + const FB_CLEAR: u64 = 1 << 17; + const FB_CLEAR_CTRL: u64 = 1 << 18; + const XAPIC_DISABLE_STATUS: u64 = 1 << 21; + const MCU_EXTENDED_SERVICE: u64 = 1 << 22; + const OVERCLOCKING_STATUS: u64 = 1 << 23; + const GDS_CTRL: u64 = 1 << 25; + // TODO: Should we perhaps ignore checking this (is it too strict)? 
+ const RFDS_CLEAR: u64 = 1 << 28; + const IGN_UMONITOR_SUPPORT: u64 = 1 << 29; + const MON_UMON_MITG_SUPPORT: u64 = 1 << 30; + const PBOPT_SUPPORT: u64 = 1 << 32; + + let mask: u64 = { + TSX_CONTROL + | MCU_CONTROL + | MISC_PACKAGE_CTLS + | ENERGY_FILTERING_CTL + | DOITM + | MCU_ENUMERATION + | FB_CLEAR + | FB_CLEAR_CTRL + | XAPIC_DISABLE_STATUS + | MCU_EXTENDED_SERVICE + | OVERCLOCKING_STATUS + | GDS_CTRL + | IGN_UMONITOR_SUPPORT + | MON_UMON_MITG_SUPPORT + | PBOPT_SUPPORT + | RFDS_CLEAR + }; + if let Err(only_in_src) = check_subset(src_val & mask, dest_val & mask) { + error!( + "IA32_ARCH_CAPABILITIES compatibility check failed: {src_id} value:={src_val:#x}, {dest_id} value:={dest_val:#x}" + ); + let definitions = msr_definitions::<{ RegisterAddress::IA32_ARCH_CAPABILITIES.0 }>(); + log_features_only_in_src(only_in_src, src_id, definitions, "IA32_ARCH_CAPABILITIES"); + Err(()) + } else { + Ok(()) + } +} + +/// Check that the values of MSR IA32_VMX_BASIC are compatible. +/// +/// See Intel SDM Vol.3D A.1 for more information about the IA32_VMX_BASIC MSR +fn check_vmx_basic_compatibility( + src_val: u64, + dest_val: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + // All bits between 0 and 53 are expected to be equal (except bit 49) + let req_eq_mask: u64 = ((1 << 54) - 1) & (!(1 << 49)); + let src_req_eq = src_val & req_eq_mask; + let dest_req_eq = dest_val & req_eq_mask; + if src_req_eq != dest_req_eq { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_BASIC.0 }>(); + log_inequalities( + src_req_eq, + dest_req_eq, + definitions, + src_id, + dest_id, + "IA32_VMX_BASIC compatibility", + ); + } + // bits 49, 54, 55, and 56 indicate some form of capability and we need to check + // that these bits in the `src_value` are a subset of those in `dest_value` + let req_subset_eq_mask: u64 = (1 << 54) | (1 << 55) | (1 << 56) | (1 << 49); + let src_val_seq = req_subset_eq_mask & src_val; + let 
dest_val_seq = req_subset_eq_mask & dest_val; + is_err |= check_subset(src_val_seq, dest_val_seq).is_err(); + + if is_err { + error!( + "IA32_VMX_BASIC compatibility check failed: {src_id} value:={src_val:#x}, {dest_id} value:={dest_val:#x}" + ); + Err(()) + } else { + Ok(()) + } +} + +/// Check that no values are only in a +/// +/// Upon error a bitset is returned with the +/// bits that are only available in `src_val` +fn check_subset(src_val: u64, dest_val: u64) -> Result<(), u64> { + let only_in_src_val = src_val & (src_val ^ dest_val); + if only_in_src_val != 0 { + Err(only_in_src_val) + } else { + Ok(()) + } +} + +/// Checks the following: +/// 1. For any X < 32; If bit X of src_val is 0 then bit X of dest_val is also 0 +/// 2. For any X >= 32; If bit X of src_val is 1 then bit X of dest_val is also 1 +struct CtlsCheck { + bitset_only_zero_src_lo: Option, + bitset_only_one_src_hi: Option, +} + +fn check_negative_subset_lo_and_subset_hi(src_val: u64, dest_val: u64) -> Result<(), CtlsCheck> { + let lo_mask = (1_u64 << 32) - 1; + let hi_mask = !lo_mask; + + let lo_check = check_subset((!src_val) & lo_mask, (!dest_val) & lo_mask); + + let hi_check = check_subset(src_val & hi_mask, dest_val & hi_mask); + + if lo_check.is_ok() && hi_check.is_ok() { + Ok(()) + } else { + Err(CtlsCheck { + bitset_only_zero_src_lo: lo_check.err(), + bitset_only_one_src_hi: hi_check.err(), + }) + } +} + +/// Check that the values of MSR IA32_VMX_MISC are compatible. 
+/// +/// See Intel SDM Vol.3D A.6 for more information about the IA32_VMX_MISC MSR +fn check_vmx_misc_msr( + src_value: u64, + dest_value: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + let subset_eq_check_mask: u64 = { + (1 << 5) + | (1 << 6) + | (1 << 7) + | (1 << 8) + | (1 << 14) + | (1 << 15) + | (1 << 28) + | (1 << 29) + | (1 << 30) + }; + if let Err(only_in_src) = check_subset( + subset_eq_check_mask & src_value, + subset_eq_check_mask & dest_value, + ) { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_MISC.0 }>(); + log_features_only_in_src(only_in_src, src_id, definitions, "IA32_VMX_MISC"); + } + + let eq_mask: u64 = { + // TODO: Do we also need to check that the MSEG revisions match? + (16..=24).fold(0_u64, |acc, next| acc | (1 << next)) + }; + + let src_req_eq_val = src_value & eq_mask; + let dest_req_eq_val = dest_value & eq_mask; + if src_req_eq_val != dest_req_eq_val { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_MISC.0 }>(); + log_inequalities( + src_req_eq_val, + dest_req_eq_val, + definitions, + src_id, + dest_id, + "IA32_VMX_MISC", + ); + } + + let leq_mask: u64 = { (25..=27).fold(0_u64, |acc, next| acc | (1 << next)) }; + + let src_req_leq = src_value & leq_mask; + let dest_req_leq = dest_value & leq_mask; + if src_req_leq > dest_req_leq { + is_err = true; + debug!( + "IA32_VMX_MISC compatibility check failed when checking definition: {:?}, {src_id} has value:={src_req_leq}, {dest_id} has value:={dest_req_leq}", + max_msr_store_lists_def(), + ); + } + + if is_err { + error!( + "IA32_VMX_MISC compatibility check failed: {src_id} value:={src_value:#x}, {dest_id} value:={dest_value:#x}" + ); + Err(()) + } else { + Ok(()) + } +} + +/// Check compatibility of MSRs IA32_VMX_CR{I}_FIXED0 for I = 0, 4. +/// +/// See Intel SDM Vol.3D A.7 & A.8 for more information about these MSRs. 
+/// +/// NOTE: We don't need to check compatibility for CR{I}_FIXED1 because +/// that is ensured by CPUID. +fn check_cr_i_compatibility( + src_fixed0: u64, + dest_fixed0: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let cri = const { + match I { + 0 => "CR0", + 4 => "CR4", + _ => { + panic!("only 0 and 4 may be used") + } + } + }; + + // Need to ensure that there are no bits that are only 0 in src_fixed0 and also no bits + // that are only 1 in src_fixed1. + + if let Err(only_zero_in_src) = check_subset(!src_fixed0, !dest_fixed0) { + error!( + "IA32_VMX_{cri}_FIXED0 compatibility check failed: {src_id} value:={src_fixed0:#x}, {dest_id} value:={dest_fixed0:#x}" + ); + for_each_bitpos(only_zero_in_src, |bit_pos| { + debug!( + "IA32_VMX_{cri}_FIXED0 compatibility check failed: bit {bit_pos} is allowed to be 0 in {cri} for {src_id}, but not for {dest_id}" + ); + }); + Err(()) + } else { + Ok(()) + } +} + +/// Check compatibility of MSRs IA32_VMX_VMCS_ENUM. +/// +/// See Intel SDM Vol.3D A.9 for more information about IA32_VMX_VMCS_ENUM. +fn check_vmx_vmcs_enum_compatibility( + src_value: u64, + dest_value: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mask = (1..=9).fold(0_u64, |acc, next| acc | (1 << next)); + let src_req_leq = src_value & mask; + let dest_req_leq = dest_value & mask; + if src_req_leq > dest_req_leq { + error!( + "VMX_VMCS_ENUM compatibility check failed: MAX_INDEX for {src_id}:={src_req_leq} is greater than MAX_INDEX:={dest_req_leq} for {dest_id}" + ); + Err(()) + } else { + Ok(()) + } +} + +/// Check compatibility of MSRs IA32_VMX_EPT_VPID_CAP. +/// +/// See (Intel TODO:) Vol. 3D A.10 for more information about IA32_VMX_EPT_VPID_CAP. 
+// Only if IA32_VMX_PROCBASED_CTLS[63] & (IA32_VMX_PROCBASED_CTLS2[33] | IA32_VMX_PROCBASED_CTLS2[37]) +fn check_vpid_and_ept_capabilities( + src_value: u64, + dest_value: u64, + src_id: &str, + dest_id: &str, +) -> Result<(), ()> { + let mut is_err = false; + let subset_eq_mask = { (1 << 44) - 1 }; + + if let Err(bits_only_in_src) = + check_subset(src_value & subset_eq_mask, dest_value & subset_eq_mask) + { + is_err = true; + let definitions = msr_definitions::<{ RegisterAddress::IA32_VMX_EPT_VPID_CAP.0 }>(); + log_features_only_in_src( + bits_only_in_src, + src_id, + definitions, + "IA32_VMX_EPT_VPID_CAP", + ); + } + + let leq_mask = { (48..=53).fold(0_u64, |acc, next| acc | (1 << next)) }; + let src_req_leq = src_value & leq_mask; + let dest_req_leq = dest_value & leq_mask; + if src_req_leq > dest_req_leq { + is_err = true; + debug!( + "IA32_VMX_EPT_VPID_CAP compatibility check failed: maximum HLAT prefix size is {src_req_leq} for {src_id}, but {dest_req_leq} for {dest_id}" + ); + } + if is_err { + error!( + "IA32_VMX_EPT_VPID_CAP compatibility check failed: {src_id} value:={src_value:#x}, {dest_id} value:={dest_value:#x}" + ); + Err(()) + } else { + Ok(()) + } +} + +fn for_each_bitpos(bits: u64, mut cb: impl FnMut(u8)) { + let mut bits = bits; + while bits != 0 { + let pos = bits.trailing_zeros() as u8; + cb(pos); + let lsb = bits & bits.wrapping_neg(); + bits ^= lsb; + } +} + +#[inline(never)] +#[cold] +fn log_features_only_in_src( + only_in_src: u64, + src_id: &str, + definitions: &[ValueDefinition], + check_id: &str, +) { + for_each_bitpos(only_in_src, |bit_pos| { + let Some(def) = definitions + .iter() + .find(|def| (def.bits_range.0..=def.bits_range.1).contains(&bit_pos)) + else { + debug!( + "{check_id} compatibility check failed: bit:={bit_pos} is only set for {src_id}" + ); + warn!( + "unable to produce proper debug log: No MSR value definition found for bit:={bit_pos} check:={check_id} compatibility" + ); + return; + }; + debug!( + "{check_id} 
compatibility check failed: feature bit {bit_pos} only set for {src_id}: feature definition:={def:?}" + ); + }); +} + +#[inline(never)] +#[cold] +fn log_inequalities( + src_val: u64, + dest_val: u64, + definitions: &[ValueDefinition], + src_id: &str, + dest_id: &str, + check_id: &str, +) { + for def in definitions { + let mask = + (def.bits_range.0..=def.bits_range.1).fold(0_u64, |acc, next| acc | (1_u64 << next)); + let val_src = mask & src_val; + let val_dest = mask & dest_val; + if src_val != dest_val { + debug!( + "Check: {check_id} compatibility failed: on definition:={def:?}, values are required to be equal, but we have {src_id} value:={val_src:#x}, {dest_id} value:={val_dest:#x}" + ); + } + } +} + +#[inline(never)] +#[cold] +const fn max_msr_store_lists_def() -> &'static ValueDefinition { + const { + let defs = msr_definitions::<{ RegisterAddress::IA32_VMX_MISC.0 }>(); + // Currently stored at index = 8, if this changes we make sure that we fail at compile time. + // We do not perform a search as the order is unlikely to change frequently and we want to keep + // compile times down. + let def = &defs[8]; + assert!( + def.bits_range.0 == 25, + "MAX_MSR_STORE_LISTS definition is no longer at index 8 in the ValueDefinitions corresponding to IA32_VMX_MISC, please update the index" + ); + assert!( + def.bits_range.1 == 27, + "MAX_MSR_STORE_LISTS definition is no longer at index 8 in the ValueDefinitions corresponding to IA32_VMX_MISC, please update the index" + ); + def + } +} From 550d63361ace8394ae8926c44ca64b54203abc6f Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 11 Mar 2026 17:18:31 +0100 Subject: [PATCH 150/178] arch: Stricter MSR compatibility checks for IA32_ARCH_CAPABILITIES Due to the changes to IA32_ARCH_CAPABILITIES applied after the last code review we introduce stricter checks. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../intel/msr_based_features.rs | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs index 9b6bb40e90..bc36eb8581 100644 --- a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -4029,7 +4029,13 @@ fn check_arch_capabilities_compatibility( dest_id: &str, ) -> Result<(), ()> { // Make a mask out of + const RDCL_NO: u64 = 1 << 0; + const IBRS_ALL: u64 = 1 << 1; + const SKIP_L1_DFL_VMENTRY: u64 = 1 << 3; + const SSB_NO: u64 = 1 << 4; + const MDS_NO: u64 = 1 << 5; const TSX_CONTROL: u64 = 1 << 7; + const TAA_NO: u64 = 1 << 8; const MCU_CONTROL: u64 = 1 << 9; const MISC_PACKAGE_CTLS: u64 = 1 << 10; const ENERGY_FILTERING_CTL: u64 = 1 << 11; @@ -4037,10 +4043,14 @@ fn check_arch_capabilities_compatibility( const MCU_ENUMERATION: u64 = 1 << 16; const FB_CLEAR: u64 = 1 << 17; const FB_CLEAR_CTRL: u64 = 1 << 18; + const BHI_NO: u64 = 1 << 20; const XAPIC_DISABLE_STATUS: u64 = 1 << 21; const MCU_EXTENDED_SERVICE: u64 = 1 << 22; const OVERCLOCKING_STATUS: u64 = 1 << 23; + const PBRSB_NO: u64 = 1 << 24; const GDS_CTRL: u64 = 1 << 25; + const GDS_NO: u64 = 1 << 26; + const RFDS_NO: u64 = 1 << 27; // TODO: Should we perhaps ignore checking this (is it too strict)? 
const RFDS_CLEAR: u64 = 1 << 28; const IGN_UMONITOR_SUPPORT: u64 = 1 << 29; @@ -4048,7 +4058,13 @@ fn check_arch_capabilities_compatibility( const PBOPT_SUPPORT: u64 = 1 << 32; let mask: u64 = { - TSX_CONTROL + RDCL_NO + | IBRS_ALL + | SKIP_L1_DFL_VMENTRY + | SSB_NO + | MDS_NO + | TAA_NO + | TSX_CONTROL | MCU_CONTROL | MISC_PACKAGE_CTLS | ENERGY_FILTERING_CTL @@ -4064,6 +4080,10 @@ fn check_arch_capabilities_compatibility( | MON_UMON_MITG_SUPPORT | PBOPT_SUPPORT | RFDS_CLEAR + | PBRSB_NO + | GDS_NO + | RFDS_NO + | BHI_NO }; if let Err(only_in_src) = check_subset(src_val & mask, dest_val & mask) { error!( From 0869f9e878a961296ab22fe659cd0fd18343f27c Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Thu, 12 Feb 2026 23:33:49 +0100 Subject: [PATCH 151/178] arch: Add a list of all Intel architectural MSRS This list will be used to help us detect unknown MSRs when generating CPU profiles. It serves no other purpose beyond that. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .typos.toml | 2 + arch/src/x86_64/cpuid_definitions/intel.rs | 6 +- .../intel/architectural_msrs.rs | 1707 +++++++++++++++++ arch/src/x86_64/msr_definitions/intel/mod.rs | 8 + 4 files changed, 1722 insertions(+), 1 deletion(-) create mode 100644 arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs diff --git a/.typos.toml b/.typos.toml index 063770942f..ef9c7b96d0 100644 --- a/.typos.toml +++ b/.typos.toml @@ -5,6 +5,8 @@ extend-exclude = [ "hypervisor/src/kvm/x86_64/mod.rs", "resources/linux-config-*", ] +[default] +extend-ignore-re = ["_TME_"] [default.extend-words] CLASSE = "CLASSE" diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index f57785ff27..18cb6c8c01 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -332,7 +332,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { short: "tsc_deadline_timer", description: "APIC timer 
one-shot operation", bits_range: (24, 24), - policy: ProfilePolicy::Static(0), + policy: ProfilePolicy::Passthrough, }, ValueDefinition { short: "aes", @@ -1711,6 +1711,10 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { policy: ProfilePolicy::Inherit, }, // MSR related + // + // TODO: Is passthrough correct? + // If this bit is set then MSR IA32_FLUSH_CMD + // becomes available, otherwise it is not. ValueDefinition { short: "flush_l1d", description: "FLUSH L1D cache: IA32_FLUSH_CMD MSR", diff --git a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs new file mode 100644 index 0000000000..6099acd6a1 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs @@ -0,0 +1,1707 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +//! This module contains lists of architectural MSRs (or more accurately MSR register addresses) that +//! are permitted and forbidden for use with CPU profiles. +//! +//! The CPU profile generation tool obtains all MSRS supported by both KVM and the hardware +//! when it runs and uses the permitted list to only record those that are permitted. +//! +//! The list of forbidden architectural MSRs is only used to rule out "false" new MSRs that otherwise +//! would require updating the CPU profile generation tool. + +// We occasionally write doc comments for constants that are defined in private modules. This +// is still helpful for developers as the LSP can then provide information about the constants +// directly at the site(s) where they are being used. 
+#![allow(unused_doc_comments)] + +pub(in crate::x86_64) use forbidden_architectural_msrs::FORBIDDEN_IA32_MSR_RANGES; +pub(in crate::x86_64) use permitted_architectural_msrs::PERMITTED_IA32_MSRS; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::Parameters; +use crate::x86_64::cpuid_definitions::intel::assert_not_denied_cpuid_feature; + +mod permitted_architectural_msrs { + use read_only::READ_ONLY_IA32_MSRS; + use read_write::READ_WRITE_IA32_MSRS; + use write_only::WRITE_ONLY_IA32_MSRS; + + use super::{CpuidReg, Parameters}; + use crate::x86_64::msr_definitions::intel::architectural_msrs::assert_not_denied_cpuid_feature; + + mod read_only { + use super::{CpuidReg, Parameters, assert_not_denied_cpuid_feature}; + /// (R/O) + const IA32_BARRIER: u32 = 0x2f; + const _IA32_BARRIER_CPUID_CHECK: () = const { + assert_not_denied_cpuid_feature::<27>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EAX, + }); + }; + /// Overclocking Status (R/O) + const IA32_OVERCLOCKING_STATUS: u32 = 0x195; + // TODO: Also check consistency with IA32_ARCH_CAPABILITIES[23] + + /// xAPIC Disable Status (R/O) + const IA32_XAPIC_DISABLE_STATUS: u32 = 0xbd; + const _IA32_XAPIC_DISABLE_STATUS_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<29>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + // TODO: Also assert that IA32_ARCH_CAPABILITIES[21] is also not hard-coded to prevent + // this MSR from being accessed + + /// MTRR Capability (R/O) + const IA32_MTRRCAP: u32 = 0xfe; + + // TODO: Not sure whether the IA32_FZM_* msrs should be permitted + const IA32_FZM_DOMAIN_CONFIG: u32 = 0x83; + const IA32_FZM_RANGE_STARTADDR: u32 = 0x84; + const IA32_FZM_RANGE_ENDADDR: u32 = 0x85; + const IA32_FZM_RANGE_WRITESTATUS: u32 = 0x86; + + /// DCA Capability (R) + const IA32_PLATFORM_DCA_CAP: u32 = 0x1f8; + /// If set, CPU supports Prefetch-Hint type + const IA32_CPU_DCA_CAP: u32 = 0x1f9; + + const _IA32_DCA_CAP_CPUID_CHECK: () = 
assert_not_denied_cpuid_feature::<18>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + // TODO: Can we rather place this MSR in the deny list? + const IA32_MCU_STAGING_MBOX_ADDR: u32 = 0x7a5; + + // NOTE: THE X2APIC related MSRs cannot be filtered by KVM, but we include them here anyway for completeness sake. + const IA32_X2APIC_APICID: u32 = 0x802; + const IA32_X2APIC_VERSION: u32 = 0x803; + const IA32_X2APIC_PPR: u32 = 0x80a; + const IA32_X2APIC_LDR: u32 = 0x80d; + const IA32_X2APIC_ISR0: u32 = 0x810; + const IA32_X2APIC_ISR1: u32 = 0x811; + const IA32_X2APIC_ISR2: u32 = 0x812; + + const IA32_X2APIC_ISR3: u32 = 0x813; + const IA32_X2APIC_ISR4: u32 = 0x814; + const IA32_X2APIC_ISR5: u32 = 0x815; + const IA32_X2APIC_ISR6: u32 = 0x816; + const IA32_X2APIC_ISR7: u32 = 0x817; + const IA32_X2APIC_TMR0: u32 = 0x818; + const IA32_X2APIC_TMR1: u32 = 0x819; + const IA32_X2APIC_TMR2: u32 = 0x81a; + const IA32_X2APIC_TMR3: u32 = 0x81b; + const IA32_X2APIC_TMR4: u32 = 0x81c; + const IA32_X2APIC_TMR5: u32 = 0x81d; + const IA32_X2APIC_TMR6: u32 = 0x81e; + const IA32_X2APIC_TMR7: u32 = 0x81f; + const IA32_X2APIC_IRR0: u32 = 0x820; + const IA32_X2APIC_IRR1: u32 = 0x821; + const IA32_X2APIC_IRR2: u32 = 0x822; + const IA32_X2APIC_IRR3: u32 = 0x823; + const IA32_X2APIC_IRR4: u32 = 0x824; + const IA32_X2APIC_IRR5: u32 = 0x825; + const IA32_X2APIC_IRR6: u32 = 0x826; + const IA32_X2APIC_IRR7: u32 = 0x827; + const IA32_X2APIC_CUR_COUNT: u32 = 0x839; + + pub(super) const READ_ONLY_IA32_MSRS: [u32; 40] = [ + IA32_BARRIER, + IA32_MTRRCAP, + IA32_OVERCLOCKING_STATUS, + IA32_XAPIC_DISABLE_STATUS, + IA32_FZM_DOMAIN_CONFIG, + IA32_FZM_RANGE_STARTADDR, + IA32_FZM_RANGE_ENDADDR, + IA32_FZM_RANGE_WRITESTATUS, + IA32_PLATFORM_DCA_CAP, + IA32_CPU_DCA_CAP, + IA32_MCU_STAGING_MBOX_ADDR, + IA32_X2APIC_APICID, + IA32_X2APIC_VERSION, + IA32_X2APIC_PPR, + IA32_X2APIC_LDR, + IA32_X2APIC_ISR0, + IA32_X2APIC_ISR1, + IA32_X2APIC_ISR2, + IA32_X2APIC_ISR3, + IA32_X2APIC_ISR4, 
+ IA32_X2APIC_ISR5, + IA32_X2APIC_ISR6, + IA32_X2APIC_ISR7, + IA32_X2APIC_TMR0, + IA32_X2APIC_TMR1, + IA32_X2APIC_TMR2, + IA32_X2APIC_TMR3, + IA32_X2APIC_TMR4, + IA32_X2APIC_TMR5, + IA32_X2APIC_TMR6, + IA32_X2APIC_TMR7, + IA32_X2APIC_IRR0, + IA32_X2APIC_IRR1, + IA32_X2APIC_IRR2, + IA32_X2APIC_IRR3, + IA32_X2APIC_IRR4, + IA32_X2APIC_IRR5, + IA32_X2APIC_IRR6, + IA32_X2APIC_IRR7, + IA32_X2APIC_CUR_COUNT, + ]; + } + + mod read_write { + use super::{CpuidReg, Parameters, assert_not_denied_cpuid_feature}; + // TODO: Not sure if we need to permit this + const IA32_P5_MC_ADDR: u32 = 0x0; + // TODO: Not sure if we need to permit this + const IA32_P5_MC_TYPE: u32 = 0x1; + + // TODO: Is this also write? + const IA32_TIME_STAMP_COUNTER: u32 = 0x10; + + const IA32_APIC_BASE: u32 = 0x1b; + + const IA32_FEATURE_CONTROL: u32 = 0x3a; + + /// Per Logical Processor TSC Adjust (R/Write to clear) + const IA32_TSC_ADJUST: u32 = 0x3b; + const _IA32_TSC_ADJUST_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<1>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EBX, + }); + + const IA32_SPEC_CTRL: u32 = 0x48; + const _IA32_SPECT_CTRL_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<26>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + const IA32_MCU_OPT_CTRL: u32 = 0x123; + const _IA32_MCU_OPT_CTRL_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<9>(&Parameters { + leaf: 0x7, + sub_leaf: (0..=0), + register: CpuidReg::EDX, + }); + + /// SYSENTER_CS_MSR + const IA32_SYSENTER_CS: u32 = 0x174; + + /// SYSENTER_ESP_MSR + const IA32_SYSENTER_ESP: u32 = 0x175; + + /// SYSENTER_ESP_MSR + const IA32_SYSENTER_EIP: u32 = 0x176; + + // TODO: Does it really make sense to permit this MSR? + const IA32_SMM_MONITOR_CTL: u32 = 0x9b; + const _IA32_SMM_MONITOR_CTL_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<5>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + /// Enable Misc. 
Processr Features + const IA32_MISC_ENABLE: u32 = 0x1a0; + + // TODO: Not sure what this does and whether it should be enabled + const IA32_FZM_RANGE_INDEX: u32 = 0x82; + + const IA32_XFD: u32 = 0x1c4; + const IA32_XFD_ERR: u32 = 0x1c5; + + // TODO: Not sure about SMRR_* (note that they are writable only in SMM) + const IA32_SMRR_PHYSBASE: u32 = 0x1f2; + const IA32_SMRR_PHYSMASK: u32 = 0x1f3; + + const IA32_DCA_0_CAP: u32 = 0x1fa; + + const _IA32_DCA_0_CAP_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<18>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + const IA32_MTRR_PHYSBASE0: u32 = 0x200; + const IA32_MTRR_PHYSMASK0: u32 = 0x201; + const IA32_MTRR_PHYSBASE1: u32 = 0x202; + const IA32_MTRR_PHYSMASK1: u32 = 0x203; + const IA32_MTRR_PHYSBASE2: u32 = 0x204; + const IA32_MTRR_PHYSMASK2: u32 = 0x205; + const IA32_MTRR_PHYSBASE3: u32 = 0x206; + const IA32_MTRR_PHYSMASK3: u32 = 0x207; + const IA32_MTRR_PHYSBASE4: u32 = 0x208; + const IA32_MTRR_PHYSMASK4: u32 = 0x209; + const IA32_MTRR_PHYSBASE5: u32 = 0x20a; + const IA32_MTRR_PHYSMASK5: u32 = 0x20b; + const IA32_MTRR_PHYSBASE6: u32 = 0x20c; + const IA32_MTRR_PHYSMASK6: u32 = 0x20d; + const IA32_MTRR_PHYSBASE7: u32 = 0x20e; + const IA32_MTRR_PHYSMASK7: u32 = 0x20f; + const IA32_MTRR_PHYSBASE8: u32 = 0x210; + const IA32_MTRR_PHYSMASK8: u32 = 0x211; + const IA32_MTRR_PHYSBASE9: u32 = 0x212; + const IA32_MTRR_PHYSMASK9: u32 = 0x213; + + // TODO: Are these actually READ + Write? 
+ const IA32_MTRR_FIX64K_00000: u32 = 0x250; + const IA32_MTRR_FIX16K_80000: u32 = 0x258; + const IA32_MTRR_FIX16K_A0000: u32 = 0x259; + const IA32_MTRR_FIX4K_C0000: u32 = 0x268; + const IA32_MTRR_FIX4K_C8000: u32 = 0x269; + const IA32_MTRR_FIX4K_D0000: u32 = 0x26a; + const IA32_MTRR_FIX4K_D8000: u32 = 0x26b; + const IA32_MTRR_FIX4K_E0000: u32 = 0x26c; + const IA32_MTRR_FIX4K_E8000: u32 = 0x26d; + const IA32_MTRR_FIX4K_F0000: u32 = 0x26e; + const IA32_MTRR_FIX4K_F8000: u32 = 0x26f; + + const _IA32_MTRR_FIX_I_X_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<12>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + const IA32_PAT: u32 = 0x277; + const _IA32_PAT_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<16>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + const IA32_MTRR_DEF_TYPE: u32 = 0x2ff; + + const IA32_U_CET: u32 = 0x6a0; + const IA32_S_CET: u32 = 0x6a2; + + const IA32_TSC_DEADLINE: u32 = 0x6e0; + const _IA32_TSC_DEADLINE_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<24>(&Parameters { + leaf: 0x1, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + // TODO: We should probably not permit this if possible + const IA32_PECI_HWP_REQUEST_INFO: u32 = 0x775; + + // NOTE: THE X2APIC related MSRs cannot be filtered by KVM, but we include them here anyway for completeness sake. 
+ const IA32_X2APIC_TPR: u32 = 0x808; + const IA32_X2APIC_SIVR: u32 = 0x80f; + + const IA32_X2APIC_ESR: u32 = 0x828; + const IA32_X2APIC_LVT_CMCI: u32 = 0x82f; + const IA32_X2APIC_ICR: u32 = 0x830; + const IA32_X2APIC_LVT_TIMER: u32 = 0x832; + const IA32_X2APIC_LVT_THERMAL: u32 = 0x833; + const IA32_X2APIC_LVT_PMI: u32 = 0x834; + const IA32_X2APIC_LVT_LINT0: u32 = 0x835; + + const IA32_X2APIC_LVT_LINT1: u32 = 0x836; + const IA32_X2APIC_LVT_ERROR: u32 = 0x837; + const IA32_X2APIC_INIT_COUNT: u32 = 0x838; + const IA32_X2APIC_DIV_CONF: u32 = 0x83e; + + // TODO: Not sure about this MSR + const IA32_RESOURCE_PRIORITY: u32 = 0xc88; + // TODO: Not sure about this MSR + const IA32_RESOURCE_PRIORITY_PKG: u32 = 0xc89; + + const IA32_PASID: u32 = 0xd93; + + const IA32_XSS: u32 = 0xda0; + const _IA32_XSS_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<3>(&Parameters { + leaf: 0xd, + sub_leaf: 1..=1, + register: CpuidReg::EAX, + }); + + /// Extended Feature Enable + const IA32_EFER: u32 = 0xc0000080; + + const IA32_STAR: u32 = 0xc000_0081; + const IA32_LSTAR: u32 = 0xc000_0082; + const IA32_CSTAR: u32 = 0xc000_0083; + const IA32_FMASK: u32 = 0xc000_0084; + const IA32_FS_BASE: u32 = 0xc000_0100; + const IA32_GS_BASE: u32 = 0xc000_0101; + const IA32_KERNEL_GS_BASE: u32 = 0xc000_0102; + const _IA32_EFER_UPTO_IA32_KERNEL_GS_BASE_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<29>(&Parameters { + leaf: 0x80000001, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + const IA32_TSC_AUX: u32 = 0xc000_0103; + // NOTE That either the following has to pass, or the same test with 0x80000001.EDX[27] + const _IA32_TSC_AUX_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<22>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::ECX, + }); + + const IA32_UARCH_MISC_CTL: u32 = 0x1b01; + // TODO: Check against IA32_ARCH_CAPABILITIES[12] + pub(super) const READ_WRITE_IA32_MSRS: [u32; 83] = [ + IA32_P5_MC_ADDR, + IA32_P5_MC_TYPE, + IA32_TIME_STAMP_COUNTER, + 
IA32_APIC_BASE, + IA32_FEATURE_CONTROL, + IA32_TSC_ADJUST, + IA32_SPEC_CTRL, + IA32_MCU_OPT_CTRL, + IA32_SYSENTER_CS, + IA32_SYSENTER_ESP, + IA32_SYSENTER_EIP, + IA32_SMM_MONITOR_CTL, + IA32_MISC_ENABLE, + IA32_FZM_RANGE_INDEX, + IA32_XFD, + IA32_XFD_ERR, + IA32_SMRR_PHYSBASE, + IA32_SMRR_PHYSMASK, + IA32_DCA_0_CAP, + IA32_MTRR_PHYSBASE0, + IA32_MTRR_PHYSMASK0, + IA32_MTRR_PHYSBASE1, + IA32_MTRR_PHYSMASK1, + IA32_MTRR_PHYSBASE2, + IA32_MTRR_PHYSMASK2, + IA32_MTRR_PHYSBASE3, + IA32_MTRR_PHYSMASK3, + IA32_MTRR_PHYSBASE4, + IA32_MTRR_PHYSMASK4, + IA32_MTRR_PHYSBASE5, + IA32_MTRR_PHYSMASK5, + IA32_MTRR_PHYSBASE6, + IA32_MTRR_PHYSMASK6, + IA32_MTRR_PHYSBASE7, + IA32_MTRR_PHYSMASK7, + IA32_MTRR_PHYSBASE8, + IA32_MTRR_PHYSMASK8, + IA32_MTRR_PHYSBASE9, + IA32_MTRR_PHYSMASK9, + IA32_MTRR_FIX64K_00000, + IA32_MTRR_FIX16K_80000, + IA32_MTRR_FIX16K_A0000, + IA32_MTRR_FIX4K_C0000, + IA32_MTRR_FIX4K_C8000, + IA32_MTRR_FIX4K_D0000, + IA32_MTRR_FIX4K_D8000, + IA32_MTRR_FIX4K_E0000, + IA32_MTRR_FIX4K_E8000, + IA32_MTRR_FIX4K_F0000, + IA32_MTRR_FIX4K_F8000, + IA32_PAT, + IA32_MTRR_DEF_TYPE, + IA32_U_CET, + IA32_S_CET, + IA32_TSC_DEADLINE, + IA32_PECI_HWP_REQUEST_INFO, + IA32_X2APIC_TPR, + IA32_X2APIC_SIVR, + IA32_X2APIC_ESR, + IA32_X2APIC_LVT_CMCI, + IA32_X2APIC_ICR, + IA32_X2APIC_LVT_TIMER, + IA32_X2APIC_LVT_THERMAL, + IA32_X2APIC_LVT_PMI, + IA32_X2APIC_LVT_LINT0, + IA32_X2APIC_LVT_LINT1, + IA32_X2APIC_LVT_ERROR, + IA32_X2APIC_INIT_COUNT, + IA32_X2APIC_DIV_CONF, + IA32_RESOURCE_PRIORITY, + IA32_RESOURCE_PRIORITY_PKG, + IA32_PASID, + IA32_XSS, + IA32_EFER, + IA32_STAR, + IA32_LSTAR, + IA32_CSTAR, + IA32_FMASK, + IA32_FS_BASE, + IA32_GS_BASE, + IA32_KERNEL_GS_BASE, + IA32_TSC_AUX, + IA32_UARCH_MISC_CTL, + ]; + } + + mod write_only { + use super::{CpuidReg, Parameters, assert_not_denied_cpuid_feature}; + + /// Prediction Command (WO) + const IA32_PRED_CMD: u32 = 0x49; + const _IA32_PRED_CMD_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<26>(&Parameters { + leaf: 0x7, + sub_leaf: 
0..=0, + register: CpuidReg::EDX, + }); + + /// Flush Command (WO) + const IA32_FLUSH_CMD: u32 = 0x10b; + + // TODO: Should probably use inherit policy here + const _IA32_FLUSH_CMD_CPUID_CHECK: () = + assert_not_denied_cpuid_feature::<28>(&Parameters { + leaf: 0x7, + sub_leaf: 0..=0, + register: CpuidReg::EDX, + }); + + // x2APIC-related MSRs cannot be filtered by KVM, but we include them here anyway for completeness' sake + const IA32_X2APIC_EOI: u32 = 0x80b; + + const IA32_X2APIC_SELF_IPI: u32 = 0x83f; + + pub(super) const WRITE_ONLY_IA32_MSRS: [u32; 4] = [ + IA32_PRED_CMD, + IA32_FLUSH_CMD, + IA32_X2APIC_EOI, + IA32_X2APIC_SELF_IPI, + ]; + } + + /// A list of permitted Intel IA32 MSRs that are not considered MSR-based feature indices + /// by KVM. + /// + /// The MSRs listed here can be studied further in Table 2.2 in Section 2.1 of the Intel SDM + /// Vol. 4 from October 2025 + /// + /// The array is assembled at compile time by concatenating `READ_ONLY_IA32_MSRS`, + /// `WRITE_ONLY_IA32_MSRS` and `READ_WRITE_IA32_MSRS`, in that order. The `assert!` in the + /// initializer fails the build if the three lengths do not sum to the declared array length. + pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 127] = const { + let mut permitted = [0u32; 127]; + let read_only_len = READ_ONLY_IA32_MSRS.len(); + let write_only_len = WRITE_ONLY_IA32_MSRS.len(); + let read_write_len = READ_WRITE_IA32_MSRS.len(); + assert!(permitted.len() == (read_only_len + write_only_len + read_write_len)); + // `idx` is the offset in `permitted` at which the list currently being copied starts. + // Plain `while` loops are used because `for` loops are not available in const evaluation. + let mut idx = 0; + // Insert read only msrs + { + let mut i = 0; + while i < read_only_len { + permitted[idx + i] = READ_ONLY_IA32_MSRS[i]; + i += 1; + } + idx += read_only_len; + } + // Insert write only msrs + { + let mut i = 0; + while i < write_only_len { + permitted[idx + i] = WRITE_ONLY_IA32_MSRS[i]; + i += 1; + } + idx += write_only_len; + } + // Insert read & write msrs + { + let mut i = 0; + while i < read_write_len { + permitted[idx + i] = READ_WRITE_IA32_MSRS[i]; + i += 1; + } + } + permitted + }; +} + +mod forbidden_architectural_msrs { + // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE + + const IA32_MONITOR_FILTER_SIZE: (u32, u32) = (0x6, 0x6); + // TODO: Not sure about this one + const IA32_PLATFORM_ID: (u32, u32) = (0x17, 0x17); + +
/// Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + const IA32_PPIN_CTL: (u32, u32) = (0x4e, 0x4e); + + /// Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + const IA32_PPIN: (u32, u32) = (0x4f, 0x4f); + + /// Used for microcode updates. Should not be available for guests. + const IA32_BIOS_UPDT_TRIG: (u32, u32) = (0x79, 0x79); + + /// Currently only related to Secure enclaves/Keylocker which is not available for non-host CPU profiles + const IA32_FEATURE_ACTIVATION: (u32, u32) = (0x7a, 0x7a); + + /// Related to microcode updates + const IA32_MCU_ENUMERATION: (u32, u32) = (0x7b, 0x7b); + + const IA32_MCU_STATUS: (u32, u32) = (0x7c, 0x7c); + + /// Related to total memory encryption + const IA32_MKTME_KEYID_PARTITIONING: (u32, u32) = (0x87, 0x87); + + // TODO: Not sure what to do about IA32_BIOS_SIGN_ID (note that it is also an MSR-based feature according to KVM) + + const IA32_SGXLEPUBKEYHASH0: (u32, u32) = (0x8c, 0x8c); + + const IA32_SGXLEPUBKEYHASH1: (u32, u32) = (0x8d, 0x8d); + + const IA32_SGXLEPUBKEYHASH2: (u32, u32) = (0x8e, 0x8e); + + const IA32_SGXLEPUBKEYHASH3: (u32, u32) = (0x8f, 0x8f); + + const IA32_SGXLEPUBKEYHASH4: (u32, u32) = (0x90, 0x90); + + const IA32_SGXLEPUBKEYHASH5: (u32, u32) = (0x91, 0x91); + + // TODO: Check this + const IA32_SMBASE: (u32, u32) = (0x9e, 0x9e); + + const IA32_MISC_PACKAGE_CTLS: (u32, u32) = (0xbc, 0xbc); + + /// Clock Modulation Control + /// This is disabled via CPUID for non-host CPU profiles + const IA32_CLOCK_MODULATION: (u32, u32) = (0x19a, 0x19a); + + // TODO: IA32_X2APIC_DISABLE_STATUS + + // IA32_PLI_SSP (one range covering IA32_PL0_SSP through IA32_PL3_SSP) is disabled via CPUID for non-host profiles + const IA32_PLI_SSP: (u32, u32) = (0x6a4, 0x6a7); + + // This is disabled via CPUID for non-host profiles + const IA32_INTERRUPT_SSP_TABLE_ADDR: (u32, u32) = (0x6a8, 0x6a8); + + const IA32_PMC0: (u32, u32) = (0xc1, 0xc1); + const IA32_PMC1: (u32, u32) = (0xc2, 0xc2); + const IA32_PMC2:
(u32, u32) = (0xc3, 0xc3); + const IA32_PMC3: (u32, u32) = (0xc4, 0xc4); + const IA32_PMC4: (u32, u32) = (0xc5, 0xc5); + const IA32_PMC5: (u32, u32) = (0xc6, 0xc6); + const IA32_PMC6: (u32, u32) = (0xc7, 0xc7); + const IA32_PMC7: (u32, u32) = (0xc8, 0xc8); + const IA32_PMC8: (u32, u32) = (0xc9, 0xc9); + const IA32_PMC9: (u32, u32) = (0xca, 0xca); + + const IA32_CORE_CAPABILITIES: (u32, u32) = (0xcf, 0xcf); + + // TODO: Do we really want to forbid this MSR? + const IA32_UMWAIT_CONTROL: (u32, u32) = (0xe1, 0xe1); + + // Disabled by CPUID for non-host CPU profiles + const IA32_MPERF: (u32, u32) = (0xe7, 0xe7); + + const IA32_APERF: (u32, u32) = (0xe8, 0xe8); + + const IA32_TSX_FORCE_ABORT: (u32, u32) = (0x10f, 0x10f); + + // Disabled via static IA32_ARCH_CAPABILITIES bit for non-host CPU profiles + const IA32_TSX_CTRL: (u32, u32) = (0x122, 0x122); + + // NOTE: IA32_MCU_OPT_CTRL must necessarily be available, due to + // what we set in CPUID for some CPU profiles (inherit policy) + + // TODO: Don't know about IA32_SYSENTER_CS, IA32_SYSENTER_ESP, + // IA32_SYSENTER_EIP + // + + // TODO: Not sure if we can/should deny this MSR, but + // it doesn't really make sense to have it available in + // a virtualized environment + // + // If we keep it denied we should document that + // even for 06_01H one cannot rely on the existence of this MSR + const IA32_MCG_CAP: (u32, u32) = (0x179, 0x179); + + // TODO: Also not sure if we may deny this MSR + const IA32_MCG_STATUS: (u32, u32) = (0x17a, 0x17a); + + // TODO: Can we deny this? 
+ const IA32_MCG_CTL: (u32, u32) = (0x17b, 0x17b); + + // TODO: 0x180- 0x185 is reserved, we should not list these MSRS at all + + /// Disabled via CPUID for all non-host CPU profiles + const IA32_PERFEVTSEL0: (u32, u32) = (0x186, 0x186); + const IA32_PERFEVTSEL1: (u32, u32) = (0x187, 0x187); + const IA32_PERFEVTSEL2: (u32, u32) = (0x188, 0x188); + const IA32_PERFEVTSEL3: (u32, u32) = (0x189, 0x189); + const IA32_PERFEVTSEL4: (u32, u32) = (0x18a, 0x18a); + const IA32_PERFEVTSEL5: (u32, u32) = (0x18b, 0x18b); + const IA32_PERFEVTSEL6: (u32, u32) = (0x18c, 0x18c); + const IA32_PERFEVTSEL7: (u32, u32) = (0x18d, 0x18d); + const IA32_PERFEVTSEL8: (u32, u32) = (0x18e, 0x18e); + const IA32_PERFEVTSEL9: (u32, u32) = (0x18f, 0x18f); + + // TODO: 0x18a - 0x194 is reserved and should not be included in any list + + // TODO: 0x196, 197 is reserved and should not be included in any list + // + + const IA32_PERF_STATUS: (u32, u32) = (0x198, 0x198); + + const IA32_PERF_CTL: (u32, u32) = (0x199, 0x199); + + // Disabled via CPUID for non-host profiles + const IA32_THERM_INTERRUPT: (u32, u32) = (0x19b, 0x19b); + + // Disabled via CPUID for non-host profiles + const IA32_THERM_STATUS: (u32, u32) = (0x19c, 0x19c); + + // TODO: Consider disabling IA32_MISC_ENABLE + + // Disabled via CPUID for non-host profiles + const IA32_ENERGY_PERF_BIAS: (u32, u32) = (0x1b0, 0x1b0); + + // Disabled via CPUID for non-host profiles + const IA32_PACKAGE_THERM_STATUS: (u32, u32) = (0x1b1, 0x1b1); + + // Disabled via CPUID for non-host profiles + const IA32_PACKAGE_THERM_INTERRUPT: (u32, u32) = (0x1b2, 0x1b2); + + const IA32_DEBUGCTL: (u32, u32) = (0x1d9, 0x1d9); + + const IA32_LER_FROM_IP: (u32, u32) = (0x1dd, 0x1dd); + + const IA32_LER_TO_IP: (u32, u32) = (0x1de, 0x1de); + + const IA32_LER_INFO: (u32, u32) = (0x1e0, 0x1e0); + + // TODO: Not sure about IA32_SMRR_PHYSBASE & IA32_SMRR_PHYSMASK + + const IA32_MC_I_CTL2: (u32, u32) = (0x280, 0x29f); + + // Disabled via CPUID for non-host profiles + const 
IA32_INTEGRITY_STATUS: (u32, u32) = (0x2dc, 0x2dc); + + const IA32_FIXED_CTRI: (u32, u32) = (0x309, 0x30f); + + // IA32_PERF_CAPABILITIES is an MSR-based feature thus not listed here + + // Disabled via CPUID for non-host profiles + const IA32_FIXED_CTR_CTRL: (u32, u32) = (0x38d, 0x38d); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_STATUS: (u32, u32) = (0x38e, 0x38e); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_CTRL: (u32, u32) = (0x38f, 0x38f); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_STATUS_RESET: (u32, u32) = (0x390, 0x390); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_STATUS_SET: (u32, u32) = (0x391, 0x391); + + // Disabled via CPUID for non-host profiles + const IA32_PERF_GLOBAL_INUSE: (u32, u32) = (0x392, 0x392); + + // TODO: Not sure about this one, but seems to be related to performance monitoring which + // should be disabled for non-host CPU profiles. + const IA32_PEBS_ENABLE: (u32, u32) = (0x3f1, 0x3f1); + + const IA32_MC0_CTL: (u32, u32) = (0x400, 0x400); + const IA32_MC0_STATUS: (u32, u32) = (0x401, 0x401); + const IA32_MC0_ADDR: (u32, u32) = (0x402, 0x402); + const IA32_MC0_MISC: (u32, u32) = (0x403, 0x403); + const IA32_MC1_CTL: (u32, u32) = (0x404, 0x404); + const IA32_MC1_STATUS: (u32, u32) = (0x405, 0x405); + const IA32_MC1_ADDR: (u32, u32) = (0x406, 0x406); + + const IA32_MC1_MISC: (u32, u32) = (0x407, 0x407); + const IA32_MC2_CTL: (u32, u32) = (0x408, 0x408); + const IA32_MC2_STATUS: (u32, u32) = (0x409, 0x409); + const IA32_MC2_ADDR: (u32, u32) = (0x40a, 0x40a); + const IA32_MC2_MISC: (u32, u32) = (0x40b, 0x40b); + const IA32_MC3_CTL: (u32, u32) = (0x40c, 0x40c); + const IA32_MC3_STATUS: (u32, u32) = (0x40d, 0x40d); + const IA32_MC3_ADDR1: (u32, u32) = (0x40e, 0x40e); + const IA32_MC3_MISC: (u32, u32) = (0x40f, 0x40f); + const IA32_MC4_CTL: (u32, u32) = (0x410, 0x410); + const IA32_MC4_STATUS: (u32, u32) = (0x411, 0x411); + const 
IA32_MC4_ADDR: (u32, u32) = (0x412, 0x412); + const IA32_MC4_MISC: (u32, u32) = (0x413, 0x413); + const IA32_MC5_CTL: (u32, u32) = (0x414, 0x414); + const IA32_MC5_STATUS: (u32, u32) = (0x415, 0x415); + const IA32_MC5_ADDR: (u32, u32) = (0x416, 0x416); + const IA32_MC5_MISC: (u32, u32) = (0x417, 0x417); + const IA32_MC6_CTL: (u32, u32) = (0x418, 0x418); + + const IA32_MC6_STATUS: (u32, u32) = (0x419, 0x419); + const IA32_MC6_ADDR1: (u32, u32) = (0x41a, 0x41a); + const IA32_MC6_MISC: (u32, u32) = (0x41b, 0x41b); + const IA32_MC7_CTL: (u32, u32) = (0x41c, 0x41c); + const IA32_MC7_STATUS: (u32, u32) = (0x41d, 0x41d); + const IA32_MC7_ADDR: (u32, u32) = (0x41e, 0x41e); + const IA32_MC7_MISC: (u32, u32) = (0x41f, 0x41f); + const IA32_MC8_CTL: (u32, u32) = (0x420, 0x420); + const IA32_MC8_STATUS: (u32, u32) = (0x421, 0x421); + const IA32_MC8_ADDR: (u32, u32) = (0x422, 0x422); + const IA32_MC8_MISC: (u32, u32) = (0x423, 0x423); + const IA32_MC9_CTL: (u32, u32) = (0x424, 0x424); + const IA32_MC9_STATUS: (u32, u32) = (0x425, 0x425); + const IA32_MC9_ADDR: (u32, u32) = (0x426, 0x426); + const IA32_MC9_MISC: (u32, u32) = (0x427, 0x427); + const IA32_MC10_CTL: (u32, u32) = (0x428, 0x428); + const IA32_MC10_STATUS: (u32, u32) = (0x429, 0x429); + const IA32_MC10_ADDR: (u32, u32) = (0x42a, 0x42a); + const IA32_MC10_MISC: (u32, u32) = (0x42b, 0x42b); + + const IA32_MC11_CTL: (u32, u32) = (0x42c, 0x42c); + const IA32_MC11_STATUS: (u32, u32) = (0x42d, 0x42d); + const IA32_MC11_ADDR: (u32, u32) = (0x42e, 0x42e); + const IA32_MC11_MISC: (u32, u32) = (0x42f, 0x42f); + const IA32_MC12_CTL: (u32, u32) = (0x430, 0x430); + const IA32_MC12_STATUS: (u32, u32) = (0x431, 0x431); + const IA32_MC12_ADDR: (u32, u32) = (0x432, 0x432); + const IA32_MC12_MISC: (u32, u32) = (0x433, 0x433); + const IA32_MC13_CTL: (u32, u32) = (0x434, 0x434); + const IA32_MC13_STATUS: (u32, u32) = (0x435, 0x435); + const IA32_MC13_ADDR: (u32, u32) = (0x436, 0x436); + const IA32_MC13_MISC: (u32, u32) = (0x437, 0x437); + 
const IA32_MC14_CTL: (u32, u32) = (0x438, 0x438); + const IA32_MC14_STATUS: (u32, u32) = (0x439, 0x439); + const IA32_MC14_ADDR: (u32, u32) = (0x43a, 0x43a); + const IA32_MC14_MISC: (u32, u32) = (0x43b, 0x43b); + const IA32_MC15_CTL: (u32, u32) = (0x43c, 0x43c); + const IA32_MC15_STATUS: (u32, u32) = (0x43d, 0x43d); + + const IA32_MC15_ADDR: (u32, u32) = (0x43e, 0x43e); + const IA32_MC15_MISC: (u32, u32) = (0x43f, 0x43f); + const IA32_MC16_CTL: (u32, u32) = (0x440, 0x440); + const IA32_MC16_STATUS: (u32, u32) = (0x441, 0x441); + const IA32_MC16_ADDR: (u32, u32) = (0x442, 0x442); + const IA32_MC16_MISC: (u32, u32) = (0x443, 0x443); + const IA32_MC17_CTL: (u32, u32) = (0x444, 0x444); + const IA32_MC17_STATUS: (u32, u32) = (0x445, 0x445); + const IA32_MC17_ADDR: (u32, u32) = (0x446, 0x446); + const IA32_MC17_MISC: (u32, u32) = (0x447, 0x447); + const IA32_MC18_CTL: (u32, u32) = (0x448, 0x448); + const IA32_MC18_STATUS: (u32, u32) = (0x449, 0x449); + const IA32_MC18_ADDR: (u32, u32) = (0x44a, 0x44a); + const IA32_MC18_MISC: (u32, u32) = (0x44b, 0x44b); + const IA32_MC19_CTL: (u32, u32) = (0x44c, 0x44c); + const IA32_MC19_STATUS: (u32, u32) = (0x44d, 0x44d); + const IA32_MC19_ADDR: (u32, u32) = (0x44e, 0x44e); + const IA32_MC19_MISC: (u32, u32) = (0x44f, 0x44f); + const IA32_MC20_CTL: (u32, u32) = (0x450, 0x450); + + const IA32_MC20_STATUS: (u32, u32) = (0x451, 0x451); + const IA32_MC20_ADDR: (u32, u32) = (0x452, 0x452); + const IA32_MC20_MISC: (u32, u32) = (0x453, 0x453); + const IA32_MC21_CTL: (u32, u32) = (0x454, 0x454); + const IA32_MC21_STATUS: (u32, u32) = (0x455, 0x455); + const IA32_MC21_ADDR: (u32, u32) = (0x456, 0x456); + const IA32_MC21_MISC: (u32, u32) = (0x457, 0x457); + const IA32_MC22_CTL: (u32, u32) = (0x458, 0x458); + const IA32_MC22_STATUS: (u32, u32) = (0x459, 0x459); + const IA32_MC22_ADDR: (u32, u32) = (0x45a, 0x45a); + const IA32_MC22_MISC: (u32, u32) = (0x45b, 0x45b); + const IA32_MC23_CTL: (u32, u32) = (0x45c, 0x45c); + const IA32_MC23_STATUS: 
(u32, u32) = (0x45d, 0x45d); + const IA32_MC23_ADDR: (u32, u32) = (0x45e, 0x45e); + const IA32_MC23_MISC: (u32, u32) = (0x45f, 0x45f); + const IA32_MC24_CTL: (u32, u32) = (0x460, 0x460); + const IA32_MC24_STATUS: (u32, u32) = (0x461, 0x461); + const IA32_MC24_ADDR: (u32, u32) = (0x462, 0x462); + + const IA32_MC24_MISC: (u32, u32) = (0x463, 0x463); + const IA32_MC25_CTL: (u32, u32) = (0x464, 0x464); + const IA32_MC25_STATUS: (u32, u32) = (0x465, 0x465); + const IA32_MC25_ADDR: (u32, u32) = (0x466, 0x466); + const IA32_MC25_MISC: (u32, u32) = (0x467, 0x467); + const IA32_MC26_CTL: (u32, u32) = (0x468, 0x468); + const IA32_MC26_STATUS: (u32, u32) = (0x469, 0x469); + const IA32_MC26_ADDR: (u32, u32) = (0x46a, 0x46a); + const IA32_MC26_MISC: (u32, u32) = (0x46b, 0x46b); + const IA32_MC27_CTL: (u32, u32) = (0x46c, 0x46c); + const IA32_MC27_STATUS: (u32, u32) = (0x46d, 0x46d); + const IA32_MC27_ADDR: (u32, u32) = (0x46e, 0x46e); + const IA32_MC27_MISC: (u32, u32) = (0x46f, 0x46f); + const IA32_MC28_CTL: (u32, u32) = (0x470, 0x470); + const IA32_MC28_STATUS: (u32, u32) = (0x471, 0x471); + const IA32_MC28_ADDR: (u32, u32) = (0x472, 0x472); + const IA32_MC28_MISC: (u32, u32) = (0x473, 0x473); + const IA32_MC29_CTL: (u32, u32) = (0x474, 0x474); + const IA32_MC29_STATUS: (u32, u32) = (0x475, 0x475); + + const IA32_MC29_ADDR: (u32, u32) = (0x476, 0x476); + const IA32_MC29_MISC: (u32, u32) = (0x477, 0x477); + const IA32_MC30_CTL: (u32, u32) = (0x478, 0x478); + const IA32_MC30_STATUS: (u32, u32) = (0x479, 0x479); + const IA32_MC30_ADDR: (u32, u32) = (0x47a, 0x47a); + const IA32_MC30_MISC: (u32, u32) = (0x47b, 0x47b); + const IA32_MC31_CTL: (u32, u32) = (0x47c, 0x47c); + const IA32_MC31_STATUS: (u32, u32) = (0x47d, 0x47d); + const IA32_MC31_ADDR: (u32, u32) = (0x47e, 0x47e); + const IA32_MC31_MISC: (u32, u32) = (0x47f, 0x47f); + + const IA32_A_PMC0: (u32, u32) = (0x4c1, 0x4c1); + const IA32_A_PMC1: (u32, u32) = (0x4c2, 0x4c2); + const IA32_A_PMC2: (u32, u32) = (0x4c3, 0x4c3); + 
const IA32_A_PMC3: (u32, u32) = (0x4c4, 0x4c4); + const IA32_A_PMC4: (u32, u32) = (0x4c5, 0x4c5); + const IA32_A_PMC5: (u32, u32) = (0x4c6, 0x4c6); + const IA32_A_PMC6: (u32, u32) = (0x4c7, 0x4c7); + const IA32_A_PMC7: (u32, u32) = (0x4c8, 0x4c8); + const IA32_A_PMC8: (u32, u32) = (0x4c9, 0x4c9); + const IA32_A_PMC9: (u32, u32) = (0x4ca, 0x4ca); + + const IA32_MCG_EXT_CTL: (u32, u32) = (0x4d0, 0x4d0); + + // SGX is disabled via CPUID for non-host CPU profiles + const IA32_SGX_SVN_STATUS: (u32, u32) = (0x500, 0x500); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_OUTPUT_BASE: (u32, u32) = (0x560, 0x560); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_OUTPUT_MASK_PTRS: (u32, u32) = (0x561, 0x561); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_CTL: (u32, u32) = (0x570, 0x570); + + // Disabled via CPUID for non-host CPU profiles + const IA32_RTIT_STATUS: (u32, u32) = (0x571, 0x571); + + // Disabled via CPU profiles + const IA32_RTIT_CR3_MATCH: (u32, u32) = (0x572, 0x572); + + const IA32_RTIT_ADDR0_A: (u32, u32) = (0x580, 0x580); + const IA32_RTIT_ADDR0_B: (u32, u32) = (0x581, 0x581); + const IA32_RTIT_ADDR1_A: (u32, u32) = (0x582, 0x582); + const IA32_RTIT_ADDR1_B: (u32, u32) = (0x583, 0x583); + const IA32_RTIT_ADDR2_A: (u32, u32) = (0x584, 0x584); + const IA32_RTIT_ADDR2_B: (u32, u32) = (0x585, 0x585); + const IA32_RTIT_ADDR3_A: (u32, u32) = (0x586, 0x586); + const IA32_RTIT_ADDR3_B: (u32, u32) = (0x587, 0x587); + + // Disabled via CPUID for non-host CPU profiles + const IA32_DS_AREA: (u32, u32) = (0x600, 0x600); + + // TODO: IA32_TSC_DEADLINE should be available because the TSC_DEADLINE CPUID bit + // is set by CHV unconditionally. 
The availability of this MSR probably needs to be + // handled by CHV itself and not the CPU profiles + + // Disabled via CPUID for non-host CPU profiles + const IA32_PKRS: (u32, u32) = (0x6e1, 0x6e1); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PM_ENABLE: (u32, u32) = (0x770, 0x770); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_CAPABILITIES: (u32, u32) = (0x771, 0x771); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_REQUEST_PKG: (u32, u32) = (0x772, 0x772); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_INTERRUPT: (u32, u32) = (0x773, 0x773); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_REQUEST: (u32, u32) = (0x774, 0x774); + + // TODO: Can we also deny IA32_PECI_HWP_REQUEST_INFO? + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_CTL: (u32, u32) = (0x776, 0x776); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HWP_STATUS: (u32, u32) = (0x777, 0x777); + + // TODO: Currently permitted via IA32_ARCH_CAPABILITIES (bit 22), + // but that bit should probably have policy Static(0) ? 
+ const IA32_MCU_EXT_SERVICE: (u32, u32) = (0x7a3, 0x7a3); + + const IA32_MCU_ROLLBACK_MIN_ID: (u32, u32) = (0x7a4, 0x7a4); + + // TODO: Not sure about IA32_MCU_STAGING_MBOX_ADDR + + const IA32_ROLLBACK_SIGN_ID_0: (u32, u32) = (0x7b0, 0x7b0); + const IA32_ROLLBACK_SIGN_ID_1: (u32, u32) = (0x7b1, 0x7b1); + const IA32_ROLLBACK_SIGN_ID_2: (u32, u32) = (0x7b2, 0x7b2); + const IA32_ROLLBACK_SIGN_ID_3: (u32, u32) = (0x7b3, 0x7b3); + const IA32_ROLLBACK_SIGN_ID_4: (u32, u32) = (0x7b4, 0x7b4); + const IA32_ROLLBACK_SIGN_ID_5: (u32, u32) = (0x7b5, 0x7b5); + const IA32_ROLLBACK_SIGN_ID_6: (u32, u32) = (0x7b6, 0x7b6); + const IA32_ROLLBACK_SIGN_ID_7: (u32, u32) = (0x7b7, 0x7b7); + const IA32_ROLLBACK_SIGN_ID_8: (u32, u32) = (0x7b8, 0x7b8); + const IA32_ROLLBACK_SIGN_ID_9: (u32, u32) = (0x7b9, 0x7b9); + const IA32_ROLLBACK_SIGN_ID_10: (u32, u32) = (0x7ba, 0x7ba); + const IA32_ROLLBACK_SIGN_ID_11: (u32, u32) = (0x7bb, 0x7bb); + const IA32_ROLLBACK_SIGN_ID_12: (u32, u32) = (0x7bc, 0x7bc); + const IA32_ROLLBACK_SIGN_ID_13: (u32, u32) = (0x7bd, 0x7bd); + const IA32_ROLLBACK_SIGN_ID_14: (u32, u32) = (0x7be, 0x7be); + const IA32_ROLLBACK_SIGN_ID_15: (u32, u32) = (0x7bf, 0x7bf); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_CAPABILITY: (u32, u32) = (0x981, 0x981); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_ACTIVATE: (u32, u32) = (0x982, 0x982); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_EXCLUDE_MASK: (u32, u32) = (0x983, 0x983); + + // Disabled via CPUID for non-host CPU profiles + const IA32_TME_EXCLUDE_BASE: (u32, u32) = (0x984, 0x984); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_RR: (u32, u32) = (0x985, 0x985); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_HANDLER: (u32, u32) = (0x986, 0x986); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_STACKADJUST: (u32, u32) = (0x987, 0x987); + + // Disabled via CPUID for non-host CPU profiles 
+ const IA32_UINTR_MISC: (u32, u32) = (0x988, 0x988); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_PD: (u32, u32) = (0x989, 0x989); + + // Disabled via CPUID for non-host CPU profiles + const IA32_UINTR_TT: (u32, u32) = (0x98a, 0x98a); + + // Disabled via CPUID for non-host CPU profiles + const IA32_COPY_STATUS: (u32, u32) = (0x990, 0x990); + + // Disabled via CPUID for non-host CPU profiles + const IA32_IWKEYBACKUP_STATUS: (u32, u32) = (0x991, 0x991); + + const IA32_TME_CLEAR_SAVED_KEY: (u32, u32) = (0x9fb, 0x9fb); + + // Disabled via CPUID for non-host CPU profiles + const IA32_DEBUG_INTERFACE: (u32, u32) = (0xc80, 0xc80); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L3_QOS_CFG: (u32, u32) = (0xc81, 0xc81); + + // Disabled via CPUID + const IA32_L2_QOS_CFG: (u32, u32) = (0xc82, 0xc82); + + // Disabled via CPUID + const IA32_L3_IO_QOS_CFG: (u32, u32) = (0xc83, 0xc83); + + // TODO: Not sure about IA32_RESOURCE_PRIORITY and IA32_RESOURCE_PRIORITY_PKG + + // Disabled via CPUID for non-host CPU profiles + const IA32_QM_EVTSEL: (u32, u32) = (0xc8d, 0xc8d); + + // Disabled via CPUID for non-host CPU profiles + const IA32_QM_CTR: (u32, u32) = (0xc8e, 0xc8e); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PQR_ASSOC: (u32, u32) = (0xc8f, 0xc8f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L3_MASK_0: (u32, u32) = (0xc90, 0xc90); + + const IA32_L3_MASK_N: (u32, u32) = (0xc91, 0xd8f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L2_MASK_0: (u32, u32) = (0xd10, 0xd10); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L2_MASK_N: (u32, u32) = (0xd11, 0xd4f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_L2_QOS_EXT_BW_THRTL_I: (u32, u32) = (0xd50, 0xd5e); + + // Disabled via CPUID for non-host CPU profiles + const IA32_BNDCFGS: (u32, u32) = (0xd90, 0xd90); + + // Disabled via CPUID for non-host CPU profiles + const 
IA32_COPY_LOCAL_TO_PLATFORM: (u32, u32) = (0xd91, 0xd91); + + // Disabled via CPUID for non-host CPU profiles + const IA32_COPY_PLATFORM_TO_LOCAL: (u32, u32) = (0xd92, 0xd92); + + // TODO: Not sure about IA32_PASID + + // Disabled via CPUID for non-host CPU profiles + const IA32_PKG_HDC_CTL: (u32, u32) = (0xdb0, 0xdb0); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PM_CTL1: (u32, u32) = (0xdb1, 0xdb1); + + // Disabled via CPUID for non-host CPU profiles + const IA32_THREAD_STALL: (u32, u32) = (0xdb2, 0xdb2); + + // Disabled via CPUID for non-host CPU profiles + const IA32_QOS_CORE_BW_THRTL_0: (u32, u32) = (0xe00, 0xe00); + + // Disabled via CPUID for non-host CPU profiles + const IA32_QOS_CORE_BW_THRTL_1: (u32, u32) = (0xe01, 0xe01); + + // TODO: Is it OK to disable this for CPU profiles? + // Note that we have CPUID 0x7.EDX.[19] = 0 (ARCH_LBR) + const IA32_LBR_X_INFO: (u32, u32) = (0x1200, 0x121f); + + // TDX related. + const IA32_SEAMRR_BASE: (u32, u32) = (0x1400, 0x1400); + + // TDX related. 
+ const IA32_SEAMRR_MASK: (u32, u32) = (0x1401, 0x1401); + + // Disabled via ARCH_CAPABILITIES for non-host CPU profiles + // TODO: Check that deny policy is compatible with + // the policy for IA32_ARCH_CAPABILITIES[9] + // Single MSR: both bounds of the range must be the hexadecimal address 0x1406. + const IA32_MCU_CONTROL: (u32, u32) = (0x1406, 0x1406); + + const IA32_LBR_CTL: (u32, u32) = (0x14ce, 0x14ce); + + const IA32_LBR_DEPTH: (u32, u32) = (0x14cf, 0x14cf); + + const IA32_LBR_X_FROM_IP: (u32, u32) = (0x1500, 0x151f); + + const IA32_LBR_X_TO_IP: (u32, u32) = (0x1600, 0x161f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HW_FEEDBACK_PTR: (u32, u32) = (0x17d0, 0x17d0); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HW_FEEDBACK_CONFIG: (u32, u32) = (0x17d1, 0x17d1); + + // Disabled via CPUID for non-host CPU profiles + const IA32_HW_FEEDBACK_THREAD_CHAR: (u32, u32) = (0x17d2, 0x17d2); + + const IA32_HW_FEEDBACK_THREAD_CONFIG: (u32, u32) = (0x17d4, 0x17d4); + + const IA32_HRESET_ENABLE: (u32, u32) = (0x17da, 0x17da); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP0_CTR: (u32, u32) = (0x1900, 0x1900); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP0_CFG_A: (u32, u32) = (0x1901, 0x1901); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP0_CFG_C: (u32, u32) = (0x1903, 0x1903); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP1_CTR: (u32, u32) = (0x1904, 0x1904); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP1_CFG_A: (u32, u32) = (0x1905, 0x1905); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP1_CFG_C: (u32, u32) = (0x1907, 0x1907); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CTR: (u32, u32) = (0x1908, 0x1908); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CFG_A: (u32, u32) = (0x1909, 0x1909); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CFG_B: (u32, u32) = (0x190a, 0x190a); + + //
Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP2_CFG_C: (u32, u32) = (0x190b, 0x190b); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CTR: (u32, u32) = (0x190c, 0x190c); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CFG_A: (u32, u32) = (0x190d, 0x190d); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CFG_B: (u32, u32) = (0x190e, 0x190e); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP3_CFG_C: (u32, u32) = (0x190f, 0x190f); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CTR: (u32, u32) = (0x1910, 0x1910); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CFG_A: (u32, u32) = (0x1911, 0x1911); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CFG_B: (u32, u32) = (0x1912, 0x1912); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP4_CFG_C: (u32, u32) = (0x1913, 0x1913); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CTR: (u32, u32) = (0x1914, 0x1914); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CFG_A: (u32, u32) = (0x1915, 0x1915); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CFG_B: (u32, u32) = (0x1916, 0x1916); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP5_CFG_C: (u32, u32) = (0x1917, 0x1917); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CTR: (u32, u32) = (0x1918, 0x1918); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CFG_A: (u32, u32) = (0x1919, 0x1919); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CFG_B: (u32, u32) = (0x191a, 0x191a); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP6_CFG_C: (u32, u32) = (0x191b, 0x191b); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP7_CTR: (u32, u32) = (0x191c, 0x191c); + // Disabled via CPUID for non-host CPU 
profiles + const IA32_PMC_GP7_CFG_A: (u32, u32) = (0x191d, 0x191d); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP7_CFG_B: (u32, u32) = (0x191e, 0x191e); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP7_CFG_C: (u32, u32) = (0x191f, 0x191f); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP8_CTR: (u32, u32) = (0x1920, 0x1920); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP8_CFG_A: (u32, u32) = (0x1921, 0x1921); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP9_CTR: (u32, u32) = (0x1924, 0x1924); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_GP9_CFG_A: (u32, u32) = (0x1925, 0x1925); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX0_CTR: (u32, u32) = (0x1980, 0x1980); + + const IA32_PMC_FX0_CFG_B: (u32, u32) = (0x1982, 0x1982); + const IA32_PMC_FX0_CFG_C: (u32, u32) = (0x1983, 0x1983); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX1_CTR: (u32, u32) = (0x1984, 0x1984); + const IA32_PMC_FX1_CFG_B: (u32, u32) = (0x1986, 0x1986); + const IA32_PMC_FX1_CFG_C: (u32, u32) = (0x1987, 0x1987); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX2_CTR: (u32, u32) = (0x1988, 0x1988); + + const IA32_PMC_FX2_CFG_C: (u32, u32) = (0x198b, 0x198b); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX3_CTR: (u32, u32) = (0x198c, 0x198c); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX4_CTR: (u32, u32) = (0x1990, 0x1990); + const IA32_PMC_FX4_CFG_C: (u32, u32) = (0x1993, 0x1993); + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX5_CTR: (u32, u32) = (0x1994, 0x1994); + const IA32_PMC_FX5_CFG_C: (u32, u32) = (0x1997, 0x1997); + + // Disabled via CPUID for non-host CPU profiles + const IA32_PMC_FX6_CTR: (u32, u32) = (0x1998, 0x1998); + const IA32_PMC_FX6_CFG_C: (u32, u32) = (0x199b, 0x199b); + + // TODO: Check IA32_UARCH_MISC_CTL + // + + 
/// A list of ARCHITECTURAL MSR register addresses that are forbidden for all non-host CPU profiles and also not + /// considered MSR-based FEATURE indices by KVM. + pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 345] = [ + // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE + IA32_MONITOR_FILTER_SIZE, + // TODO: Not sure about this one + IA32_PLATFORM_ID, + /// Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + IA32_PPIN_CTL, + /// Only available if CPUID 0x7.0x1.EBX[0] = 1, but this is always 0 for non-host CPU profiles + IA32_PPIN, + /// Used for microcode updates. Should not be available for guests. + IA32_BIOS_UPDT_TRIG, + /// Currently only related to Secure enclaves/Keylocker which is not available for non-host CPU profiles + IA32_FEATURE_ACTIVATION, + /// Related to microcode updates + IA32_MCU_ENUMERATION, + IA32_MCU_STATUS, + /// Related to total memory encryption + IA32_MKTME_KEYID_PARTITIONING, + // TODO: Not sure what to do about IA32_BIOS_SIGN_ID (note that it is also a MSR-based feature according to KVM) + IA32_SGXLEPUBKEYHASH0, + IA32_SGXLEPUBKEYHASH1, + IA32_SGXLEPUBKEYHASH2, + IA32_SGXLEPUBKEYHASH3, + IA32_SGXLEPUBKEYHASH4, + IA32_SGXLEPUBKEYHASH5, + // TODO: Check this + IA32_SMBASE, + IA32_MISC_PACKAGE_CTLS, + // TODO: IA32_X2APIC_DISABLE_STATUS + IA32_PMC0, + IA32_PMC1, + IA32_PMC2, + IA32_PMC3, + IA32_PMC4, + IA32_PMC5, + IA32_PMC6, + IA32_PMC7, + IA32_PMC8, + IA32_PMC9, + IA32_CORE_CAPABILITIES, + IA32_UMWAIT_CONTROL, + IA32_CLOCK_MODULATION, + IA32_PLI_SSP, + IA32_INTERRUPT_SSP_TABLE_ADDR, + // Disabled by CPUID for non-host CPU profiles + IA32_MPERF, + IA32_APERF, + IA32_TSX_FORCE_ABORT, + // Disabled via static IA32_ARCH_CAPABILITIES bit for non-host CPU profiles + IA32_TSX_CTRL, + // NOTE: IA32_MCU_OPT_CTRL must necessarily be available, due to + // what we set in CPUID for some CPU profiles (inherit policy) + + // TODO: Don't know about IA32_SYSENTER_CS, 
IA32_SYSENTER_ESP, + // IA32_SYSENTER_EIP + // + + // TODO: Not sure if we can/should deny this MSR, but + // it doesn't really make sense to have it available in + // a virtualized environment + // + // If we keep it denied we should document that + // even for 06_01H one cannot rely on the existence of this MSR + IA32_MCG_CAP, + // TODO: Also not sure if we may deny this MSR + IA32_MCG_STATUS, + // TODO: Can we deny this? + IA32_MCG_CTL, + // TODO: 0x180- 0x185 is reserved, we should not list these MSRS at all + /// Disabled via CPUID for all non-host CPU profiles + IA32_PERFEVTSEL0, + IA32_PERFEVTSEL1, + IA32_PERFEVTSEL2, + IA32_PERFEVTSEL3, + IA32_PERFEVTSEL4, + IA32_PERFEVTSEL5, + IA32_PERFEVTSEL6, + IA32_PERFEVTSEL7, + IA32_PERFEVTSEL8, + IA32_PERFEVTSEL9, + // TODO: 0x18a - 0x194 is reserved and should not be included in any list + + // TODO: 0x196, 197 is reserved and should not be included in any list + // + IA32_PERF_STATUS, + IA32_PERF_CTL, + // Disabled via CPUID for non-host profiles + IA32_THERM_INTERRUPT, + // Disabled via CPUID for non-host profiles + IA32_THERM_STATUS, + // TODO: Consider disabling IA32_MISC_ENABLE + + // Disabled via CPUID for non-host profiles + IA32_ENERGY_PERF_BIAS, + // Disabled via CPUID for non-host profiles + IA32_PACKAGE_THERM_STATUS, + // Disabled via CPUID for non-host profiles + IA32_PACKAGE_THERM_INTERRUPT, + IA32_DEBUGCTL, + IA32_LER_FROM_IP, + IA32_LER_TO_IP, + IA32_LER_INFO, + // TODO: Not sure about IA32_SMRR_PHYSBASE & IA32_SMRR_PHYSMASK + IA32_MC_I_CTL2, + // Disabled via CPUID for non-host profiles + IA32_INTEGRITY_STATUS, + IA32_FIXED_CTRI, + // IA32_PERF_CAPABILITIES is an MSR-based feature thus not listed here + + // Disabled via CPUID for non-host profiles + IA32_FIXED_CTR_CTRL, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_STATUS, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_CTRL, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_STATUS_RESET, + // 
Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_STATUS_SET, + // Disabled via CPUID for non-host profiles + IA32_PERF_GLOBAL_INUSE, + // TODO: Not sure about this one, but seems to be related to performance monitoring which + // should be disabled for non-host CPU profiles. + IA32_PEBS_ENABLE, + IA32_MC0_CTL, + IA32_MC0_STATUS, + IA32_MC0_ADDR, + IA32_MC0_MISC, + IA32_MC1_CTL, + IA32_MC1_STATUS, + IA32_MC1_ADDR, + IA32_MC1_MISC, + IA32_MC2_CTL, + IA32_MC2_STATUS, + IA32_MC2_ADDR, + IA32_MC2_MISC, + IA32_MC3_CTL, + IA32_MC3_STATUS, + IA32_MC3_ADDR1, + IA32_MC3_MISC, + IA32_MC4_CTL, + IA32_MC4_STATUS, + IA32_MC4_ADDR, + IA32_MC4_MISC, + IA32_MC5_CTL, + IA32_MC5_STATUS, + IA32_MC5_ADDR, + IA32_MC5_MISC, + IA32_MC6_CTL, + IA32_MC6_STATUS, + IA32_MC6_ADDR1, + IA32_MC6_MISC, + IA32_MC7_CTL, + IA32_MC7_STATUS, + IA32_MC7_ADDR, + IA32_MC7_MISC, + IA32_MC8_CTL, + IA32_MC8_STATUS, + IA32_MC8_ADDR, + IA32_MC8_MISC, + IA32_MC9_CTL, + IA32_MC9_STATUS, + IA32_MC9_ADDR, + IA32_MC9_MISC, + IA32_MC10_CTL, + IA32_MC10_STATUS, + IA32_MC10_ADDR, + IA32_MC10_MISC, + IA32_MC11_CTL, + IA32_MC11_STATUS, + IA32_MC11_ADDR, + IA32_MC11_MISC, + IA32_MC12_CTL, + IA32_MC12_STATUS, + IA32_MC12_ADDR, + IA32_MC12_MISC, + IA32_MC13_CTL, + IA32_MC13_STATUS, + IA32_MC13_ADDR, + IA32_MC13_MISC, + IA32_MC14_CTL, + IA32_MC14_STATUS, + IA32_MC14_ADDR, + IA32_MC14_MISC, + IA32_MC15_CTL, + IA32_MC15_STATUS, + IA32_MC15_ADDR, + IA32_MC15_MISC, + IA32_MC16_CTL, + IA32_MC16_STATUS, + IA32_MC16_ADDR, + IA32_MC16_MISC, + IA32_MC17_CTL, + IA32_MC17_STATUS, + IA32_MC17_ADDR, + IA32_MC17_MISC, + IA32_MC18_CTL, + IA32_MC18_STATUS, + IA32_MC18_ADDR, + IA32_MC18_MISC, + IA32_MC19_CTL, + IA32_MC19_STATUS, + IA32_MC19_ADDR, + IA32_MC19_MISC, + IA32_MC20_CTL, + IA32_MC20_STATUS, + IA32_MC20_ADDR, + IA32_MC20_MISC, + IA32_MC21_CTL, + IA32_MC21_STATUS, + IA32_MC21_ADDR, + IA32_MC21_MISC, + IA32_MC22_CTL, + IA32_MC22_STATUS, + IA32_MC22_ADDR, + IA32_MC22_MISC, + IA32_MC23_CTL, + IA32_MC23_STATUS, + 
IA32_MC23_ADDR, + IA32_MC23_MISC, + IA32_MC24_CTL, + IA32_MC24_STATUS, + IA32_MC24_ADDR, + IA32_MC24_MISC, + IA32_MC25_CTL, + IA32_MC25_STATUS, + IA32_MC25_ADDR, + IA32_MC25_MISC, + IA32_MC26_CTL, + IA32_MC26_STATUS, + IA32_MC26_ADDR, + IA32_MC26_MISC, + IA32_MC27_CTL, + IA32_MC27_STATUS, + IA32_MC27_ADDR, + IA32_MC27_MISC, + IA32_MC28_CTL, + IA32_MC28_STATUS, + IA32_MC28_ADDR, + IA32_MC28_MISC, + IA32_MC29_CTL, + IA32_MC29_STATUS, + IA32_MC29_ADDR, + IA32_MC29_MISC, + IA32_MC30_CTL, + IA32_MC30_STATUS, + IA32_MC30_ADDR, + IA32_MC30_MISC, + IA32_MC31_CTL, + IA32_MC31_STATUS, + IA32_MC31_ADDR, + IA32_MC31_MISC, + IA32_A_PMC0, + IA32_A_PMC1, + IA32_A_PMC2, + IA32_A_PMC3, + IA32_A_PMC4, + IA32_A_PMC5, + IA32_A_PMC6, + IA32_A_PMC7, + IA32_A_PMC8, + IA32_A_PMC9, + IA32_MCG_EXT_CTL, + // SGX is disabled via CPUID for non-host CPU profiles + IA32_SGX_SVN_STATUS, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_OUTPUT_BASE, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_OUTPUT_MASK_PTRS, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_CTL, + // Disabled via CPUID for non-host CPU profiles + IA32_RTIT_STATUS, + // Disabled via CPU profiles + IA32_RTIT_CR3_MATCH, + IA32_RTIT_ADDR0_A, + IA32_RTIT_ADDR0_B, + IA32_RTIT_ADDR1_A, + IA32_RTIT_ADDR1_B, + IA32_RTIT_ADDR2_A, + IA32_RTIT_ADDR2_B, + IA32_RTIT_ADDR3_A, + IA32_RTIT_ADDR3_B, + // Disabled via CPUID for non-host CPU profiles + IA32_DS_AREA, + // TODO: IA32_TSC_DEADLINE should be available because the TSC_DEADLINE CPUID bit + // is set by CHV unconditionally. 
The availability of this MSR probably needs to be + // handled by CHV itself and not the CPU profiles + + // Disabled via CPUID for non-host CPU profiles + IA32_PKRS, + // Disabled via CPUID for non-host CPU profiles + IA32_PM_ENABLE, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_CAPABILITIES, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_REQUEST_PKG, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_INTERRUPT, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_REQUEST, + // TODO: Can we also deny IA32_PECI_HWP_REQUEST_INFO? + + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_CTL, + // Disabled via CPUID for non-host CPU profiles + IA32_HWP_STATUS, + // TODO: Currently permitted via IA32_ARCH_CAPABILITIES (bit 22), + // but that bit should probably have policy Static(0) ? + IA32_MCU_EXT_SERVICE, + IA32_MCU_ROLLBACK_MIN_ID, + // TODO: Not sure about IA32_MCU_STAGING_MBOX_ADDR + IA32_ROLLBACK_SIGN_ID_0, + IA32_ROLLBACK_SIGN_ID_1, + IA32_ROLLBACK_SIGN_ID_2, + IA32_ROLLBACK_SIGN_ID_3, + IA32_ROLLBACK_SIGN_ID_4, + IA32_ROLLBACK_SIGN_ID_5, + IA32_ROLLBACK_SIGN_ID_6, + IA32_ROLLBACK_SIGN_ID_7, + IA32_ROLLBACK_SIGN_ID_8, + IA32_ROLLBACK_SIGN_ID_9, + IA32_ROLLBACK_SIGN_ID_10, + IA32_ROLLBACK_SIGN_ID_11, + IA32_ROLLBACK_SIGN_ID_12, + IA32_ROLLBACK_SIGN_ID_13, + IA32_ROLLBACK_SIGN_ID_14, + IA32_ROLLBACK_SIGN_ID_15, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_CAPABILITY, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_ACTIVATE, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_EXCLUDE_MASK, + // Disabled via CPUID for non-host CPU profiles + IA32_TME_EXCLUDE_BASE, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_RR, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_HANDLER, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_STACKADJUST, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_MISC, + // Disabled via CPUID for 
non-host CPU profiles + IA32_UINTR_PD, + // Disabled via CPUID for non-host CPU profiles + IA32_UINTR_TT, + // Disabled via CPUID for non-host CPU profiles + IA32_COPY_STATUS, + // Disabled via CPUID for non-host CPU profiles + IA32_IWKEYBACKUP_STATUS, + IA32_TME_CLEAR_SAVED_KEY, + // Disabled via CPUID for non-host CPU profiles + IA32_DEBUG_INTERFACE, + // Disabled via CPUID for non-host CPU profiles + IA32_L3_QOS_CFG, + // Disabled via CPUID + IA32_L2_QOS_CFG, + // Disabled via CPUID + IA32_L3_IO_QOS_CFG, + // TODO: Not sure about IA32_RESOURCE_PRIORITY and IA32_RESOURCE_PRIORITY_PKG + + // Disabled via CPUID for non-host CPU profiles + IA32_QM_EVTSEL, + // Disabled via CPUID for non-host CPU profiles + IA32_QM_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PQR_ASSOC, + // Disabled via CPUID for non-host CPU profiles + IA32_L3_MASK_0, + IA32_L3_MASK_N, + // Disabled via CPUID for non-host CPU profiles + IA32_L2_MASK_0, + // Disabled via CPUID for non-host CPU profiles + IA32_L2_MASK_N, + // Disabled via CPUID for non-host CPU profiles + IA32_L2_QOS_EXT_BW_THRTL_I, + // Disabled via CPUID for non-host CPU profiles + IA32_BNDCFGS, + // Disabled via CPUID for non-host CPU profiles + IA32_COPY_LOCAL_TO_PLATFORM, + // Disabled via CPUID for non-host CPU profiles + IA32_COPY_PLATFORM_TO_LOCAL, + // TODO: Not sure about IA32_PASID + + // Disabled via CPUID for non-host CPU profiles + IA32_PKG_HDC_CTL, + // Disabled via CPUID for non-host CPU profiles + IA32_PM_CTL1, + // Disabled via CPUID for non-host CPU profiles + IA32_THREAD_STALL, + // Disabled via CPUID for non-host CPU profiles + IA32_QOS_CORE_BW_THRTL_0, + // Disabled via CPUID for non-host CPU profiles + IA32_QOS_CORE_BW_THRTL_1, + // TODO: Is it OK to disable this for CPU profiles? + // Note that we have CPUID 0x7.EDX.[19] = 0 (ARCH_LBR) + IA32_LBR_X_INFO, + // TDX related. + IA32_SEAMRR_BASE, + // TDX related. 
+ IA32_SEAMRR_MASK, + // Disabled via ARCH_CAPABILITIES for non-host CPU profiles + IA32_MCU_CONTROL, + IA32_LBR_CTL, + IA32_LBR_DEPTH, + IA32_LBR_X_FROM_IP, + IA32_LBR_X_TO_IP, + // Disabled via CPUID for non-host CPU profiles + IA32_HW_FEEDBACK_PTR, + // Disabled via CPUID for non-host CPU profiles + IA32_HW_FEEDBACK_CONFIG, + // Disabled via CPUID for non-host CPU profiles + IA32_HW_FEEDBACK_THREAD_CHAR, + IA32_HW_FEEDBACK_THREAD_CONFIG, + IA32_HRESET_ENABLE, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP0_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP0_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP0_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP1_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP1_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP1_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP2_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP3_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP4_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP5_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP5_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP5_CFG_B, + // Disabled via CPUID for 
non-host CPU profiles + IA32_PMC_GP5_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP6_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CFG_B, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP7_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP8_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP8_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP9_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_GP9_CFG_A, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX0_CTR, + IA32_PMC_FX0_CFG_B, + IA32_PMC_FX0_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX1_CTR, + IA32_PMC_FX1_CFG_B, + IA32_PMC_FX1_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX2_CTR, + IA32_PMC_FX2_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX3_CTR, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX4_CTR, + IA32_PMC_FX4_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX5_CTR, + IA32_PMC_FX5_CFG_C, + // Disabled via CPUID for non-host CPU profiles + IA32_PMC_FX6_CTR, + IA32_PMC_FX6_CFG_C, + // TODO: Check IA32_UARCH_MISC_CTL + ]; +} diff --git a/arch/src/x86_64/msr_definitions/intel/mod.rs b/arch/src/x86_64/msr_definitions/intel/mod.rs index 9811363bab..8e9543c469 100644 --- a/arch/src/x86_64/msr_definitions/intel/mod.rs +++ b/arch/src/x86_64/msr_definitions/intel/mod.rs @@ -2,7 +2,15 @@ // // SPDX-License-Identifier: Apache-2.0 // + +#[cfg(feature = "cpu_profile_generation")] +mod architectural_msrs; + mod 
msr_based_features; +#[cfg(feature = "cpu_profile_generation")] +pub(in crate::x86_64) use architectural_msrs::FORBIDDEN_IA32_MSR_RANGES; +#[cfg(feature = "cpu_profile_generation")] +pub(in crate::x86_64) use architectural_msrs::PERMITTED_IA32_MSRS; pub use msr_based_features::INTEL_MSR_FEATURE_DEFINITIONS; pub(in crate::x86_64) use msr_based_features::check_feature_msr_compatibility; From 47c55a6b208514e8f01fc3f0b9422bd2c3f70262 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 11 Mar 2026 14:23:05 +0100 Subject: [PATCH 152/178] arch: Address architectural msrs review TODO: Squash into previous commit if this all works as expected Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../intel/architectural_msrs.rs | 137 +++++------------- 1 file changed, 38 insertions(+), 99 deletions(-) diff --git a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs index 6099acd6a1..0585c75296 100644 --- a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs +++ b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs @@ -42,21 +42,6 @@ mod permitted_architectural_msrs { register: CpuidReg::EAX, }); }; - /// Overclocking Status (R/O) - const IA32_OVERCLOCKING_STATUS: u32 = 0x195; - // TODO: Also check consistency with IA32_ARCH_CAPABILITIES[23] - - /// xAPIC Disable Status (R/O) - const IA32_XAPIC_DISABLE_STATUS: u32 = 0xbd; - const _IA32_XAPIC_DISABLE_STATUS_CPUID_CHECK: () = - assert_not_denied_cpuid_feature::<29>(&Parameters { - leaf: 0x7, - sub_leaf: 0..=0, - register: CpuidReg::EDX, - }); - - // TODO: Also assert that IA32_ARCH_CAPABILITIES[21] is also not hard-coded to prevent - // this MSR from being accessed /// MTRR Capability (R/O) const IA32_MTRRCAP: u32 = 0xfe; @@ -113,11 +98,9 @@ mod permitted_architectural_msrs { const IA32_X2APIC_IRR7: u32 = 0x827; const IA32_X2APIC_CUR_COUNT: u32 = 0x839; - pub(super) const READ_ONLY_IA32_MSRS: [u32; 40] = [ + 
pub(super) const READ_ONLY_IA32_MSRS: [u32; 38] = [ IA32_BARRIER, IA32_MTRRCAP, - IA32_OVERCLOCKING_STATUS, - IA32_XAPIC_DISABLE_STATUS, IA32_FZM_DOMAIN_CONFIG, IA32_FZM_RANGE_STARTADDR, IA32_FZM_RANGE_ENDADDR, @@ -159,12 +142,7 @@ mod permitted_architectural_msrs { mod read_write { use super::{CpuidReg, Parameters, assert_not_denied_cpuid_feature}; - // TODO: Not sure if we need to permit this - const IA32_P5_MC_ADDR: u32 = 0x0; - // TODO: Not sure if we need to permit this - const IA32_P5_MC_TYPE: u32 = 0x1; - // TODO: Is this also write? const IA32_TIME_STAMP_COUNTER: u32 = 0x10; const IA32_APIC_BASE: u32 = 0x1b; @@ -217,16 +195,9 @@ mod permitted_architectural_msrs { /// Enable Misc. Processr Features const IA32_MISC_ENABLE: u32 = 0x1a0; - // TODO: Not sure what this does and whether it should be enabled - const IA32_FZM_RANGE_INDEX: u32 = 0x82; - const IA32_XFD: u32 = 0x1c4; const IA32_XFD_ERR: u32 = 0x1c5; - // TODO: Not sure about SMRR_* (note that they are writable only in SMM) - const IA32_SMRR_PHYSBASE: u32 = 0x1f2; - const IA32_SMRR_PHYSMASK: u32 = 0x1f3; - const IA32_DCA_0_CAP: u32 = 0x1fa; const _IA32_DCA_0_CAP_CPUID_CHECK: () = @@ -257,7 +228,6 @@ mod permitted_architectural_msrs { const IA32_MTRR_PHYSBASE9: u32 = 0x212; const IA32_MTRR_PHYSMASK9: u32 = 0x213; - // TODO: Are these actually READ + Write? const IA32_MTRR_FIX64K_00000: u32 = 0x250; const IA32_MTRR_FIX16K_80000: u32 = 0x258; const IA32_MTRR_FIX16K_A0000: u32 = 0x259; @@ -297,9 +267,6 @@ mod permitted_architectural_msrs { register: CpuidReg::ECX, }); - // TODO: We should probably not permit this if possible - const IA32_PECI_HWP_REQUEST_INFO: u32 = 0x775; - // NOTE: THE X2APIC related MSRs cannot be filtered by KVM, but we include them here anyway for completeness sake. 
const IA32_X2APIC_TPR: u32 = 0x808; const IA32_X2APIC_SIVR: u32 = 0x80f; @@ -317,13 +284,6 @@ mod permitted_architectural_msrs { const IA32_X2APIC_INIT_COUNT: u32 = 0x838; const IA32_X2APIC_DIV_CONF: u32 = 0x83e; - // TODO: Not sure about this MSR - const IA32_RESOURCE_PRIORITY: u32 = 0xc88; - // TODO: Not sure about this MSR - const IA32_RESOURCE_PRIORITY_PKG: u32 = 0xc89; - - const IA32_PASID: u32 = 0xd93; - const IA32_XSS: u32 = 0xda0; const _IA32_XSS_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<3>(&Parameters { leaf: 0xd, @@ -356,11 +316,7 @@ mod permitted_architectural_msrs { register: CpuidReg::ECX, }); - const IA32_UARCH_MISC_CTL: u32 = 0x1b01; - // TODO: Check against IA32_ARCH_CAPABILITIES[12] - pub(super) const READ_WRITE_IA32_MSRS: [u32; 83] = [ - IA32_P5_MC_ADDR, - IA32_P5_MC_TYPE, + pub(super) const READ_WRITE_IA32_MSRS: [u32; 73] = [ IA32_TIME_STAMP_COUNTER, IA32_APIC_BASE, IA32_FEATURE_CONTROL, @@ -372,11 +328,8 @@ mod permitted_architectural_msrs { IA32_SYSENTER_EIP, IA32_SMM_MONITOR_CTL, IA32_MISC_ENABLE, - IA32_FZM_RANGE_INDEX, IA32_XFD, IA32_XFD_ERR, - IA32_SMRR_PHYSBASE, - IA32_SMRR_PHYSMASK, IA32_DCA_0_CAP, IA32_MTRR_PHYSBASE0, IA32_MTRR_PHYSMASK0, @@ -414,7 +367,6 @@ mod permitted_architectural_msrs { IA32_U_CET, IA32_S_CET, IA32_TSC_DEADLINE, - IA32_PECI_HWP_REQUEST_INFO, IA32_X2APIC_TPR, IA32_X2APIC_SIVR, IA32_X2APIC_ESR, @@ -428,9 +380,6 @@ mod permitted_architectural_msrs { IA32_X2APIC_LVT_ERROR, IA32_X2APIC_INIT_COUNT, IA32_X2APIC_DIV_CONF, - IA32_RESOURCE_PRIORITY, - IA32_RESOURCE_PRIORITY_PKG, - IA32_PASID, IA32_XSS, IA32_EFER, IA32_STAR, @@ -441,7 +390,6 @@ mod permitted_architectural_msrs { IA32_GS_BASE, IA32_KERNEL_GS_BASE, IA32_TSC_AUX, - IA32_UARCH_MISC_CTL, ]; } @@ -485,8 +433,8 @@ mod permitted_architectural_msrs { /// /// The MSRs listed here can be studied further in Table 2.2 in Section 2.1 of the Intel SDM /// Vol. 
4 from October 2025 - pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 127] = const { - let mut permitted = [0u32; 127]; + pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 115] = const { + let mut permitted = [0u32; 115]; let read_only_len = READ_ONLY_IA32_MSRS.len(); let write_only_len = WRITE_ONLY_IA32_MSRS.len(); let read_write_len = READ_WRITE_IA32_MSRS.len(); @@ -523,7 +471,8 @@ mod permitted_architectural_msrs { } mod forbidden_architectural_msrs { - // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE + const IA32_P5_MC_ADDR: (u32, u32) = (0x0, 0x0); + const IA32_P5_MC_TYPE: (u32, u32) = (0x1, 0x1); const IA32_MONITOR_FILTER_SIZE: (u32, u32) = (0x6, 0x6); // TODO: Not sure about this one @@ -546,11 +495,13 @@ mod forbidden_architectural_msrs { const IA32_MCU_STATUS: (u32, u32) = (0x7c, 0x7c); + // TODO: Not sure what this does and whether it should be enabled + const IA32_FZM_RANGE_INDEX: (u32, u32) = (0x82, 0x82); + /// Related to total memory encryption + /// const IA32_MKTME_KEYID_PARTITIONING: (u32, u32) = (0x87, 0x87); - // TODO: Not sure what to do about IA32_BIOS_SIGN_ID (note that it is also a MSR-based feature according to KVM) - const IA32_SGXLEPUBKEYHASH0: (u32, u32) = (0x8c, 0x8c); const IA32_SGXLEPUBKEYHASH1: (u32, u32) = (0x8d, 0x8d); @@ -568,18 +519,27 @@ mod forbidden_architectural_msrs { const IA32_MISC_PACKAGE_CTLS: (u32, u32) = (0xbc, 0xbc); + /// xAPIC Disable Status + // TODO: Also check consistency with IA32_ARCH_CAPABILITIES[21] + const IA32_XAPIC_DISABLE_STATUS: (u32, u32) = (0xbd, 0xbd); + + const IA32_SMRR_PHYS_BASE_MASK: (u32, u32) = (0x1f2, 0x1f3); + + /// Overclocking Status (R/O) + // TODO: Also check consistency with IA32_ARCH_CAPABILITIES[23] + const IA32_OVERCLOCKING_STATUS: (u32, u32) = (0x195, 0x195); + /// Clock Modulation Control /// This is disabled via CPUID for non-host CPU profiles const IA32_CLOCK_MODULATION: (u32, u32) = (0x19a, 0x19a); - // TODO: IA32_X2APIC_DISABLE_STATUS - // IA32_PLI_SSP is 
disabled via CPUID for non-host profiles const IA32_PLI_SSP: (u32, u32) = (0x6a4, 0x6a7); // This is disabled via CPUID for non-host profiles const IA32_INTERRUPT_SSP_TABLE_ADDR: (u32, u32) = (0x6a8, 0x6a8); + const IA32_PECI_HWP_REQUEST_INFO: (u32, u32) = (0x775, 0x775); const IA32_PMC0: (u32, u32) = (0xc1, 0xc1); const IA32_PMC1: (u32, u32) = (0xc2, 0xc2); const IA32_PMC2: (u32, u32) = (0xc3, 0xc3); @@ -609,22 +569,10 @@ mod forbidden_architectural_msrs { // NOTE: IA32_MCU_OPT_CTRL must necessarily be available, due to // what we set in CPUID for some CPU profiles (inherit policy) - // TODO: Don't know about IA32_SYSENTER_CS, IA32_SYSENTER_ESP, - // IA32_SYSENTER_EIP - // - - // TODO: Not sure if we can/should deny this MSR, but - // it doesn't really make sense to have it available in - // a virtualized environment - // - // If we keep it denied we should document that - // even for 06_01H one cannot rely on the existence of this MSR const IA32_MCG_CAP: (u32, u32) = (0x179, 0x179); - // TODO: Also not sure if we may deny this MSR const IA32_MCG_STATUS: (u32, u32) = (0x17a, 0x17a); - // TODO: Can we deny this? 
const IA32_MCG_CTL: (u32, u32) = (0x17b, 0x17b); // TODO: 0x180- 0x185 is reserved, we should not list these MSRS at all @@ -656,8 +604,6 @@ mod forbidden_architectural_msrs { // Disabled via CPUID for non-host profiles const IA32_THERM_STATUS: (u32, u32) = (0x19c, 0x19c); - // TODO: Consider disabling IA32_MISC_ENABLE - // Disabled via CPUID for non-host profiles const IA32_ENERGY_PERF_BIAS: (u32, u32) = (0x1b0, 0x1b0); @@ -675,8 +621,6 @@ mod forbidden_architectural_msrs { const IA32_LER_INFO: (u32, u32) = (0x1e0, 0x1e0); - // TODO: Not sure about IA32_SMRR_PHYSBASE & IA32_SMRR_PHYSMASK - const IA32_MC_I_CTL2: (u32, u32) = (0x280, 0x29f); // Disabled via CPUID for non-host profiles @@ -909,16 +853,12 @@ mod forbidden_architectural_msrs { // Disabled via CPUID for non-host CPU profiles const IA32_HWP_REQUEST: (u32, u32) = (0x774, 0x774); - // TODO: Can we also deny IA32_PECI_HWP_REQUEST_INFO? - // Disabled via CPUID for non-host CPU profiles const IA32_HWP_CTL: (u32, u32) = (0x776, 0x776); // Disabled via CPUID for non-host CPU profiles const IA32_HWP_STATUS: (u32, u32) = (0x777, 0x777); - // TODO: Currently permitted via IA32_ARCH_CAPABILITIES (bit 22), - // but that bit should probably have policy Static(0) ? 
const IA32_MCU_EXT_SERVICE: (u32, u32) = (0x7a3, 0x7a3); const IA32_MCU_ROLLBACK_MIN_ID: (u32, u32) = (0x7a4, 0x7a4); @@ -992,7 +932,8 @@ mod forbidden_architectural_msrs { // Disabled via CPUID const IA32_L3_IO_QOS_CFG: (u32, u32) = (0xc83, 0xc83); - // TODO: Not sure about IA32_RESOURCE_PRIORITY and IA32_RESOURCE_PRIORITY_PKG + const IA32_RESOURCE_PRIORITY: (u32, u32) = (0xc88, 0xc88); + const IA32_RESOURCE_PRIORITY_PKG: (u32, u32) = (0xc89, 0xc89); // Disabled via CPUID for non-host CPU profiles const IA32_QM_EVTSEL: (u32, u32) = (0xc8d, 0xc8d); @@ -1026,8 +967,7 @@ mod forbidden_architectural_msrs { // Disabled via CPUID for non-host CPU profiles const IA32_COPY_PLATFORM_TO_LOCAL: (u32, u32) = (0xd92, 0xd92); - // TODO: Not sure about IA32_PASID - + const IA32_PASID: (u32, u32) = (0xd93, 0xd93); // Disabled via CPUID for non-host CPU profiles const IA32_PKG_HDC_CTL: (u32, u32) = (0xdb0, 0xdb0); @@ -1043,7 +983,6 @@ mod forbidden_architectural_msrs { // Disabled via CPUID for non-host CPU profiles const IA32_QOS_CORE_BW_THRTL_1: (u32, u32) = (0xe01, 0xe01); - // TODO: Is it OK to disable this for CPU profiles? // Note that we have CPUID 0x7.EDX.[19] = 0 (ARCH_LBR) const IA32_LBR_X_INFO: (u32, u32) = (0x1200, 0x121f); @@ -1190,12 +1129,13 @@ mod forbidden_architectural_msrs { const IA32_PMC_FX6_CTR: (u32, u32) = (0x1998, 0x1998); const IA32_PMC_FX6_CFG_C: (u32, u32) = (0x199b, 0x199b); - // TODO: Check IA32_UARCH_MISC_CTL - // - + // TODO: Check against IA32_ARCH_CAPABILITIES[12] + const IA32_UARCH_MISC_CTL: (u32, u32) = (0x1b01, 0x1b01); /// A list of ARCHITECTURAL MSR register addresses that are forbidden for all non-host CPU profiles and also not /// considered MSR-based FEATURE indices by KVM. 
- pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 345] = [ + pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 356] = [ + IA32_P5_MC_ADDR, + IA32_P5_MC_TYPE, // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE IA32_MONITOR_FILTER_SIZE, // TODO: Not sure about this one @@ -1208,6 +1148,9 @@ mod forbidden_architectural_msrs { IA32_BIOS_UPDT_TRIG, /// Currently only related to Secure enclaves/Keylocker which is not available for non-host CPU profiles IA32_FEATURE_ACTIVATION, + IA32_FZM_RANGE_INDEX, + IA32_SMRR_PHYS_BASE_MASK, + IA32_PECI_HWP_REQUEST_INFO, /// Related to microcode updates IA32_MCU_ENUMERATION, IA32_MCU_STATUS, @@ -1223,7 +1166,8 @@ mod forbidden_architectural_msrs { // TODO: Check this IA32_SMBASE, IA32_MISC_PACKAGE_CTLS, - // TODO: IA32_X2APIC_DISABLE_STATUS + IA32_XAPIC_DISABLE_STATUS, + IA32_OVERCLOCKING_STATUS, IA32_PMC0, IA32_PMC1, IA32_PMC2, @@ -1480,10 +1424,6 @@ mod forbidden_architectural_msrs { IA32_RTIT_ADDR3_B, // Disabled via CPUID for non-host CPU profiles IA32_DS_AREA, - // TODO: IA32_TSC_DEADLINE should be available because the TSC_DEADLINE CPUID bit - // is set by CHV unconditionally. 
The availability of this MSR probably needs to be - // handled by CHV itself and not the CPU profiles - // Disabled via CPUID for non-host CPU profiles IA32_PKRS, // Disabled via CPUID for non-host CPU profiles @@ -1556,8 +1496,8 @@ mod forbidden_architectural_msrs { IA32_L2_QOS_CFG, // Disabled via CPUID IA32_L3_IO_QOS_CFG, - // TODO: Not sure about IA32_RESOURCE_PRIORITY and IA32_RESOURCE_PRIORITY_PKG - + IA32_RESOURCE_PRIORITY, + IA32_RESOURCE_PRIORITY_PKG, // Disabled via CPUID for non-host CPU profiles IA32_QM_EVTSEL, // Disabled via CPUID for non-host CPU profiles @@ -1579,8 +1519,7 @@ mod forbidden_architectural_msrs { IA32_COPY_LOCAL_TO_PLATFORM, // Disabled via CPUID for non-host CPU profiles IA32_COPY_PLATFORM_TO_LOCAL, - // TODO: Not sure about IA32_PASID - + IA32_PASID, // Disabled via CPUID for non-host CPU profiles IA32_PKG_HDC_CTL, // Disabled via CPUID for non-host CPU profiles @@ -1702,6 +1641,6 @@ mod forbidden_architectural_msrs { // Disabled via CPUID for non-host CPU profiles IA32_PMC_FX6_CTR, IA32_PMC_FX6_CFG_C, - // TODO: Check IA32_UARCH_MISC_CTL + IA32_UARCH_MISC_CTL, ]; } From af970c7157aa46a7f9366586694af3cbf8d3699c Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 13 Feb 2026 18:49:21 +0100 Subject: [PATCH 153/178] arch: Include a list of non-architectural MSRs We include a list of non-architectural MSRS. This list will only be used to help the CPU profile generation tool rule out MSRs that it does not know how to handle. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/msr_definitions/intel/mod.rs | 5 + .../intel/non_architectural_msrs.rs | 113 ++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 arch/src/x86_64/msr_definitions/intel/non_architectural_msrs.rs diff --git a/arch/src/x86_64/msr_definitions/intel/mod.rs b/arch/src/x86_64/msr_definitions/intel/mod.rs index 8e9543c469..c8e8a91d5c 100644 --- a/arch/src/x86_64/msr_definitions/intel/mod.rs +++ b/arch/src/x86_64/msr_definitions/intel/mod.rs @@ -6,6 +6,9 @@ #[cfg(feature = "cpu_profile_generation")] mod architectural_msrs; +#[cfg(feature = "cpu_profile_generation")] +mod non_architectural_msrs; + mod msr_based_features; #[cfg(feature = "cpu_profile_generation")] @@ -14,3 +17,5 @@ pub(in crate::x86_64) use architectural_msrs::FORBIDDEN_IA32_MSR_RANGES; pub(in crate::x86_64) use architectural_msrs::PERMITTED_IA32_MSRS; pub use msr_based_features::INTEL_MSR_FEATURE_DEFINITIONS; pub(in crate::x86_64) use msr_based_features::check_feature_msr_compatibility; +#[cfg(feature = "cpu_profile_generation")] +pub(in crate::x86_64) use non_architectural_msrs::NON_ARCHITECTURAL_INTEL_MSRS; diff --git a/arch/src/x86_64/msr_definitions/intel/non_architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/non_architectural_msrs.rs new file mode 100644 index 0000000000..b1f88aa809 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/intel/non_architectural_msrs.rs @@ -0,0 +1,113 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module contains a list of all known non-architectural MSRS for various Intel +//! CPUs. This list only helps us detect new MSRs that we are not (yet) aware of when +//! generating CPU profiles, but has no importance beyond that. + +/// A list of known non-architectural MSRs +/// +/// Note: KVM_GET_MSR_FEATURE_INDEX_LIST may return non-architectural MSRS. 
We append those +/// to [`crate::x86_64::msr_definitions_intel::INTEL_MSR_FEATURE_DEFINITIONS`] and not here. +pub(in crate::x86_64) const NON_ARCHITECTURAL_INTEL_MSRS: [u32; 872] = [ + 0x11, 0x12, 0x13, 0x2a, 0x2b, 0x2c, 0x33, 0x34, 0x35, 0x39, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, + 0x46, 0x47, 0x53, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x80, 0x88, 0x89, 0x8a, 0x98, + 0x99, 0x9a, 0xa0, 0xa1, 0xa5, 0xa7, 0xcd, 0xe2, 0xe4, 0xed, 0xee, 0xef, 0xf0, 0x105, 0x116, + 0x118, 0x119, 0x11a, 0x11b, 0x11e, 0x13c, 0x140, 0x151, 0x17d, 0x17f, 0x180, 0x181, 0x182, + 0x183, 0x184, 0x185, 0x190, 0x191, 0x192, 0x193, 0x194, 0x196, 0x197, 0x19d, 0x1a1, 0x1a2, + 0x1a4, 0x1a6, 0x1a7, 0x1aa, 0x1ac, 0x1ad, 0x1ae, 0x1af, 0x1c8, 0x1c9, 0x1d7, 0x1d8, 0x1da, + 0x1db, 0x1dc, 0x1f1, 0x1f4, 0x1f5, 0x1fb, 0x1fc, 0x2a0, 0x2a1, 0x2a2, 0x2a3, 0x2a4, 0x2a5, + 0x2a6, 0x2a7, 0x2b8, 0x2b9, 0x2ba, 0x2bb, 0x2bc, 0x2bd, 0x2be, 0x2bf, 0x2c2, 0x2c3, 0x2c4, + 0x2c5, 0x2c6, 0x2c7, 0x2c8, 0x2c9, 0x2d6, 0x2d7, 0x2d9, 0x2f4, 0x2f5, 0x300, 0x301, 0x302, + 0x303, 0x304, 0x305, 0x306, 0x307, 0x308, 0x310, 0x311, 0x329, 0x350, 0x351, 0x354, 0x355, + 0x360, 0x361, 0x362, 0x363, 0x364, 0x365, 0x366, 0x367, 0x368, 0x369, 0x36a, 0x36b, 0x36c, + 0x36d, 0x36e, 0x36f, 0x370, 0x371, 0x393, 0x394, 0x395, 0x396, 0x39c, 0x3a0, 0x3a1, 0x3a2, + 0x3a3, 0x3a4, 0x3a5, 0x3a6, 0x3a7, 0x3a8, 0x3a9, 0x3aa, 0x3ab, 0x3ac, 0x3ad, 0x3ae, 0x3af, + 0x3b0, 0x3b1, 0x3b2, 0x3b3, 0x3b4, 0x3b5, 0x3b6, 0x3b7, 0x3b8, 0x3b9, 0x3ba, 0x3bb, 0x3bc, + 0x3bd, 0x3be, 0x3c0, 0x3c1, 0x3c2, 0x3c3, 0x3c4, 0x3c5, 0x3c6, 0x3c7, 0x3c8, 0x3c9, 0x3ca, + 0x3cb, 0x3cc, 0x3cd, 0x3e0, 0x3e1, 0x3f0, 0x3f2, 0x3f6, 0x3f7, 0x3f8, 0x3f9, 0x3fa, 0x3fc, + 0x3fd, 0x3fe, 0x3ff, 0x4e0, 0x4e2, 0x4e3, 0x4f0, 0x4f8, 0x540, 0x541, 0x601, 0x606, 0x60a, + 0x60b, 0x60c, 0x60d, 0x610, 0x611, 0x612, 0x613, 0x614, 0x618, 0x619, 0x61b, 0x61c, 0x61e, + 0x620, 0x630, 0x631, 0x632, 0x638, 0x639, 0x63a, 0x640, 0x641, 0x642, 0x648, 0x649, 0x64a, + 0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 
0x650, 0x651, 0x652, 0x653, 0x655, 0x656, 0x657, 0x658, + 0x659, 0x65a, 0x65b, 0x65c, 0x65e, 0x65f, 0x660, 0x662, 0x664, 0x665, 0x666, 0x668, 0x669, + 0x66e, 0x680, 0x681, 0x682, 0x683, 0x684, 0x685, 0x686, 0x687, 0x688, 0x689, 0x68a, 0x68b, + 0x68c, 0x68d, 0x68e, 0x68f, 0x690, 0x691, 0x692, 0x693, 0x694, 0x695, 0x696, 0x697, 0x698, + 0x699, 0x69a, 0x69b, 0x69c, 0x69d, 0x69e, 0x69f, 0x6b0, 0x6b1, 0x6c0, 0x6c1, 0x6c2, 0x6c3, + 0x6c4, 0x6c5, 0x6c6, 0x6c7, 0x6c8, 0x6c9, 0x6ca, 0x6cb, 0x6cc, 0x6cd, 0x6ce, 0x6cf, 0x6d0, + 0x6d1, 0x6d2, 0x6d3, 0x6d4, 0x6d5, 0x6d6, 0x6d7, 0x6d8, 0x6d9, 0x6da, 0x6db, 0x6dc, 0x6dd, + 0x6de, 0x6df, 0x700, 0x701, 0x702, 0x703, 0x704, 0x705, 0x706, 0x707, 0x708, 0x709, 0x70a, + 0x70b, 0x710, 0x711, 0x712, 0x713, 0x714, 0x715, 0x716, 0x717, 0x718, 0x719, 0x71a, 0x71b, + 0x720, 0x721, 0x722, 0x723, 0x724, 0x725, 0x726, 0x727, 0x728, 0x729, 0x72a, 0x72b, 0x72c, + 0x72d, 0x72e, 0x72f, 0x730, 0x731, 0x732, 0x733, 0x734, 0x735, 0x736, 0x737, 0x738, 0x739, + 0x73a, 0x73b, 0x73c, 0x73d, 0x73e, 0x73f, 0x740, 0x741, 0x742, 0x743, 0x744, 0x745, 0x746, + 0x747, 0x748, 0x749, 0x9ff, 0xc00, 0xc01, 0xc02, 0xc06, 0xc08, 0xc09, 0xc10, 0xc11, 0xc16, + 0xc17, 0xc20, 0xc21, 0xc22, 0xc24, 0xc30, 0xc31, 0xc32, 0xc33, 0xc34, 0xc35, 0xc36, 0xc37, + 0xc38, 0xc39, 0xc40, 0xc41, 0xc42, 0xc50, 0xc51, 0xc52, 0xc53, 0xc54, 0xc55, 0xc56, 0xc57, + 0xc60, 0xc61, 0xc62, 0xc70, 0xc71, 0xc72, 0xc73, 0xc74, 0xc75, 0xc76, 0xc77, 0xc84, 0xd94, + 0xd95, 0xd96, 0xd97, 0xd98, 0xd99, 0xd9a, 0xd9b, 0xda1, 0xda2, 0xda4, 0xdb3, 0xdb4, 0xdb5, + 0xdb6, 0xdb7, 0xdb8, 0xdb9, 0xdba, 0xdbb, 0xdc0, 0xdc1, 0xdc2, 0xdc3, 0xdc4, 0xdc5, 0xdc6, + 0xdc7, 0xdc8, 0xdc9, 0xdca, 0xdcb, 0xdcc, 0xdcd, 0xdce, 0xdcf, 0xdd0, 0xdd1, 0xdd2, 0xdd3, + 0xdd4, 0xdd5, 0xdd6, 0xdd7, 0xdd8, 0xdd9, 0xdda, 0xddb, 0xddc, 0xddd, 0xdde, 0xddf, 0xde0, + 0xde1, 0xde2, 0xde4, 0xdf0, 0xdf1, 0xdf2, 0xdf3, 0xdf4, 0xdf5, 0xdf6, 0xdf7, 0xdf8, 0xdf9, + 0xdfa, 0xdfb, 0xe02, 0xe03, 0xe04, 0xe05, 0xe06, 0xe07, 0xe08, 0xe09, 0xe0a, 
0xe0b, 0xe0c, + 0xe0d, 0xe0e, 0xe0f, 0xe10, 0xe11, 0xe12, 0xe13, 0xe14, 0xe15, 0xe16, 0xe17, 0xe18, 0xe19, + 0xe1a, 0xe1b, 0xe1c, 0xe1d, 0xe1e, 0xe1f, 0xe20, 0xe21, 0xe22, 0xe23, 0xe24, 0xe25, 0xe26, + 0xe27, 0xe28, 0xe29, 0xe2a, 0xe2b, 0xe2c, 0xe2d, 0xe2e, 0xe2f, 0xe30, 0xe31, 0xe32, 0xe33, + 0xe34, 0xe35, 0xe36, 0xe37, 0xe38, 0xe39, 0xe3a, 0xe3b, 0xe3c, 0xe3d, 0xe3e, 0xe3f, 0xe40, + 0xe41, 0xe42, 0xe43, 0xe44, 0xe45, 0xe46, 0xe47, 0xe48, 0xe49, 0xe4a, 0xe4b, 0xe4d, 0xe4e, + 0xe50, 0xe51, 0xe52, 0xe53, 0xe54, 0xe55, 0xe56, 0xe57, 0xe58, 0xe59, 0xe5a, 0xe5c, 0xe5d, + 0xe5e, 0xe60, 0xe61, 0xe62, 0xe63, 0xe64, 0xe65, 0xe66, 0xe67, 0xe68, 0xe69, 0xe6a, 0xe6b, + 0xe70, 0xe71, 0xe72, 0xe73, 0xe74, 0xe75, 0xe76, 0xe77, 0xe78, 0xe79, 0xe7a, 0xe7b, 0xe80, + 0xe81, 0xe82, 0xe83, 0xe84, 0xe85, 0xe86, 0xe87, 0xe88, 0xe89, 0xe8b, 0xe90, 0xe91, 0xe92, + 0xe93, 0xe94, 0xe95, 0xe96, 0xe97, 0xe98, 0xe99, 0xe9a, 0xe9b, 0xea0, 0xea1, 0xea2, 0xea3, + 0xea4, 0xea5, 0xea6, 0xea7, 0xea8, 0xea9, 0xeaa, 0xeab, 0xeb0, 0xeb1, 0xeb2, 0xeb3, 0xeb4, + 0xeb5, 0xeb6, 0xeb7, 0xeb8, 0xeb9, 0xeba, 0xebb, 0xec0, 0xec1, 0xec2, 0xec3, 0xec4, 0xec5, + 0xec6, 0xec7, 0xec8, 0xec9, 0xeca, 0xecb, 0xed0, 0xed1, 0xed2, 0xed3, 0xed4, 0xed5, 0xed6, + 0xed7, 0xed8, 0xed9, 0xeda, 0xedb, 0xee0, 0xee1, 0xee2, 0xee3, 0xee4, 0xee5, 0xee6, 0xee7, + 0xee8, 0xee9, 0xeea, 0xeeb, 0xef0, 0xef1, 0xef2, 0xef3, 0xef4, 0xef5, 0xef6, 0xef7, 0xef8, + 0xef9, 0xefa, 0xefb, 0xf00, 0xf01, 0xf02, 0xf03, 0xf04, 0xf05, 0xf06, 0xf07, 0xf08, 0xf09, + 0xf0a, 0xf0b, 0xf10, 0xf11, 0xf12, 0xf13, 0xf14, 0xf15, 0xf16, 0xf17, 0xf18, 0xf19, 0xf1a, + 0xf1b, 0xf40, 0xf41, 0xf42, 0xf50, 0xf51, 0xf52, 0xf53, 0xf54, 0xf55, 0xf56, 0xf57, 0xf58, + 0xf59, 0xf5a, 0xf5b, 0xfc0, 0xfc1, 0xfc2, 0xfd0, 0xfd1, 0xfd2, 0xfd3, 0xfd4, 0xfd5, 0xfd6, + 0xfd7, 0xfd8, 0xfd9, 0xfda, 0xfdb, 0x1309, 0x130a, 0x130b, 0x14c1, 0x14c2, 0x14c3, 0x14c4, + 0x14c5, 0x14c6, 0x14c7, 0x14c8, 0x1878, 0x1a8e, 0x1a8f, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, + 0x2009, 0x200a, 0x200b, 
0x2010, 0x2011, 0x2012, 0x2013, 0x2018, 0x2019, 0x201a, 0x201b, 0x2020, + 0x2021, 0x2022, 0x2023, 0x2028, 0x2029, 0x202a, 0x202b, 0x2030, 0x2031, 0x2032, 0x2033, 0x2038, + 0x2039, 0x203a, 0x203b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 0x2fd0, + 0x2fd1, 0x2fd2, 0x2fd3, 0x2fd4, 0x2fd5, 0x2fd8, 0x2fd9, 0x2fda, 0x2fdb, 0x2fdc, 0x2fdd, 0x2fde, + 0x2fdf, 0x2ff0, 0x2ff2, 0x107cc, 0x107cd, 0x107ce, 0x107cf, 0x107d0, 0x107d1, 0x107d2, 0x107d3, + 0x107d8, +]; + +// TODO: Look out for 0x13c (used to check for AES instruction on Intel Atom and ..)? +// TODO: 0x35 gives THREAD_COUNT will some programs stop working if we deny this MSR? + +// TODO: It is perfectly possible to convert the following test into compile time checks. +// We take care of that later. +#[cfg(test)] +mod tests { + use super::super::msr_based_features::INTEL_MSR_FEATURE_DEFINITIONS; + use super::super::{FORBIDDEN_IA32_MSR_RANGES, PERMITTED_IA32_MSRS}; + use super::NON_ARCHITECTURAL_INTEL_MSRS; + #[test] + fn disjoint_from_others() { + let mut unique_count = 0; + for msr in NON_ARCHITECTURAL_INTEL_MSRS { + if (!PERMITTED_IA32_MSRS.contains(&msr)) + && (!FORBIDDEN_IA32_MSR_RANGES + .iter() + .any(|r| (r.0..=r.1).contains(&msr))) + && (!INTEL_MSR_FEATURE_DEFINITIONS + .as_slice() + .iter() + .any(|(address, _)| address.0 == msr)) + { + unique_count += 1; + } + } + assert_eq!(unique_count, NON_ARCHITECTURAL_INTEL_MSRS.len()); + } +} From e81cd264679bd9833f6417a27ff89e69b7327910 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 13 Feb 2026 12:27:33 +0100 Subject: [PATCH 154/178] arch: Add lists of KVM MSRS We include a list of MSRS defined by KVM that may be approved by CPU profiles and another list of those that may not be approved by CPU profiles. These lists will later be used by the CPU profile generation tool. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/msr_definitions/kvm.rs | 93 ++++++++++++++++++++++++++ arch/src/x86_64/msr_definitions/mod.rs | 2 + 2 files changed, 95 insertions(+) create mode 100644 arch/src/x86_64/msr_definitions/kvm.rs diff --git a/arch/src/x86_64/msr_definitions/kvm.rs b/arch/src/x86_64/msr_definitions/kvm.rs new file mode 100644 index 0000000000..d85cc22f98 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/kvm.rs @@ -0,0 +1,93 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module lists KVM defined MSRS. It is currently only used when generating CPU profiles +//! (hence feature gated), but may possibly be extended and utilized for better debug logs in +//! the future. +pub(in crate::x86_64) use permitted_msrs::PROFILE_PERMITTED_KVM_MSRS; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::Parameters; + +mod permitted_msrs { + use super::{CpuidReg, Parameters}; + use crate::x86_64::cpuid_definitions::kvm::assert_not_denied_cpuid_feature; + + const MSR_KVM_WALL_CLOCK: u32 = 0x11; + const MSR_KVM_SYSTEM_TIME: u32 = 0x12; + const _KVM_CLOCKSOURCE_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<0>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b564d00; + const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01; + const _KVM_CLOCKSOURCE2_CHECK: () = assert_not_denied_cpuid_feature::<3>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_ASYNC_PF_EN: u32 = 0x4b564d02; + const _KVM_ASYNC_PF_CHECK: () = assert_not_denied_cpuid_feature::<4>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_STEAL_TIME: u32 = 0x4b564d03; + const _KVM_STEAL_TIME_CHECK: () = assert_not_denied_cpuid_feature::<5>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: 
(0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_EOI_EN: u32 = 0x4b564d04; + const _KVM_EOI_EN_CHECK: () = assert_not_denied_cpuid_feature::<6>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_POLL_CONTROL: u32 = 0x4b564d05; + const _KVM_POLL_CONTROL_CHECK: () = assert_not_denied_cpuid_feature::<12>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_ASYNC_PF_INT: u32 = 0x4b564d06; + const MSR_KVM_ASYNC_PF_ACK: u32 = 0x4b564d07; + const _KVM_ASYNC_PF_INT_ACK_CHECK: () = assert_not_denied_cpuid_feature::<14>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + const MSR_KVM_MIGRATION_CONTROL: u32 = 0x4b564d08; + const _KVM_MIGRATION_CONTROL_CHECK: () = assert_not_denied_cpuid_feature::<17>(&Parameters { + leaf: 0x4000_0001, + sub_leaf: (0..=0), + register: CpuidReg::EAX, + }); + + /// KVM defined MSRS that CPU profiles may include in their permitted MSR definitions. + /// + /// This list is (currently) only utilized when generating CPU profiles. 
+ pub(in crate::x86_64) const PROFILE_PERMITTED_KVM_MSRS: [u32; 11] = [ + MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME, + MSR_KVM_WALL_CLOCK_NEW, + MSR_KVM_SYSTEM_TIME_NEW, + MSR_KVM_ASYNC_PF_EN, + MSR_KVM_STEAL_TIME, + MSR_KVM_EOI_EN, + MSR_KVM_POLL_CONTROL, + MSR_KVM_ASYNC_PF_INT, + MSR_KVM_ASYNC_PF_ACK, + MSR_KVM_MIGRATION_CONTROL, + ]; +} diff --git a/arch/src/x86_64/msr_definitions/mod.rs b/arch/src/x86_64/msr_definitions/mod.rs index 3a9860cf58..d3feafc335 100644 --- a/arch/src/x86_64/msr_definitions/mod.rs +++ b/arch/src/x86_64/msr_definitions/mod.rs @@ -5,6 +5,8 @@ use serde::{Deserialize, Serialize}; pub mod intel; +#[cfg(all(feature = "kvm", feature = "cpu_profile_generation"))] +pub mod kvm; use crate::{deserialize_u32_hex, serialize_u32_hex}; /// The register address of an MSR From a3cc017bb970e32fbf453d8ca0813745ddc4e5b5 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Thu, 5 Mar 2026 17:11:01 +0100 Subject: [PATCH 155/178] arch: Add a list of HyperV MSRs The list of HyperV MSRs introduced here will be utilized during CPU profile generation and also at runtime to filter them out whenever `kvm_hyperv` is set to `false`. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/msr_definitions/hyperv.rs | 169 ++++++++++++++++++++++ arch/src/x86_64/msr_definitions/mod.rs | 2 + 2 files changed, 171 insertions(+) create mode 100644 arch/src/x86_64/msr_definitions/hyperv.rs diff --git a/arch/src/x86_64/msr_definitions/hyperv.rs b/arch/src/x86_64/msr_definitions/hyperv.rs new file mode 100644 index 0000000000..8d5b6577e4 --- /dev/null +++ b/arch/src/x86_64/msr_definitions/hyperv.rs @@ -0,0 +1,169 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! This module exports a list of all known Hyper-V MSRs that we found in Appendix F in +//! the Microsoft Hypervisor Top Level Functional Specification document from February 2017. 
+ +const HV_X64_MSR_GUEST_OS_ID: u32 = 0x40000000; +const HV_X64_MSR_HYPERCALL: u32 = 0x40000001; +const HV_X64_MSR_VP_INDEX: u32 = 0x40000002; +const HV_X64_MSR_RESET: u32 = 0x40000003; +const HV_X64_MSR_VP_RUNTIME: u32 = 0x40000010; +const HV_X64_MSR_TIME_REF_COUNT: u32 = 0x40000020; +const HV_X64_MSR_REFERENCE_TSC: u32 = 0x40000021; +const HV_X64_MSR_TSC_FREQUENCY: u32 = 0x40000022; +const HV_X64_MSR_APIC_FREQUENCY: u32 = 0x40000023; +const HV_X64_MSR_EOI: u32 = 0x40000070; +const HV_X64_MSR_ICR: u32 = 0x40000071; +const HV_X64_MSR_TPR: u32 = 0x40000072; +const HV_X64_MSR_VP_ASSIST_PAGE: u32 = 0x40000073; +const HV_X64_MSR_SCONTROL: u32 = 0x40000080; +const HV_X64_MSR_SVERSION: u32 = 0x40000081; +const HV_X64_MSR_SIEFP: u32 = 0x40000082; +const HV_X64_MSR_SIMP: u32 = 0x40000083; +const HV_X64_MSR_EOM: u32 = 0x40000084; +const HV_X64_MSR_SINT0: u32 = 0x40000090; +const HV_X64_MSR_SINT1: u32 = 0x40000091; +const HV_X64_MSR_SINT2: u32 = 0x40000092; +const HV_X64_MSR_SINT3: u32 = 0x40000093; +const HV_X64_MSR_SINT4: u32 = 0x40000094; +const HV_X64_MSR_SINT5: u32 = 0x40000095; +const HV_X64_MSR_SINT6: u32 = 0x40000096; +const HV_X64_MSR_SINT7: u32 = 0x40000097; +const HV_X64_MSR_SINT8: u32 = 0x40000098; +const HV_X64_MSR_SINT9: u32 = 0x40000099; +const HV_X64_MSR_SINT10: u32 = 0x4000009A; +const HV_X64_MSR_SINT11: u32 = 0x4000009B; +const HV_X64_MSR_SINT12: u32 = 0x4000009C; +const HV_X64_MSR_SINT13: u32 = 0x4000009D; +const HV_X64_MSR_SINT14: u32 = 0x4000009E; +const HV_X64_MSR_SINT15: u32 = 0x4000009F; +const HV_X64_MSR_STIMER0_CONFIG: u32 = 0x400000B0; +const HV_X64_MSR_STIMER0_COUNT: u32 = 0x400000B1; +const HV_X64_MSR_STIMER1_CONFIG: u32 = 0x400000B2; +const HV_X64_MSR_STIMER1_COUNT: u32 = 0x400000B3; +const HV_X64_MSR_STIMER2_CONFIG: u32 = 0x400000B4; +const HV_X64_MSR_STIMER2_COUNT: u32 = 0x400000B5; +const HV_X64_MSR_STIMER3_CONFIG: u32 = 0x400000B6; +const HV_X64_MSR_STIMER3_COUNT: u32 = 0x400000B7; +const HV_X64_MSR_POWER_STATE_TRIGGER_C1: u32 = 0x400000C1; 
+const HV_X64_MSR_POWER_STATE_TRIGGER_C2: u32 = 0x400000C2; +const HV_X64_MSR_POWER_STATE_TRIGGER_C3: u32 = 0x400000C3; +const HV_X64_MSR_POWER_STATE_CONFIG_C1: u32 = 0x400000D1; +const HV_X64_MSR_POWER_STATE_CONFIG_C2: u32 = 0x400000D2; +const HV_X64_MSR_POWER_STATE_CONFIG_C3: u32 = 0x400000D3; +const HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE: u32 = 0x400000E0; +const HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE: u32 = 0x400000E1; +const HV_X64_MSR_STATS_VP_RETAIL_PAGE: u32 = 0x400000E2; +const HV_X64_MSR_STATS_VP_INTERNAL_PAGE: u32 = 0x400000E3; +const HV_X64_MSR_GUEST_IDLE: u32 = 0x400000F0; +const HV_X64_MSR_SYNTH_DEBUG_CONTROL: u32 = 0x400000F1; +const HV_X64_MSR_SYNTH_DEBUG_STATUS: u32 = 0x400000F2; +const HV_X64_MSR_SYNTH_DEBUG_SEND_BUFFER: u32 = 0x400000F3; +const HV_X64_MSR_SYNTH_DEBUG_RECEIVE_BUFFER: u32 = 0x400000F4; +const HV_X64_MSR_SYNTH_DEBUG_PENDING_BUFFER: u32 = 0x400000F5; +const HV_X64_MSR_CRASH_P0: u32 = 0x40000100; +const HV_X64_MSR_CRASH_P1: u32 = 0x40000101; +const HV_X64_MSR_CRASH_P2: u32 = 0x40000102; +const HV_X64_MSR_CRASH_P3: u32 = 0x40000103; +const HV_X64_MSR_CRASH_P4: u32 = 0x40000104; +const HV_X64_MSR_CRASH_CTL: u32 = 0x40000105; + +/// This is a list of all Hyper-V MSRs that we found in Appendix F in the Microsoft +/// Hypervisor Top Level Functional Specification document from February 2017 +pub(in crate::x86_64) const HYPERV_MSRS: [u32; 64] = [ + HV_X64_MSR_GUEST_OS_ID, + HV_X64_MSR_HYPERCALL, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_TIME_REF_COUNT, + HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, + HV_X64_MSR_APIC_FREQUENCY, + HV_X64_MSR_EOI, + HV_X64_MSR_ICR, + HV_X64_MSR_TPR, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_SVERSION, + HV_X64_MSR_SIEFP, + HV_X64_MSR_SIMP, + HV_X64_MSR_EOM, + HV_X64_MSR_SINT0, + HV_X64_MSR_SINT1, + HV_X64_MSR_SINT2, + HV_X64_MSR_SINT3, + HV_X64_MSR_SINT4, + HV_X64_MSR_SINT5, + HV_X64_MSR_SINT6, + HV_X64_MSR_SINT7, + HV_X64_MSR_SINT8, + 
HV_X64_MSR_SINT9, + HV_X64_MSR_SINT10, + HV_X64_MSR_SINT11, + HV_X64_MSR_SINT12, + HV_X64_MSR_SINT13, + HV_X64_MSR_SINT14, + HV_X64_MSR_SINT15, + HV_X64_MSR_STIMER0_CONFIG, + HV_X64_MSR_STIMER0_COUNT, + HV_X64_MSR_STIMER1_CONFIG, + HV_X64_MSR_STIMER1_COUNT, + HV_X64_MSR_STIMER2_CONFIG, + HV_X64_MSR_STIMER2_COUNT, + HV_X64_MSR_STIMER3_CONFIG, + HV_X64_MSR_STIMER3_COUNT, + HV_X64_MSR_POWER_STATE_TRIGGER_C1, + HV_X64_MSR_POWER_STATE_TRIGGER_C2, + HV_X64_MSR_POWER_STATE_TRIGGER_C3, + HV_X64_MSR_POWER_STATE_CONFIG_C1, + HV_X64_MSR_POWER_STATE_CONFIG_C2, + HV_X64_MSR_POWER_STATE_CONFIG_C3, + HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, + HV_X64_MSR_STATS_VP_RETAIL_PAGE, + HV_X64_MSR_STATS_VP_INTERNAL_PAGE, + HV_X64_MSR_GUEST_IDLE, + HV_X64_MSR_SYNTH_DEBUG_CONTROL, + HV_X64_MSR_SYNTH_DEBUG_STATUS, + HV_X64_MSR_SYNTH_DEBUG_SEND_BUFFER, + HV_X64_MSR_SYNTH_DEBUG_RECEIVE_BUFFER, + HV_X64_MSR_SYNTH_DEBUG_PENDING_BUFFER, + HV_X64_MSR_CRASH_P0, + HV_X64_MSR_CRASH_P1, + HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, + HV_X64_MSR_CRASH_P4, + HV_X64_MSR_CRASH_CTL, +]; + +#[cfg(all(test, feature = "kvm", feature = "cpu_profile_generation"))] +mod tests { + use super::*; + use crate::x86_64::msr_definitions::intel::{ + INTEL_MSR_FEATURE_DEFINITIONS, PERMITTED_IA32_MSRS, + }; + use crate::x86_64::msr_definitions::kvm::PROFILE_PERMITTED_KVM_MSRS; + + // If this can be assumed then that simplifies some things. + // + // NOTE: It is perfectly possible to make this a compile time check instead, + // but that is more cumbersome hence we leave that for later. 
+ #[test] + fn does_not_intersect_other_permitted_msr_sets() { + for msr in HYPERV_MSRS { + assert!( + !INTEL_MSR_FEATURE_DEFINITIONS + .as_slice() + .iter() + .map(|r| r.0.0) + .chain(PERMITTED_IA32_MSRS) + .chain(PROFILE_PERMITTED_KVM_MSRS) + .any(|other_permitted_msr| other_permitted_msr == msr) + ); + } + } +} diff --git a/arch/src/x86_64/msr_definitions/mod.rs b/arch/src/x86_64/msr_definitions/mod.rs index d3feafc335..805b83c863 100644 --- a/arch/src/x86_64/msr_definitions/mod.rs +++ b/arch/src/x86_64/msr_definitions/mod.rs @@ -8,6 +8,8 @@ pub mod intel; #[cfg(all(feature = "kvm", feature = "cpu_profile_generation"))] pub mod kvm; +pub mod hyperv; + use crate::{deserialize_u32_hex, serialize_u32_hex}; /// The register address of an MSR #[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] From bdfeb7ee6735ffd8eadfe661b7d04db15e3dc6c7 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 21 Jan 2026 16:35:41 +0100 Subject: [PATCH 156/178] arch: Add required MSR update functionality We introduce functionality related to computing necessary MSR updates in accordance with the given CPU profile. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpu_profile.rs | 138 +++++++++++++++++++++++++++++++- arch/src/x86_64/mod.rs | 142 ++++++++++++++++++++++++++++++++- 2 files changed, 277 insertions(+), 3 deletions(-) diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs index d2084ee8bd..0f6befbca3 100644 --- a/arch/src/x86_64/cpu_profile.rs +++ b/arch/src/x86_64/cpu_profile.rs @@ -5,7 +5,7 @@ use std::io::Write; -use hypervisor::arch::x86::CpuIdEntry; +use hypervisor::arch::x86::{CpuIdEntry, MsrEntry}; use hypervisor::{CpuVendor, HypervisorType}; use log::error; use serde::ser::SerializeStruct; @@ -15,6 +15,7 @@ use thiserror::Error; use crate::deserialize_u32_hex; use crate::x86_64::CpuidReg; use crate::x86_64::cpuid_definitions::Parameters; +use crate::x86_64::msr_definitions::RegisterAddress; #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] #[serde(rename_all = "kebab-case")] @@ -92,6 +93,21 @@ impl CpuProfile { // We will probably need one profile per hypervisor. unreachable!() } + + /// Loads pre-generated MSR data associated with a CPU profile. + #[cfg(feature = "kvm")] + pub(in crate::x86_64) fn msr_data(&self) -> Option { + todo!() + } + + #[cfg(not(feature = "kvm"))] + pub(in crate::x86_64) fn msr_data(&self) -> Option { + if matches!(*self, Self::Host) { + return None; + } + // CPU profiles are currently only available when using KVM as the hypervisor. + unreachable!() + } } /// Every [`CpuProfile`] different from `Host` has associated [`CpuIdProfileData`]. 
@@ -263,10 +279,130 @@ impl CpuidOutputRegisterAdjustments { } } +#[derive(Debug, Clone)] +pub(in crate::x86_64) struct FeatureMsrAdjustment { + pub(in crate::x86_64) mask: u64, + pub(in crate::x86_64) replacements: u64, +} + +impl Serialize for FeatureMsrAdjustment { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut s = serializer.serialize_struct("FeatureMsrAdjustment", 2)?; + let mut serialize_field = |key, value| { + // two bytes for "0x" prefix and 16 for the hex encoded number + let mut buffer = [0_u8; 18]; + let _ = write!(&mut buffer[..], "{value:#018x}"); + let str = core::str::from_utf8(&buffer[..]) + .expect("the buffer should be filled with valid UTF-8 bytes"); + s.serialize_field(key, str) + }; + serialize_field("mask", self.mask)?; + serialize_field("replacements", self.replacements)?; + s.end() + } +} + +impl<'de> Deserialize<'de> for FeatureMsrAdjustment { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(Deserialize)] + struct ProvisionalFeatureMsrAdjustment<'a> { + #[serde(borrow)] + mask: &'a str, + #[serde(borrow)] + replacements: &'a str, + } + + let ProvisionalFeatureMsrAdjustment { mask, replacements } = + ProvisionalFeatureMsrAdjustment::deserialize(deserializer)?; + let parse_u64 = |hex: &str, field_name: &str| { + u64::from_str_radix(hex.strip_prefix("0x").unwrap_or(""), 16).map_err(|_| { + ::custom(format!("Unable to deserialize FeatureMsrAdjustment: could not deserialize {field_name} the value {hex} is not a hex encoded 64 bit integer")) + }) + }; + let mask = parse_u64(mask, "mask")?; + let replacements = parse_u64(replacements, "replacements")?; + Ok(FeatureMsrAdjustment { mask, replacements }) + } +} + +impl FeatureMsrAdjustment { + /// Returns a struct describing the Feature MSRs that should be set + /// and the ones that should be denied based on `adjustments` and the given + /// `feature_msrs`. 
+ /// + /// # Errors + /// + /// The only way for this to error is if there exists one or more entries in + /// `adjustments` that do not have a corresponding entry in `feature_msrs`. + /// In this case the missing MSR will be logged and the unit type is returned + /// as the error variant. + pub(in crate::x86_64) fn adjust_to( + adjustments: &[(RegisterAddress, FeatureMsrAdjustment)], + feature_msrs: &[MsrEntry], + ) -> Result, ()> { + let mut output_feature_msrs = Vec::with_capacity(feature_msrs.len()); + for (reg_address, adjustment) in adjustments { + let Some(entry) = feature_msrs + .iter() + .find(|entry| entry.index == reg_address.0) + else { + error!( + "Did not find feature based MSR entry for MSR:={:#x}", + reg_address.0 + ); + return Err(()); + }; + // Adjust the entry and push it to outputs + { + let mut entry = *entry; + let data = entry.data; + entry.data = (adjustment.mask & data) | adjustment.replacements; + // TODO: Perhaps trace! would be a better log level? + log::debug!( + "adjusted MSR-based feature: register address:={:#x} value:={:#x}, previous value:={data:#x}", + entry.index, + entry.data + ); + output_feature_msrs.push(entry); + } + } + Ok(output_feature_msrs) + } +} + +pub struct RequiredMsrUpdates { + pub msr_based_features: Vec, + pub denied_msrs: Vec, +} + +/// Every [`CpuProfile`] different from `Host` has associated [`MsrProfileData`]. +/// +/// New constructors of this struct may only be generated through the CHV CLI (when built from source with +/// the `cpu-profile-generation` feature) which other hosts may then attempt to load in order to +/// increase the likelihood of successful live migrations among all hosts that opted in to the given +/// CPU profile. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub(in crate::x86_64) struct MsrProfileData { + pub(in crate::x86_64) cpu_vendor: CpuVendor, + pub(in crate::x86_64) hypervisor_type: HypervisorType, + pub(in crate::x86_64) adjustments: Vec<(RegisterAddress, FeatureMsrAdjustment)>, + pub(in crate::x86_64) permitted_msrs: Vec, +} + #[derive(Debug, Error)] #[error("Required CPUID entries not found")] pub struct MissingCpuidEntriesError; +#[derive(Debug, Error)] +#[error("Required MSR entries not found")] +pub struct MissingMsrEntriesError; + #[cfg(test)] mod tests { use proptest::prelude::*; diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 762e9d39d3..3a7dacb9b4 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -23,6 +23,7 @@ mod mptable; mod smbios; use std::arch::x86_64; +use std::collections::{HashMap, HashSet}; use std::mem; use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry}; @@ -31,7 +32,7 @@ use linux_loader::loader::bootparam::{boot_params, setup_header}; use linux_loader::loader::elf::start_info::{ hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, }; -use log::{debug, error, info}; +use log::{debug, error, info, trace}; use serde::{Deserialize, Serialize}; pub use smbios::{SmbiosChassisConfig, SmbiosConfig, SmbiosSystem}; use thiserror::Error; @@ -40,7 +41,10 @@ use vm_memory::{ GuestMemoryRegion, }; -use crate::x86_64::cpu_profile::CpuidOutputRegisterAdjustments; +use crate::x86_64::cpu_profile::{ + CpuidOutputRegisterAdjustments, FeatureMsrAdjustment, RequiredMsrUpdates, +}; +use crate::x86_64::msr_definitions::RegisterAddress; use crate::{CpuProfile, GuestMemoryMmap, InitramfsConfig, RegionType}; // While modern architectures support more than 255 CPUs via x2APIC, @@ -140,11 +144,22 @@ pub enum Error { /// Error getting supported CPUID through the hypervisor (kvm/mshv) API #[error("Error getting supported CPUID through the hypervisor API")] CpuidGetSupported(#[source] HypervisorError), + /// Error 
getting the MSR-based features through the hypervisor (kvm) API + #[error("Error getting the MSR-based features through the hypervisor API")] + MsrBasedFeaturesGetSupported(#[source] HypervisorError), + + #[error("Error getting the MSRs supported by the hypervisor")] + MsrIndexList(#[source] HypervisorError), #[error( "The selected CPU profile cannot be utilized because the host's CPUID entries are not compatible with the profile" )] CpuProfileCpuidIncompatibility, + + #[error( + "The selected CPU profile cannot be utilized because the host's MSR-based features are not compatible with the profile" + )] + CpuProfileMsrIncompatibility, /// Error because TDX cannot be enabled when a custom (non host) CPU profile has been selected #[error("TDX cannot be enabled when a custom CPU profile has been selected")] CpuProfileTdxIncompatibility, @@ -961,6 +976,129 @@ pub fn generate_common_cpuid( } } +/// This function computes the [`RequiredMsrUpdates`] according to the +/// given `cpu_profile`, and `kvm_hyperv` parameters. +/// +/// If [`CpuProfile::Host`] is used then this function immediately returns `Ok(None)`, +/// regardless of the other parameters. +/// +/// ## Consistency with CPUID +/// +/// Some MSRs are only present when certain related bits in CPUID leaves are. +/// The CPU profile definition ensures consistency between the MSRs it permits and the +/// CPUID adjustments it prescribes. +/// +/// There are however certain CPUID values that can be modified by the VMM independently of the +/// CPUID profile and there may be corresponding MSRs that should then not be accessible. +/// At this point in time this only concerns the KVM and Hyper-V specific CPUID leaves and we +/// assume that the end user checks CPUID before accessing any of the related MSRs for now. +// TODO: Add `cpuid: &[CpuidEntry]` as a parameter and patch the permitted MSRs accordingly +// before upstreaming. 
+pub fn compute_required_msr_updates( + hypervisor: &dyn hypervisor::Hypervisor, + cpu_profile: CpuProfile, + kvm_hyperv: bool, +) -> super::Result> { + let Some(data) = cpu_profile.msr_data() else { + return Ok(None); + }; + + let cpu_vendor_host = hypervisor.get_cpu_vendor(); + let cpu_vendor_profile = data.cpu_vendor; + if cpu_vendor_host != cpu_vendor_profile { + return Err(Error::CpuProfileVendorIncompatibility { + cpu_vendor_profile, + cpu_vendor_host, + } + .into()); + } + + let msr_based_features = hypervisor + .get_msr_based_features() + .map_err(Error::MsrBasedFeaturesGetSupported)?; + + let msr_index_list = hypervisor + .get_msr_index_list() + .map_err(Error::MsrIndexList)?; + + let all_host_msrs: HashSet = msr_based_features + .iter() + .map(|entry| entry.index) + .chain(msr_index_list.iter().copied()) + .collect(); + + let mut permitted_msrs: HashSet = data.permitted_msrs.iter().map(|msr| msr.0).collect(); + + if kvm_hyperv { + // Log the Hyper-V MSRs that are not in the list of permitted MSRs. + // Some of these MSRs not being permitted by the profile might be benign or even intentional, + // but it might also indicate a BUG, or misconceptions that lead to bad CPU profiles. We thus + // log this at the info level for now. 
+ for msr in msr_definitions::hyperv::HYPERV_MSRS { + if !permitted_msrs.contains(&msr) { + info!( + "NOTE: Hyper-V MSR: {msr:#x} is not in the list of MSRs supported by the CPU profile" + ); + } + } + } else { + // Remove all HYPER-V MSRs from the list of permitted MSRs + for msr in msr_definitions::hyperv::HYPERV_MSRS { + if permitted_msrs.remove(&msr) { + trace!("Removed Hyper-V MSR {msr:#x} from the set of supported MSRs"); + } + } + } + + let forbidden_msrs: Vec = all_host_msrs + .difference(&permitted_msrs) + .map(|msr| RegisterAddress(*msr)) + .collect(); + + if (all_host_msrs.len() - forbidden_msrs.len()) != permitted_msrs.len() { + error!("Host does not have all the permitted MSRS"); + for msr in permitted_msrs.iter() { + if !all_host_msrs.contains(msr) { + error!("Host is missing the required MSR:={msr:#x}"); + } + } + Err(Error::CpuProfileMsrIncompatibility)?; + } + + // NOTE: It is fine to ignore the inner error because the called function logs any missing MSRs. + let adjusted_msr_based_features = + FeatureMsrAdjustment::adjust_to(&data.adjustments, &msr_based_features) + .map_err(|_| Error::CpuProfileMsrIncompatibility)?; + + // TODO: CPU profiles are only available for Intel CPUs at the moment. We need to branch on the vendor + // once we also have CPU profiles for AMD. 
+ assert!(matches!(cpu_vendor_host, CpuVendor::Intel)); + crate::x86_64::msr_definitions::intel::check_feature_msr_compatibility( + &HashMap::from_iter( + adjusted_msr_based_features + .iter() + .map(|entry| (entry.index, entry.data)), + ), + &HashMap::from_iter( + msr_based_features + .iter() + .map(|entry| (entry.index, entry.data)), + ), + "CPU Profile", + "Host", + ) + .map_err(|_| { + error!("feature-based MSR compatibility check failed"); + Error::CpuProfileMsrIncompatibility + })?; + + let update = RequiredMsrUpdates { + msr_based_features: adjusted_msr_based_features, + denied_msrs: forbidden_msrs, + }; + Ok(Some(update)) +} + #[allow(clippy::too_many_arguments)] pub fn configure_vcpu( vcpu: &dyn hypervisor::Vcpu, From 1ba49cf76c452e17979a5818fb9a335d6fee87ac Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Feb 2026 13:00:22 +0100 Subject: [PATCH 157/178] arch: Deny MSR functionality We introduce functionality to filter out MSRs which we want to deny guests from using. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/mod.rs | 13 +- arch/src/x86_64/msr_filter.rs | 361 ++++++++++++++++++++++++++++++++++ 2 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 arch/src/x86_64/msr_filter.rs diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 3a7dacb9b4..b0302ca34b 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -20,6 +20,7 @@ pub mod tdx; mod mpspec; mod mptable; +mod msr_filter; mod smbios; use std::arch::x86_64; @@ -27,12 +28,13 @@ use std::collections::{HashMap, HashSet}; use std::mem; use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry}; -use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError}; +use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError, HypervisorVmError}; use linux_loader::loader::bootparam::{boot_params, setup_header}; use linux_loader::loader::elf::start_info::{ hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, }; use log::{debug, error, info, trace}; +pub use msr_filter::{MAX_BITMAP_SIZE, filter_denied_msrs}; use serde::{Deserialize, Serialize}; pub use smbios::{SmbiosChassisConfig, SmbiosConfig, SmbiosSystem}; use thiserror::Error; @@ -160,6 +162,15 @@ pub enum Error { "The selected CPU profile cannot be utilized because the host's MSR-based features are not compatible with the profile" )] CpuProfileMsrIncompatibility, + + #[error( + "Unable to apply MSR filter: Bitmaps exceed maximum permitted memory usage: {0} > {MAX_BITMAP_SIZE}" + )] + MsrFilterTooLarge(usize), + + #[error("The hypervisor failed to set the given MSR filter")] + MsrFilter(#[source] HypervisorVmError), + /// Error because TDX cannot be enabled when a custom (non host) CPU profile has been selected #[error("TDX cannot be enabled when a custom CPU profile has been selected")] CpuProfileTdxIncompatibility, diff --git a/arch/src/x86_64/msr_filter.rs b/arch/src/x86_64/msr_filter.rs new file mode 100644 index 
0000000000..0edad0e364
--- /dev/null
+++ b/arch/src/x86_64/msr_filter.rs
@@ -0,0 +1,361 @@
+// Copyright © 2025 Cyberus Technology GmbH
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::cell::Cell;
+use std::fmt::Write;
+
+use hypervisor::MsrFilterRange;
+
+use super::Error;
+
+/// The maximum number of MSR filter ranges an MSR filter may consist of.
+const MAX_FILTERS: usize = {
+    #[cfg(feature = "kvm")]
+    {
+        hypervisor::kvm::KVM_MSR_FILTER_MAX_RANGES
+    }
+    #[cfg(not(feature = "kvm"))]
+    {
+        // TODO: Change this when adding support for CPU profiles with MSHV
+        16
+    }
+};
+
+/// The maximum number of bytes the bitmap arena used for the filter may occupy.
+/// This is to ensure that we do not allocate too much memory for the bitmaps in
+/// the filter ranges (the budget is 1 MiB per permitted filter range).
+pub const MAX_BITMAP_SIZE: usize = MAX_FILTERS * 1024 * 1024;
+
+/// Apply a filter which denies guests any kind of access to the MSRs in `denied_msrs`.
+///
+/// # Assumptions
+///
+/// This function may explicitly mark certain MSRs different from those in `denied_msrs` as
+/// both read and write permitted. We assume that the hypervisor will permit this filter being set
+/// regardless and rather injects an exception if guests attempt to read/modify these MSRs in any way
+/// that is incompatible with the hardware and/or hypervisor.
+///
+/// # Errors
+///
+/// This errors if any of the following conditions hold:
+///
+/// 1. Too much memory is required to construct the MSR filter that covers all of the denied MSRs.
+/// 2. The VM/Hypervisor fails to apply the MSR filter.
+pub fn filter_denied_msrs(
+    mut denied_msrs: Vec<u32>,
+    vm: &dyn hypervisor::Vm,
+) -> Result<(), crate::Error> {
+    if denied_msrs.is_empty() {
+        return Ok(());
+    }
+    denied_msrs.sort_unstable();
+    denied_msrs.dedup(); // duplicates carry no information; handle every MSR exactly once
+    for msr in &denied_msrs {
+        log::debug!("MSR:={msr:#x} is set to be denied");
+    }
+
+    let mut bitmap_arena = Vec::new();
+    let (filter, num_filter_ranges) = denied_to_filter(&denied_msrs, &mut bitmap_arena)?;
+
+    if let Err(e) = vm.msr_filter(&filter[..num_filter_ranges], false) {
+        // Log more details at the debug level. Note that this error is likely to be reproducible and happens close to startup, hence
+        // it should be relatively easy to set the necessary log level if/when debugging becomes desirable.
+        for filter_range in &filter[..num_filter_ranges] {
+            // Encode the bitmap as a string of the form "[0x1,0xff,...]" (at most 5 chars per byte)
+            let mut bitmap_hex_encoded = String::with_capacity((5 * filter_range.bitmap.len()) + 2);
+            let _ = write!(&mut bitmap_hex_encoded, "[");
+            for b in filter_range.bitmap.iter() {
+                let _ = write!(&mut bitmap_hex_encoded, "{b:#x},");
+            }
+            // Remove the final "," from the string
+            bitmap_hex_encoded.pop();
+            let _ = write!(&mut bitmap_hex_encoded, "]");
+            log::debug!(
+                "Failed to set MSR filter containing filter range: base:={:#x}, nmsrs:={:#x}, bitmap:={}",
+                filter_range.base,
+                filter_range.nmsrs,
+                bitmap_hex_encoded
+            );
+        }
+        Err(Into::into(Error::MsrFilter(e)))
+    } else {
+        Ok(())
+    }
+}
+
+/// Essentially partitions `denied_sorted` into up to [`MAX_FILTERS`] ranges of
+/// indices.
+///
+/// These ranges may then be used to place the MSRs into distinct [`MsrFilterRanges`](MsrFilterRange).
+/// In other words: if (a,b) is an entry in the output of this function, then all MSRs in
+/// `denied_sorted[a..=b]` are intended to be placed in the same filter range.
+///
+/// This partition minimizes the amount of memory necessary to construct the bitmaps for each
+/// MSR filter range, that collectively cover all MSRs in `denied_sorted`, under the constraint
+/// that none of the MSR filter ranges can intersect the x2APIC-related MSR range (0x801..=0x8ff).
+///
+/// ## Performance
+///
+/// This function has complexity `O(MAX_FILTERS * denied_sorted.len())` and does not allocate.
+fn denied_to_range_indices<'a>(
+    denied_sorted: &[u32],
+    r_buff: &'a mut [(usize, usize); MAX_FILTERS],
+) -> &'a [(usize, usize)] {
+    let mut d_prevs = [u32::MAX; MAX_FILTERS];
+    let mut r_cnt = 0;
+    let mut min_dprev = u32::MAX;
+    let mut min_pos = 0_usize;
+
+    let compute_dprev = |p: u32, n: u32| {
+        // Make the gap impractically large if bridging `p..=n` would overlap the x2apic MSR range
+        if (p <= 0x8ff) && (n > 0x800) {
+            u32::MAX
+        } else {
+            n - p
+        }
+    };
+
+    // Called as soon as we discover a full contiguous range of MSRs to be denied.
+    // `r_s` is the index of the first MSR in this range and `r_e` the index of the last.
+    let mut eval_deny_range = |r_s: usize, r_e: usize| {
+        const LAST_IDX: usize = MAX_FILTERS - 1;
+        let is_first = r_cnt == 0;
+
+        let d_prev = if is_first {
+            u32::MAX
+        } else {
+            let l_prev_idx = r_buff[r_cnt - 1].1;
+            let l_prev = denied_sorted[l_prev_idx];
+            compute_dprev(l_prev, denied_sorted[r_s])
+        };
+
+        if r_cnt < MAX_FILTERS {
+            d_prevs[r_cnt] = d_prev;
+            r_buff[r_cnt] = (r_s, r_e);
+            if d_prev < min_dprev {
+                min_dprev = d_prev;
+                min_pos = r_cnt;
+            }
+            r_cnt += 1;
+        } else {
+            // All slots are taken: we need to join range groups to find space.
+            // The idea is to merge the range groups closest to each other
+            if d_prev <= min_dprev {
+                // Make the final range group cover this range
+                r_buff[LAST_IDX].1 = r_e;
+            } else {
+                // Merge the group at `min_pos` into its predecessor to make space
+                r_buff[min_pos - 1].1 = r_buff[min_pos].1;
+                // Shift everything after min_pos left
+                {
+                    shift_left(&mut r_buff[min_pos..]);
+                    shift_left(&mut d_prevs[min_pos..]);
+                }
+                // Now we have space for the new entry
+                r_buff[LAST_IDX] = (r_s, r_e);
+                d_prevs[LAST_IDX] = d_prev;
+                // Recompute the smallest gap and the position of the group following it
+                min_dprev = *d_prevs.iter().min().unwrap();
+                min_pos = d_prevs.iter().position(|d| *d == min_dprev).unwrap();
+            }
+        }
+    };
+    // Produce all range groups
+    let mut offset = 0_usize;
+    let mut deny_slice = denied_sorted;
+    while let Some(deny_slice_skip1) = deny_slice.get(1..) {
+        let Some(pos) = deny_slice_skip1
+            .iter()
+            .zip(deny_slice)
+            .position(|(n, p)| (n - p) > 1)
+        else {
+            break;
+        };
+        let r_s = offset;
+        let r_e = offset + pos;
+        eval_deny_range(r_s, r_e);
+        offset = r_e + 1;
+        deny_slice = &denied_sorted[offset..];
+    }
+    // Since there is no gap beyond the last element, we have one final deny range to
+    // evaluate (NOTE: this indexes `len() - 1`, so `denied_sorted` must not be empty)
+    eval_deny_range(offset, denied_sorted.len() - 1);
+    &r_buff[..r_cnt]
+}
+
+/// Construct `range_indices.len() (<= MAX_FILTERS)` [`MsrFilterRanges`](MsrFilterRange)
+/// to deny all MSRs in `denied_sorted`.
+///
+/// For each pair `(r_s, r_e)` in `range_indices` there will be a corresponding
+/// filter range denying the MSRs in `denied_sorted[r_s..=r_e]`.
+///
+/// # Errors
+///
+/// This function can only error if more than [`MAX_BITMAP_SIZE`] bytes are required
+/// to construct the filters.
+///
+/// # Performance
+///
+/// This function allocates once (but a possibly large allocation) and has otherwise
+/// computational complexity `O(MAX_FILTERS * denied_sorted.len())`.
+fn range_indices_to_filter<'a>(
+    denied_sorted: &[u32],
+    range_indices: &[(usize, usize)],
+    bitmap_arena: &'a mut Vec<u8>,
+) -> Result<[MsrFilterRange<'a>; MAX_FILTERS], Error> {
+    let mut out = [MsrFilterRange::default().with_read_write_flags(); MAX_FILTERS];
+    let bytes_to_allocate: usize = range_indices
+        .iter()
+        .copied()
+        .map(|(s, e)| ((denied_sorted[e] - denied_sorted[s]) + 1).div_ceil(8))
+        .map(|v| v as usize)
+        .sum();
+
+    if bytes_to_allocate > MAX_BITMAP_SIZE {
+        return Err(Error::MsrFilterTooLarge(bytes_to_allocate));
+    }
+
+    bitmap_arena.extend(std::iter::repeat_n(u8::MAX, bytes_to_allocate));
+
+    let mut arena_slice = &mut bitmap_arena[..];
+    for (idx, (r_s, r_e)) in range_indices.iter().enumerate() {
+        let base = denied_sorted[*r_s];
+        let nmsrs = (denied_sorted[*r_e] - denied_sorted[*r_s]) + 1;
+        let (bm, rest) = arena_slice.split_at_mut(nmsrs.div_ceil(8) as usize);
+        arena_slice = rest;
+        for msr in &denied_sorted[*r_s..=*r_e] {
+            let d_base = *msr - base;
+            let byte_idx = (d_base / 8) as usize;
+            // Clear (rather than toggle) the bit so that a repeated MSR stays denied
+            bm[byte_idx] &= !(1_u8 << (d_base % 8));
+        }
+        // Set the fields in the range filter
+        {
+            let filter_range = &mut out[idx];
+            filter_range.base = base;
+            filter_range.nmsrs = nmsrs;
+            filter_range.bitmap = bm;
+        }
+    }
+
+    Ok(out)
+}
+
+/// Prepare up to [`MAX_FILTERS`] [`MsrFilterRanges`](MsrFilterRange)
+/// that collectively deny each of the MSRs specified in `denied_sorted`.
+///
+/// The second component returned from this function is the number of
+/// valid entries in the returned array.
+///
+/// # Errors
+///
+/// This function can only error if more than [`MAX_BITMAP_SIZE`] bytes are required
+/// to construct the filters.
+fn denied_to_filter<'a>(
+    denied_sorted: &[u32],
+    bitmap_arena: &'a mut Vec<u8>,
+) -> Result<([MsrFilterRange<'a>; MAX_FILTERS], usize), Error> {
+    let mut range_indices_buffer = [(0, 0); MAX_FILTERS];
+    let range_indices = denied_to_range_indices(denied_sorted, &mut range_indices_buffer);
+
+    range_indices_to_filter(denied_sorted, range_indices, bitmap_arena)
+        .map(|filter| (filter, range_indices.len()))
+}
+
+/// Convenience function that rotates the elements of `slice` left by one position.
+///
+/// Every element moves one slot towards the front, while the slice's first element
+/// ends up in the last slot (which callers are expected to overwrite afterwards).
+fn shift_left<T>(slice: &mut [T]) {
+    for w in Cell::from_mut(slice).as_slice_of_cells().windows(2) {
+        Cell::swap(&w[0], &w[1]);
+    }
+}
+
+#[cfg(test)]
+mod unit_tests {
+    use hypervisor::MsrFilterRange;
+    use proptest::prelude::*;
+
+    use super::{MAX_BITMAP_SIZE, MAX_FILTERS, denied_to_filter};
+
+    /// Maps entries out of the x2apic MSR range (0x800..=0x8ff) and sorts + dedups the vector
+    fn prepare(bases: Vec<u32>) -> Vec<u32> {
+        // Remove bases in the x2apic MSR range
+        let mut v: Vec<u32> = bases
+            .into_iter()
+            .map(|b| {
+                if (0x800..=0x8ff).contains(&b) {
+                    b % 0x800
+                } else {
+                    b
+                }
+            })
+            .collect();
+        v.sort_unstable();
+        v.dedup();
+        v
+    }
+
+    fn filter_to_msrs(filter: &[MsrFilterRange<'_>]) -> Vec<u32> {
+        let mut out = Vec::new();
+        for filter_range in filter {
+            let base = filter_range.base;
+            let mut num_msrs: u32 = 0;
+            for byte in filter_range.bitmap {
+                let mut inverse = !(*byte);
+                while inverse != 0 {
+                    let idx = inverse.trailing_zeros();
+                    if num_msrs + idx >= filter_range.nmsrs {
+                        break;
+                    }
+                    out.push(base + num_msrs + idx);
+                    let lsb = inverse & inverse.wrapping_neg();
+                    inverse ^= lsb;
+                }
+                num_msrs += 8;
+            }
+        }
+        out
+    }
+
+    proptest! {
+        #[test]
+        fn denied_to_filter_works_short(prepared_msrs in (prop::collection::vec(0..u32::MAX, 1..MAX_FILTERS)).prop_map(prepare)) {
+            let mut bitmap_arena = Vec::new();
+            let Ok((filter, num_filter_ranges)) = denied_to_filter(&prepared_msrs, &mut bitmap_arena) else {
+                return Ok(());
+            };
+            let mut recomputed_msrs = filter_to_msrs(&filter[..num_filter_ranges]);
+            recomputed_msrs.sort_unstable();
+            prop_assert_eq!(prepared_msrs, recomputed_msrs);
+        }
+    }
+
+    proptest! {
+        #[test]
+        fn denied_to_filter_works(prepared_msrs in (prop::collection::vec(0..u32::MAX, 17..70)).prop_map(prepare)) {
+            let mut bitmap_arena = Vec::new();
+            let Ok((filter, num_filter_ranges)) = denied_to_filter(&prepared_msrs, &mut bitmap_arena) else {
+                return Ok(());
+            };
+            let mut recomputed_msrs = filter_to_msrs(&filter[..num_filter_ranges]);
+            recomputed_msrs.sort_unstable();
+            prop_assert_eq!(prepared_msrs, recomputed_msrs);
+        }
+    }
+
+    // Simple test that doesn't take too long to execute. We can
+    // include a more thorough test later if desired.
+    #[test]
+    fn catches_attempt_to_allocate_too_much_memory() {
+        let mut bitmap_arena = Vec::new();
+        let denied_msrs: Vec<u32> = (0..MAX_FILTERS * 8 * 2)
+            .map(|i| i * MAX_BITMAP_SIZE)
+            .map(|v| u32::try_from(v).unwrap())
+            .collect();
+        let _ = denied_to_filter(&denied_msrs, &mut bitmap_arena).unwrap_err();
+    }
+}

From 6c332a1564a48e37e1df9ff58ef4945d0a90def7 Mon Sep 17 00:00:00 2001
From: Oliver Anderson
Date: Mon, 26 Jan 2026 15:37:17 +0100
Subject: [PATCH 158/178] vmm: Apply MSR adjustments according to CPU profile (if any)

We record the necessary MSR-based feature modifications that need to be
set in the `CpuManager` and make sure to set these MSR values upon vCPU
configuration.

We also use the Vm to filter access to MSRs that are incompatible with
the chosen CPU profile.
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/mod.rs | 5 +-- arch/src/x86_64/regs.rs | 36 +++++++++++++++++--- vmm/src/cpu.rs | 74 ++++++++++++++++++++++++++++++++++++++++- vmm/src/vm.rs | 26 ++++++++++----- 4 files changed, 124 insertions(+), 17 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index b0302ca34b..b8c10d9e0e 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -27,7 +27,7 @@ use std::arch::x86_64; use std::collections::{HashMap, HashSet}; use std::mem; -use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry}; +use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry, MsrEntry}; use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError, HypervisorVmError}; use linux_loader::loader::bootparam::{boot_params, setup_header}; use linux_loader::loader::elf::start_info::{ @@ -1116,6 +1116,7 @@ pub fn configure_vcpu( id: u32, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, cpuid: Vec, + feature_msrs: &[MsrEntry], kvm_hyperv: bool, cpu_vendor: CpuVendor, topology: (u16, u16, u16, u16), @@ -1192,7 +1193,7 @@ pub fn configure_vcpu( vcpu.enable_hyperv_synic().unwrap(); } - regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?; + regs::setup_msrs(vcpu, feature_msrs).map_err(Error::MsrsConfiguration)?; if let Some((kernel_entry_point, guest_memory)) = boot_setup { if setup_registers { regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; diff --git a/arch/src/x86_64/regs.rs b/arch/src/x86_64/regs.rs index c93f39520b..baaedf57ed 100644 --- a/arch/src/x86_64/regs.rs +++ b/arch/src/x86_64/regs.rs @@ -10,7 +10,7 @@ use std::{mem, result}; use hypervisor::arch::x86::gdt::{gdt_entry, segment_from_gdt}; use hypervisor::arch::x86::regs::CR0_PE; -use hypervisor::arch::x86::{FpuState, SpecialRegisters}; +use hypervisor::arch::x86::{FpuState, MsrEntry, SpecialRegisters}; use thiserror::Error; use vm_memory::{Address, Bytes, GuestMemory, 
GuestMemoryError};
@@ -33,6 +33,8 @@ pub enum Error {
     /// Setting up MSRs failed.
     #[error("Setting up MSRs failed")]
     SetModelSpecificRegisters(#[source] hypervisor::HypervisorCpuError),
+    #[error("Setting up MSRs failed: Not all MSRs could be set. See logs for more info.")]
+    SetModelSpecificRegistersPartial,
     /// Failed to set SREGs for this CPU.
     #[error("Failed to set SREGs for this CPU")]
     SetStatusRegisters(#[source] hypervisor::HypervisorCpuError),
@@ -81,11 +83,35 @@ pub fn setup_fpu(vcpu: &dyn hypervisor::Vcpu) -> Result<()> {
 /// # Arguments
 ///
 /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd.
-pub fn setup_msrs(vcpu: &dyn hypervisor::Vcpu) -> Result<()> {
-    vcpu.set_msrs(vcpu.boot_msr_entries())
-        .map_err(Error::SetModelSpecificRegisters)?;
+/// * `feature_msr_updates` - A (possibly empty) slice of MSR-based features
+///   that should be set as part of the setup. If the slice is empty then
+///   only the boot MSR entries are set, otherwise the given slice is written
+///   first, followed by the boot MSR entries.
+pub fn setup_msrs(vcpu: &dyn hypervisor::Vcpu, feature_msr_updates: &[MsrEntry]) -> Result<()> {
+    let boot_entries = vcpu.boot_msr_entries();
+    let mut entries_for_update = Vec::new();
+    let setup_entries: &mut &[MsrEntry] = &mut (&boot_entries[..]);
 
-    Ok(())
+    if !feature_msr_updates.is_empty() {
+        entries_for_update.extend_from_slice(feature_msr_updates);
+        entries_for_update.extend_from_slice(boot_entries);
+        *setup_entries = &entries_for_update[..];
+    }
+    let num_msrs_written = vcpu
+        .set_msrs(setup_entries)
+        .map_err(Error::SetModelSpecificRegisters)?;
+    if num_msrs_written < setup_entries.len() {
+        for msr in &setup_entries[num_msrs_written..] {
+            log::error!(
+                "Could not set MSR with register address:={:#x} and value:={:#x}",
+                msr.index,
+                msr.data
+            );
+        }
+        Err(Into::into(Error::SetModelSpecificRegistersPartial))
+    } else {
+        Ok(())
+    }
 }
 
 /// Configure base registers for a given CPU.
diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 49002d47dc..b23ae0a413 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -143,6 +143,12 @@ pub enum Error { #[error("Error generating common CPUID")] CommonCpuId(#[source] arch::Error), + #[error("Error computing required MSR updates")] + RequiredMsrUpdates(#[source] arch::Error), + + #[error("Error applying MSR filter")] + MsrFilter(#[source] arch::Error), + #[error("Error configuring vCPU")] VcpuConfiguration(#[source] arch::Error), @@ -556,11 +562,13 @@ impl Vcpu { /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. /// * `guest_memory` - Guest memory. /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. + #[cfg_attr(feature = "igvm", expect(clippy::too_many_arguments))] pub fn configure( &mut self, #[cfg(target_arch = "aarch64")] vm: &dyn hypervisor::Vm, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, #[cfg(target_arch = "x86_64")] cpuid: Vec, + #[cfg(target_arch = "x86_64")] feature_msr_updates: &[MsrEntry], #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, #[cfg(target_arch = "x86_64")] topology: (u16, u16, u16, u16), #[cfg(target_arch = "x86_64")] nested: bool, @@ -595,6 +603,7 @@ impl Vcpu { self.id, boot_setup, cpuid, + feature_msr_updates, kvm_hyperv, self.vendor, topology, @@ -723,6 +732,8 @@ pub struct CpuManager { #[cfg(target_arch = "x86_64")] /// A buffer for MSRs supported by the hardware and hypervisor msr_buffer: Vec, + #[cfg(target_arch = "x86_64")] + profile_msr_based_features: Vec, #[cfg_attr(target_arch = "aarch64", allow(dead_code))] vm: Arc, vcpus_kill_signalled: Arc, @@ -958,6 +969,8 @@ impl CpuManager { cpuid: Vec::new(), #[cfg(target_arch = "x86_64")] msr_buffer: Self::construct_msr_buffer(hypervisor.as_ref())?, + #[cfg(target_arch = "x86_64")] + profile_msr_based_features: Vec::new(), vm, vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), @@ -1024,6 
+1037,63 @@ impl CpuManager {
         Ok(())
     }
 
+    #[cfg(target_arch = "x86_64")]
+    /// Prepares common MSR-based feature value updates that will be set when vCPUs are configured.
+    ///
+    /// This is only relevant when (non-host) CPU profiles are present, otherwise an empty vector
+    /// is set. Panics if a computed MSR-based feature is missing from the manager's MSR buffer.
+    pub fn apply_msr_updates(&mut self) -> Result<()> {
+        let profile_msr_based_features = {
+            if let Some(arch::x86_64::cpu_profile::RequiredMsrUpdates {
+                msr_based_features,
+                denied_msrs,
+            }) = arch::x86_64::compute_required_msr_updates(
+                self.hypervisor.as_ref(),
+                self.config.profile,
+                self.config.kvm_hyperv,
+            )
+            .map_err(Error::RequiredMsrUpdates)?
+            {
+                // Remove denied MSRs from the MSR buffer
+                self.msr_buffer.retain(|entry| {
+                    !denied_msrs
+                        .contains(&arch::x86_64::msr_definitions::RegisterAddress(entry.index))
+                });
+
+                // Assert that all required msr_based_features to be set are part of the MSR buffer.
+                // It is a bug if this is not the case
+                for msr in &msr_based_features {
+                    if !self
+                        .msr_buffer
+                        .iter()
+                        .any(|msr_container| msr_container.index == msr.index)
+                    {
+                        error!(
+                            "BUG: feature based MSR:={:#x} is not contained in the set MSR buffer for the CPU manager",
+                            msr.index
+                        );
+                        panic!(
+                            "Broken invariant: The CPU Manager's MSR buffer does not have an entry for the computed MSR-based feature update"
+                        );
+                    }
+                }
+
+                // Create and apply a filter to prevent guests from accessing the denied MSRs
+                // TODO: Better error!
+                arch::x86_64::filter_denied_msrs(
+                    denied_msrs.into_iter().map(|reg| reg.0).collect(),
+                    self.vm.as_ref(),
+                )
+                .map_err(Error::MsrFilter)?;
+                msr_based_features
+            } else {
+                Vec::new()
+            }
+        };
+        self.profile_msr_based_features = profile_msr_based_features;
+        Ok(())
+    }
+
     fn create_vcpu(
         &mut self,
         cpu_id: u32,
@@ -1118,6 +1188,7 @@ impl CpuManager {
             vcpu.configure(
                 boot_setup,
                 self.cpuid.clone(),
+                &self.profile_msr_based_features,
                 self.config.kvm_hyperv,
                 topology,
                 self.config.nested,
@@ -3471,8 +3542,9 @@ mod unit_tests {
         let vm = hv
             .create_vm(HypervisorVmConfig::default())
             .expect("new VM fd creation failed");
+        // TODO: Use a proper MSR buffer here
         let vcpu = vm.create_vcpu(0, None, vec![]).unwrap();
-        setup_msrs(vcpu.as_ref()).unwrap();
+        setup_msrs(vcpu.as_ref(), &[]).unwrap();
 
         // This test will check against the last MSR entry configured (the tenth one).
         // See create_msr_entries for details.
diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs
index ecc7fc6aa7..d62173e5fa 100644
--- a/vmm/src/vm.rs
+++ b/vmm/src/vm.rs
@@ -821,15 +821,23 @@ impl Vm {
             .map_err(Error::CpuManager)?;
 
         #[cfg(target_arch = "x86_64")]
-        cpu_manager
-            .lock()
-            .unwrap()
-            .populate_cpuid(
-                hypervisor.as_ref(),
-                #[cfg(feature = "tdx")]
-                tdx_enabled,
-            )
-            .map_err(Error::CpuManager)?;
+        {
+            cpu_manager
+                .lock()
+                .unwrap()
+                .populate_cpuid(
+                    hypervisor.as_ref(),
+                    #[cfg(feature = "tdx")]
+                    tdx_enabled,
+                )
+                .map_err(Error::CpuManager)?;
+
+            cpu_manager
+                .lock()
+                .unwrap()
+                .apply_msr_updates()
+                .map_err(Error::CpuManager)?;
+        }
 
         Ok(cpu_manager)
     }

From 6ef99eadeb35cf23a516a724912c5b38c2fe2329 Mon Sep 17 00:00:00 2001
From: Oliver Anderson
Date: Mon, 26 Jan 2026 19:54:01 +0100
Subject: [PATCH 159/178] arch: Make the CPU profile generation tool MSR aware

We adapt the CPU profile generation tool to also take the MSR-based
features into account.
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpu_profile_generation.rs | 325 +++++++++++++++++++--- 1 file changed, 281 insertions(+), 44 deletions(-) diff --git a/arch/src/x86_64/cpu_profile_generation.rs b/arch/src/x86_64/cpu_profile_generation.rs index 8ef3fac900..a66a336bfa 100644 --- a/arch/src/x86_64/cpu_profile_generation.rs +++ b/arch/src/x86_64/cpu_profile_generation.rs @@ -2,20 +2,25 @@ // // SPDX-License-Identifier: Apache-2.0 // + +use std::collections::HashSet; use std::fs::File; use std::io::Write; -use std::ops::RangeInclusive; +use std::ops::{BitOr, RangeInclusive, Shl}; +use std::path::PathBuf; use anyhow::{Context, anyhow}; -use hypervisor::arch::x86::CpuIdEntry; +use hypervisor::arch::x86::{CpuIdEntry, MsrEntry}; use hypervisor::{CpuVendor, Hypervisor, HypervisorError, HypervisorType}; +use log::warn; -use crate::x86_64::cpu_profile::CpuIdProfileData; +use crate::x86_64::cpu_profile::{CpuIdProfileData, FeatureMsrAdjustment, MsrProfileData}; #[cfg(feature = "kvm")] use crate::x86_64::cpuid_definitions::CpuidDefinitions; use crate::x86_64::cpuid_definitions::intel::INTEL_CPUID_DEFINITIONS; use crate::x86_64::cpuid_definitions::kvm::KVM_CPUID_DEFINITIONS; use crate::x86_64::cpuid_definitions::{Parameters, ProfilePolicy}; +use crate::x86_64::msr_definitions::{self, MsrDefinitions, RegisterAddress}; use crate::x86_64::{CpuidOutputRegisterAdjustments, CpuidReg}; /// Generate CPU profile data and convert it to a string, embeddable as Rust code, which is @@ -48,9 +53,11 @@ pub fn generate_profile_data( let Files { cpuid_data_file, cpuid_data_license_file, + msr_data_file, + msr_data_license_file, } = create_files(profile_name)?; - generate_cpu_profile_data_with( + generate_cpuid_profile_data_with( hypervisor_type, cpu_vendor, &supported_cpuid_sorted, @@ -58,12 +65,39 @@ pub fn generate_profile_data( &KVM_CPUID_DEFINITIONS, cpuid_data_file, cpuid_data_license_file, + )?; + + let supported_feature_msrs = 
hypervisor.get_msr_based_features().context("CPU profile generation failed: Could not get the supported MSR-based features from the hypervisor")?; + let supported_msrs = hypervisor + .get_msr_index_list() + .context("CPU profile generation failed: Could not get MSR index list")? + .into_iter() + .collect(); + + generate_msr_profile_data_with( + MsrProfileDataParams { + hypervisor_type, + cpu_vendor, + processor_feature_msr_definitions: + &msr_definitions::intel::INTEL_MSR_FEATURE_DEFINITIONS, + supported_feature_msrs: &supported_feature_msrs, + supported_msrs, + permitted_architectural_msrs: &msr_definitions::intel::PERMITTED_IA32_MSRS[..], + permitted_hypervisor_msrs: &msr_definitions::kvm::PROFILE_PERMITTED_KVM_MSRS[..], + permitted_hyperv_msrs: &msr_definitions::hyperv::HYPERV_MSRS[..], + non_architectural_msrs: &msr_definitions::intel::NON_ARCHITECTURAL_INTEL_MSRS[..], + forbidden_architectural_msrs: &msr_definitions::intel::FORBIDDEN_IA32_MSR_RANGES[..], + }, + msr_data_file, + msr_data_license_file, ) } struct Files { cpuid_data_file: File, cpuid_data_license_file: File, + msr_data_file: File, + msr_data_license_file: File, } /// Create empty files with names derived from the name given to the CPU profile. /// The name will be lowercase and spaces are replaced with "-". 
@@ -79,40 +113,53 @@ fn create_files(profile_name: &str) -> anyhow::Result { name }; + let create_file = |path: PathBuf| { + File::create(path.clone()).with_context(|| { + format!( + "CPU profile generation failed: Could not create file:={}", + path.to_string_lossy() + ) + }) + }; + + let path_with_license = |mut path: PathBuf| { + path.as_mut_os_string().push(".license"); + path + }; + + let current_dir = std::env::current_dir() + .context("CPU profile generation failed: Unable to get the current working directory")?; + + let common_path = format!("arch/src/x86_64/cpu_profiles/{profile_file_name}"); + let cpuid_profile_file_name = { - let mut path = std::env::current_dir().context( - "CPU profile generation failed: Unable to get the current working directory", - )?; - path.push(format!( - "arch/src/x86_64/cpu_profiles/{profile_file_name}.cpuid.json" - )); + let mut path = current_dir.clone(); + path.push(format!("{common_path}.cpuid.json")); path }; - let cpuid_data_file = File::create(cpuid_profile_file_name.clone()).with_context(|| { - format!( - "CPU profile generation failed: Could not create file:={}", - cpuid_profile_file_name.to_string_lossy() - ) - })?; + let cpuid_data_file = create_file(cpuid_profile_file_name.clone())?; - let cpuid_data_license_file_path = { - let mut path = cpuid_profile_file_name.clone(); - path.as_mut_os_string().push(".license"); + let cpuid_data_license_file_path = path_with_license(cpuid_profile_file_name); + + let cpuid_data_license_file = create_file(cpuid_data_license_file_path)?; + + let msr_profile_file_name = { + let mut path = current_dir; + path.push(format!("{common_path}.msr.json")); path }; - let cpuid_data_license_file = - File::create(cpuid_data_license_file_path.clone()).with_context(|| { - format!( - "CPU profile generation failed: Could not create file:={}", - cpuid_data_license_file_path.to_string_lossy() - ) - })?; + let msr_data_file = create_file(msr_profile_file_name.clone())?; + + let 
msr_data_license_file_path = path_with_license(msr_profile_file_name); + let msr_data_license_file = create_file(msr_data_license_file_path)?; Ok(Files { cpuid_data_file, cpuid_data_license_file, + msr_data_file, + msr_data_license_file, }) } @@ -138,21 +185,21 @@ fn cpu_brand_string_bytes(cpu_vendor: CpuVendor, profile_name: &str) -> anyhow:: } Ok(brand_string_bytes) } -/// Computes [`CpuProfileData`] based on the given sorted vector of CPUID entries, hypervisor type, cpu_vendor +/// Computes [`CpuIdProfileData`] based on the given sorted vector of CPUID entries, hypervisor type, cpu_vendor /// and cpuid_definitions. /// -/// The computed [`CpuProfileData`] is then converted to a string representation, embeddable as Rust code, which is +/// The computed [`CpuIdProfileData`] is then converted to a string representation, embeddable as Rust code, which is /// then written by the given `writer`. /// // TODO: Consider making a snapshot test or two for this function. -fn generate_cpu_profile_data_with( +fn generate_cpuid_profile_data_with( hypervisor_type: HypervisorType, cpu_vendor: CpuVendor, supported_cpuid_sorted: &[CpuIdEntry], processor_cpuid_definitions: &CpuidDefinitions, hypervisor_cpuid_definitions: &CpuidDefinitions, mut cpuid_data_file: impl Write, - mut cpuid_license_file: impl Write, + cpuid_license_file: impl Write, ) -> anyhow::Result<()> { let mut adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)> = Vec::new(); @@ -180,7 +227,7 @@ fn generate_cpu_profile_data_with( // The profile should take whatever we get from the host, hence there is no adjustment, but our // mask needs to retain all bits in the range of bits corresponding to this value let (first_bit_pos, last_bit_pos) = value.bits_range; - mask |= bit_range_mask(first_bit_pos, last_bit_pos); + mask |= bit_range_mask::(first_bit_pos, last_bit_pos); } ProfilePolicy::Static(overwrite_value) => { replacements |= overwrite_value << value.bits_range.0; @@ -190,7 +237,8 @@ fn 
generate_cpu_profile_data_with( let (first_bit_pos, last_bit_pos) = value.bits_range; if let Some(matching_register_value) = maybe_matching_register_output_value { - let extraction_mask = bit_range_mask(first_bit_pos, last_bit_pos); + let extraction_mask = + bit_range_mask::(first_bit_pos, last_bit_pos); let value = matching_register_value & extraction_mask; replacements |= value; } @@ -219,22 +267,206 @@ fn generate_cpu_profile_data_with( cpuid_data_file .flush() .context("CPU profile generation failed: Unable to flush cpuid profile data")?; + write_license_file(cpuid_license_file, "CPUID") +} + +struct MsrProfileDataParams<'a, const N: usize> { + hypervisor_type: HypervisorType, + cpu_vendor: CpuVendor, + processor_feature_msr_definitions: &'a MsrDefinitions, + + /// MSR-based features supported by the hardware and hypervisor used to + /// generate this CPU profile. + supported_feature_msrs: &'a [MsrEntry], + /// MSRs supported by the hardware and hypervisor used to generate this + /// CPU profile. + supported_msrs: HashSet, + /// A list of all architectural MSRs that are permitted if they are also + /// contained in `supported_msrs`. + permitted_architectural_msrs: &'a [u32], + /// MSRs defined by the hypervisor that are permitted if they are supported + /// by the hardware and hypervisor used when generating this CPU profile + /// + /// We let CHV make the final decision at runtime whether they should be + /// available to guests (currently via CPUID) + permitted_hypervisor_msrs: &'a [u32], + /// Hyper-V related MSRs. + /// + /// NOTE: We can only know if these are truly permitted when the profile is + ///applied at runtime, hence we include them in the profile regardless and + ///let CHV remove them if necessary upon applying the CPU profile. + permitted_hyperv_msrs: &'a [u32], + /// A list of known non-architectural MSRs. This list is only used to help + /// us detect MSRs that we might not be aware of. 
+ non_architectural_msrs: &'a [u32], + /// A list of known ranges of architectural msrs, that should not be + /// permitted by any generated CPU profile. This list is only used to help + /// us detect MSRs that we might not be aware of. + forbidden_architectural_msrs: &'a [(u32, u32)], +} + +fn generate_msr_profile_data_with<'a, const N: usize>( + MsrProfileDataParams { + hypervisor_type, + cpu_vendor, + processor_feature_msr_definitions, + supported_feature_msrs, + supported_msrs, + permitted_architectural_msrs, + permitted_hypervisor_msrs, + permitted_hyperv_msrs, + non_architectural_msrs, + forbidden_architectural_msrs, + }: MsrProfileDataParams<'a, N>, + mut msr_data_file: impl Write, + msr_license_file: impl Write, +) -> anyhow::Result<()> { + const KVM_GET_NOT_SET_MSRS: [RegisterAddress; 6] = [ + RegisterAddress::IA32_VMX_PINBASED_CTLS, + RegisterAddress::IA32_VMX_PROCBASED_CTLS, + RegisterAddress::IA32_VMX_EXIT_CTLS, + RegisterAddress::IA32_VMX_ENTRY_CTLS, + RegisterAddress::IA32_VMX_CR0_FIXED1, + RegisterAddress::IA32_VMX_CR4_FIXED1, + ]; + let mut entries_encountered = 0; + let mut adjustments = Vec::new(); + let mut permitted_msrs = HashSet::new(); + 'table: for (reg_addr, definitions) in processor_feature_msr_definitions.as_slice() { + let Some(entry) = supported_feature_msrs + .iter() + .find(|e| e.index == reg_addr.0) + else { + continue; + }; + entries_encountered += 1; + + // NOTE: For now this tool only supports KVM, but we insert this check so we don't forget + // about (possible) KVM specific behavior. + if hypervisor_type == HypervisorType::Kvm && KVM_GET_NOT_SET_MSRS.contains(reg_addr) { + // In this case we do not want to record an update, but just that the MSR is permitted. + permitted_msrs.insert(reg_addr.0); + continue; + } + + let value = entry.data; + let mut replacements = 0; + let mut mask = 0; + let mut bits_accounted_for = 0; + for msr_definitions::ValueDefinition { + policy, + bits_range: (first_bit_pos, last_bit_pos), + .. 
+ } in definitions.as_slice().iter().copied() + { + let temp_mask = bit_range_mask::(first_bit_pos, last_bit_pos); + bits_accounted_for |= temp_mask; + match policy { + msr_definitions::ProfilePolicy::Deny => { + // This can only be applied to the entire MSR + assert_eq!(first_bit_pos, 0); + assert_eq!(last_bit_pos, 63); + continue 'table; + } + msr_definitions::ProfilePolicy::Inherit => { + replacements |= value & temp_mask; + } + msr_definitions::ProfilePolicy::Passthrough => { + mask |= temp_mask; + } + msr_definitions::ProfilePolicy::Static(overwrite_value) => { + replacements |= (overwrite_value) << (first_bit_pos); + } + } + } + // Reserved bit positions within an MSR value may get assigned meaning by hardware vendors in the future. + // For this reason we decide to have an "inherit" policy for these bits during profile generation. + let reserved_values = value & (!bits_accounted_for); + replacements |= reserved_values; + + permitted_msrs.insert(reg_addr.0); + adjustments.push((*reg_addr, FeatureMsrAdjustment { mask, replacements })); + } + + if entries_encountered != supported_feature_msrs.len() { + let unknown_register_address = supported_feature_msrs.iter().find(|entry| !processor_feature_msr_definitions.as_slice().iter().any(|(reg_addr, _)| reg_addr.0 == entry.index )).expect("We have checked that there should be at least one unknown supported MSR-based feature").index; + Err(anyhow!( + "CPU profile generation failed: The hardware and hypervisor supports MSR-based feature with register address:={unknown_register_address:#x}, but the CPU profile generation tool does not know what to do with this MSR. Please update the appropriate `MsrDefinitions` and try again." 
+ ))?; + } + + for msr in permitted_architectural_msrs + .iter() + .chain(permitted_hypervisor_msrs) + .chain(permitted_hyperv_msrs) + { + if supported_msrs.contains(msr) { + let _ = permitted_msrs.insert(*msr); + } + } + + // Also check to see if there are any MSRs on the system that we are not aware of. In that case + // it might be a sign that this tool needs to update its definitions! + for msr in supported_msrs.difference(&permitted_msrs) { + let is_proc_feat_msr = processor_feature_msr_definitions + .as_slice() + .iter() + .any(|(reg_addr, _)| reg_addr.0 == *msr); + + let is_architectural_msr = forbidden_architectural_msrs + .iter() + .any(|r| (r.0..=r.1).contains(msr)); + + let is_non_architectural_msr = non_architectural_msrs.contains(msr); + + if is_proc_feat_msr || is_architectural_msr || is_non_architectural_msr { + continue; + } + + // TODO: Make this a hard error before upstreaming + warn!( + "Encountered unknown MSR:={:#x} when generating CPU profile. This CPU profile generation tool might not be up-to-date", + *msr + ); + } + + let permitted_msrs: Vec = { + let mut permitted_msrs: Vec = permitted_msrs.into_iter().collect(); + permitted_msrs.sort(); + permitted_msrs.into_iter().map(RegisterAddress).collect() + }; + + let msr_profile_data = MsrProfileData { + hypervisor_type, + cpu_vendor, + adjustments, + permitted_msrs, + }; + + serde_json::to_writer_pretty(&mut msr_data_file, &msr_profile_data) + .context("Cpu profile generation failed: Could not serialize the generated MSR profile data to the given writer")?; + msr_data_file + .flush() + .context("CPU profile generation failed: Unable to flush MSR profile data")?; + write_license_file(msr_license_file, "MSR") +} + +fn write_license_file(mut license_file: impl Write, data_type: &str) -> anyhow::Result<()> { let license_text = { r#"SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH SPDX-License-Identifier: Apache-2.0 "# }; - cpuid_license_file + license_file .write_all(license_text.as_bytes()) - 
.context( - "CPU profile generation failed: Unable to write to cpuid profile data license file", - )?; - cpuid_license_file - .flush() - .context("CPU profile generation failed: Unable to flush cpuid profile data license file") + .with_context(|| { + format!("CPU profile generation failed: Unable to write to {data_type} profile data license file") + })?; + license_file.flush().context(format!( + "CPU profile generation failed: Unable to flush {data_type} profile data license file" + )) } - /// Get as many of the supported CPUID entries from the hypervisor as possible. fn supported_cpuid(hypervisor: &dyn Hypervisor) -> anyhow::Result> { // Check for AMX compatibility. If this is supported we need to call arch_prctl before requesting the supported @@ -302,11 +534,16 @@ fn sort_entries(mut cpuid: Vec) -> Vec { }); cpuid } -/// Returns a `u32` where each bit between `first_bit_pos` and `last_bit_pos` is set (including both ends) and all other bits are 0. -fn bit_range_mask(first_bit_pos: u8, last_bit_pos: u8) -> u32 { - (first_bit_pos..=last_bit_pos).fold(0, |acc, next| acc | (1 << next)) -} +/// Returns a numeric value where each bit between `first_bit_pos` and `last_bit_pos` is set (including both ends) and all other bits are 0. +fn bit_range_mask(first_bit_pos: u8, last_bit_pos: u8) -> T +where + T: Shl, + T: BitOr, + T: From, +{ + (first_bit_pos..=last_bit_pos).fold(T::from(0_u8), |acc, next| acc | ((T::from(1_u8)) << next)) +} /// Returns a vector of exact parameter matches ((sub_leaf ..= sub_leaf), register_value) interleaved by /// the sub_leaf ranges specified by `param` that did not match any cpuid entry. fn extract_parameter_matches( From 1b33c3c06a4e674202963e30a23e64e56c82fc38 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Thu, 29 Jan 2026 11:00:33 +0100 Subject: [PATCH 160/178] arch: Use MSR aware profiles We regenerate the CPU profiles and include the MSR-related data. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpu_profile.rs | 22 +- ...kylake.json => sapphire-rapids.cpuid.json} | 520 +++++++++++++++--- ...nse => sapphire-rapids.cpuid.json.license} | 2 +- .../cpu_profiles/sapphire-rapids.msr.json | 177 ++++++ ...cense => sapphire-rapids.msr.json.license} | 2 +- ...apphire-rapids.json => skylake.cpuid.json} | 388 +++++++------ .../cpu_profiles/skylake.cpuid.json.license | 3 + arch/src/x86_64/cpu_profiles/skylake.msr.json | 175 ++++++ .../cpu_profiles/skylake.msr.json.license | 3 + 9 files changed, 1029 insertions(+), 263 deletions(-) rename arch/src/x86_64/cpu_profiles/{skylake.json => sapphire-rapids.cpuid.json} (87%) rename arch/src/x86_64/cpu_profiles/{sapphire-rapids.json.license => sapphire-rapids.cpuid.json.license} (59%) create mode 100644 arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json rename arch/src/x86_64/cpu_profiles/{skylake.json.license => sapphire-rapids.msr.json.license} (59%) rename arch/src/x86_64/cpu_profiles/{sapphire-rapids.json => skylake.cpuid.json} (94%) create mode 100644 arch/src/x86_64/cpu_profiles/skylake.cpuid.json.license create mode 100644 arch/src/x86_64/cpu_profiles/skylake.msr.json create mode 100644 arch/src/x86_64/cpu_profiles/skylake.msr.json.license diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs index 0f6befbca3..de9fcc908d 100644 --- a/arch/src/x86_64/cpu_profile.rs +++ b/arch/src/x86_64/cpu_profile.rs @@ -44,14 +44,14 @@ impl CpuProfile { let mut data: CpuIdProfileData = match self { Self::Host => None, Self::Skylake => Some( - serde_json::from_slice(include_bytes!("cpu_profiles/skylake.json")) + serde_json::from_slice(include_bytes!("cpu_profiles/skylake.cpuid.json")) .inspect_err(|e| { error!("BUG: could not deserialize CPU profile. 
Got error: {e:?}"); }) .expect("should be able to deserialize pre-generated data"), ), Self::SapphireRapids => Some( - serde_json::from_slice(include_bytes!("cpu_profiles/sapphire-rapids.json")) + serde_json::from_slice(include_bytes!("cpu_profiles/sapphire-rapids.cpuid.json")) .inspect_err(|e| { error!("BUG: could not deserialize CPU profile. Got error: {e:?}"); }) @@ -97,7 +97,23 @@ impl CpuProfile { /// Loads pre-generated MSR data associated with a CPU profile. #[cfg(feature = "kvm")] pub(in crate::x86_64) fn msr_data(&self) -> Option { - todo!() + match self { + Self::Host => None, + Self::Skylake => Some( + serde_json::from_slice(include_bytes!("cpu_profiles/skylake.msr.json")) + .inspect_err(|e| { + error!("BUG: could not deserialize CPU profile. Got error: {e:?}"); + }) + .expect("should be able to deserialize pre-generated data"), + ), + Self::SapphireRapids => Some( + serde_json::from_slice(include_bytes!("cpu_profiles/sapphire-rapids.msr.json")) + .inspect_err(|e| { + error!("BUG: could not deserialize CPU profile. 
Got error: {e:?}"); + }) + .expect("should be able to deserialize pre-generated data"), + ), + } } #[cfg(not(feature = "kvm"))] diff --git a/arch/src/x86_64/cpu_profiles/skylake.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json similarity index 87% rename from arch/src/x86_64/cpu_profiles/skylake.json rename to arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json index ffd77fd00f..5c857f2eee 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -12,7 +12,7 @@ "register": "EAX" }, { - "replacements": "0x00000016", + "replacements": "0x00000020", "mask": "0x00000000" } ], @@ -68,7 +68,7 @@ "register": "EAX" }, { - "replacements": "0x00050654", + "replacements": "0x000806f8", "mask": "0x00000000" } ], @@ -97,7 +97,7 @@ }, { "replacements": "0x76fa3223", - "mask": "0x88000000" + "mask": "0x89000000" } ], [ @@ -110,7 +110,7 @@ "register": "EDX" }, { - "replacements": "0x078bfbff", + "replacements": "0x078bbbff", "mask": "0x08000000" } ], @@ -628,7 +628,7 @@ "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00000002", "mask": "0x00000000" } ], @@ -642,7 +642,7 @@ "register": "EBX" }, { - "replacements": "0xd19f07ab", + "replacements": "0xf1bf07ab", "mask": "0x00002040" } ], @@ -656,8 +656,8 @@ "register": "ECX" }, { - "replacements": "0x0000000c", - "mask": "0x00000010" + "replacements": "0x1b415f46", + "mask": "0x00000000" } ], [ @@ -670,8 +670,8 @@ "register": "EDX" }, { - "replacements": "0xa4000000", - "mask": "0x18000400" + "replacements": "0xbfc04410", + "mask": "0x00000000" } ], [ @@ -684,7 +684,7 @@ "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00001c30", "mask": "0x00000000" } ], @@ -740,7 +740,7 @@ "register": "EDX" }, { - "replacements": "0x00000000", + "replacements": "0x00000017", "mask": "0x00000000" } ], @@ -936,7 +936,7 @@ "register": "EAX" }, { - "replacements": "0x000002e7", + "replacements": "0x000602e7", "mask": 
"0x00000000" } ], @@ -992,7 +992,7 @@ "register": "EAX" }, { - "replacements": "0x0000000f", + "replacements": "0x0000001f", "mask": "0x00000000" } ], @@ -1085,7 +1085,7 @@ "leaf": "0xd", "sub_leaf": { "start": "0x3", - "end": "0x3" + "end": "0x4" }, "register": "EAX" }, @@ -1098,10 +1098,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x4", + "start": "0x3", "end": "0x4" }, - "register": "EAX" + "register": "EBX" }, { "replacements": "0x00000000", @@ -1113,9 +1113,9 @@ "leaf": "0xd", "sub_leaf": { "start": "0x3", - "end": "0x3" + "end": "0x4" }, - "register": "EBX" + "register": "ECX" }, { "replacements": "0x00000000", @@ -1126,10 +1126,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x4", + "start": "0x3", "end": "0x4" }, - "register": "EBX" + "register": "EDX" }, { "replacements": "0x00000000", @@ -1140,13 +1140,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x3", - "end": "0x3" + "start": "0x5", + "end": "0x5" }, - "register": "ECX" + "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00000040", "mask": "0x00000000" } ], @@ -1154,13 +1154,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x4", - "end": "0x4" + "start": "0x6", + "end": "0x6" }, - "register": "ECX" + "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00000200", "mask": "0x00000000" } ], @@ -1168,10 +1168,24 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x3", - "end": "0x3" + "start": "0x7", + "end": "0x7" }, - "register": "EDX" + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EAX" }, { "replacements": "0x00000000", @@ -1182,10 +1196,24 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x4", - "end": "0x4" + "start": "0x9", + "end": "0x9" }, - "register": "EDX" + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + 
"end": "0xa" + }, + "register": "EAX" }, { "replacements": "0x00000000", @@ -1199,10 +1227,10 @@ "start": "0x5", "end": "0x5" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000040", + "replacements": "0x00000440", "mask": "0x00000000" } ], @@ -1213,10 +1241,10 @@ "start": "0x6", "end": "0x6" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000200", + "replacements": "0x00000480", "mask": "0x00000000" } ], @@ -1227,10 +1255,10 @@ "start": "0x7", "end": "0x7" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000400", + "replacements": "0x00000680", "mask": "0x00000000" } ], @@ -1241,7 +1269,7 @@ "start": "0x8", "end": "0x8" }, - "register": "EAX" + "register": "EBX" }, { "replacements": "0x00000000", @@ -1255,10 +1283,10 @@ "start": "0x9", "end": "0x9" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000008", + "replacements": "0x00000a80", "mask": "0x00000000" } ], @@ -1267,9 +1295,9 @@ "leaf": "0xd", "sub_leaf": { "start": "0xa", - "end": "0x3f" + "end": "0xa" }, - "register": "EAX" + "register": "EBX" }, { "replacements": "0x00000000", @@ -1283,10 +1311,10 @@ "start": "0x5", "end": "0x5" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000440", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1297,10 +1325,10 @@ "start": "0x6", "end": "0x6" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000480", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1311,10 +1339,10 @@ "start": "0x7", "end": "0x7" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000680", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1325,7 +1353,7 @@ "start": "0x8", "end": "0x8" }, - "register": "EBX" + "register": "ECX" }, { "replacements": "0x00000000", @@ -1339,10 +1367,10 @@ "start": "0x9", "end": "0x9" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000a80", + "replacements": 
"0x00000000", "mask": "0x00000000" } ], @@ -1351,7 +1379,35 @@ "leaf": "0xd", "sub_leaf": { "start": "0xa", - "end": "0x3f" + "end": "0xa" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xc" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xb", + "end": "0xc" }, "register": "EBX" }, @@ -1364,8 +1420,8 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x5", - "end": "0x5" + "start": "0xb", + "end": "0xc" }, "register": "ECX" }, @@ -1378,10 +1434,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x6", - "end": "0x6" + "start": "0xb", + "end": "0xc" }, - "register": "ECX" + "register": "EDX" }, { "replacements": "0x00000000", @@ -1392,8 +1448,36 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x7", - "end": "0x7" + "start": "0xd", + "end": "0xd" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" }, "register": "ECX" }, @@ -1406,8 +1490,36 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x8", - "end": "0x8" + "start": "0xe", + "end": "0xe" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xe", + "end": "0xe" }, "register": "ECX" }, @@ -1420,8 +1532,134 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0xe", + "end": "0xe" + }, + "register": "EDX" + }, + { + "replacements": 
"0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0x10" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x11" + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x12", + "end": "0x12" + }, + "register": "EAX" + }, + { + "replacements": "0x00002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x13", + "end": "0x3f" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0x10" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x11" + }, + "register": "EBX" + }, + { + "replacements": "0x00000ac0", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x12", + "end": "0x12" + }, + "register": "EBX" + }, + { + "replacements": "0x00000b00", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x13", + "end": "0x3f" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0x10" }, "register": "ECX" }, @@ -1434,7 +1672,35 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", + "start": "0x11", + "end": "0x11" + }, + "register": "ECX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x12", + "end": "0x12" + }, + "register": "ECX" + }, + { + "replacements": "0x00000006", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x13", "end": "0x3f" }, 
"register": "ECX" @@ -1925,6 +2191,20 @@ "leaf": "0x18", "sub_leaf": { "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x1", "end": "0xffffffff" }, "register": "EBX" @@ -1939,6 +2219,20 @@ "leaf": "0x18", "sub_leaf": { "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x1", "end": "0xffffffff" }, "register": "ECX" @@ -1953,6 +2247,20 @@ "leaf": "0x18", "sub_leaf": { "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x18", + "sub_leaf": { + "start": "0x1", "end": "0xffffffff" }, "register": "EDX" @@ -2014,7 +2322,7 @@ "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00000001", "mask": "0x00000000" } ], @@ -2028,7 +2336,7 @@ "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x04002000", "mask": "0x00000000" } ], @@ -2042,7 +2350,7 @@ "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00080040", "mask": "0x00000000" } ], @@ -2056,7 +2364,7 @@ "register": "ECX" }, { - "replacements": "0x00000000", + "replacements": "0x00000010", "mask": "0x00000000" } ], @@ -2084,7 +2392,7 @@ "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00004010", "mask": "0x00000000" } ], @@ -2107,6 +2415,20 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", + "end": "0x0" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", "end": "0xffffffff" }, "register": "EAX" @@ -2121,6 +2443,20 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", + "end": "0x0" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + 
} + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", "end": "0xffffffff" }, "register": "EBX" @@ -2135,6 +2471,20 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", + "end": "0x0" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", "end": "0xffffffff" }, "register": "ECX" @@ -2149,6 +2499,20 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", + "end": "0x0" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x1f", + "sub_leaf": { + "start": "0x1", "end": "0xffffffff" }, "register": "EDX" @@ -2546,7 +2910,7 @@ "register": "EBX" }, { - "replacements": "0x6b53206c", + "replacements": "0x6153206c", "mask": "0x00000000" } ], @@ -2560,7 +2924,7 @@ "register": "ECX" }, { - "replacements": "0x6b616c79", + "replacements": "0x69687070", "mask": "0x00000000" } ], @@ -2574,7 +2938,7 @@ "register": "EDX" }, { - "replacements": "0x00000065", + "replacements": "0x52206572", "mask": "0x00000000" } ], @@ -2588,7 +2952,7 @@ "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x64697061", "mask": "0x00000000" } ], @@ -2602,7 +2966,7 @@ "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00000073", "mask": "0x00000000" } ], diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json.license b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json.license similarity index 59% rename from arch/src/x86_64/cpu_profiles/sapphire-rapids.json.license rename to arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json.license index 7f7e3b5e1b..579657c531 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json.license +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json.license @@ -1,3 +1,3 @@ SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH -SPDX-License-Identifier: Apache-2.0 +SPDX-License-Identifier: Apache-2.0 diff --git 
a/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json new file mode 100644 index 0000000000..0798ec9a43 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json @@ -0,0 +1,177 @@ +{ + "cpu_vendor": "Intel", + "hypervisor_type": "Kvm", + "adjustments": [ + [ + "0x8b", + { + "mask": "0xffffffff00000000", + "replacements": "0x0000000000000000" + } + ], + [ + "0x10a", + { + "mask": "0x4000000000000000", + "replacements": "0x000000000c08e06b" + } + ], + [ + "0x480", + { + "mask": "0x0000000000000000", + "replacements": "0x00d8100011e57ed0" + } + ], + [ + "0x485", + { + "mask": "0x000000000000001f", + "replacements": "0x0000000020000060" + } + ], + [ + "0x486", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000080000021" + } + ], + [ + "0x488", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000002000" + } + ], + [ + "0x48a", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000000032" + } + ], + [ + "0x48b", + { + "mask": "0x0000000000000000", + "replacements": "0x06137bff00000000" + } + ], + [ + "0x48c", + { + "mask": "0x0000000000000000", + "replacements": "0x00000f01063340c1" + } + ], + [ + "0x48d", + { + "mask": "0x0000000000000000", + "replacements": "0x000000ff00000016" + } + ], + [ + "0x48e", + { + "mask": "0x0000000000000000", + "replacements": "0xfff9fffe04006172" + } + ], + [ + "0x48f", + { + "mask": "0x0000000000000000", + "replacements": "0x007fefff00036dfb" + } + ], + [ + "0x490", + { + "mask": "0x0000000000000000", + "replacements": "0x0000d3ff000011fb" + } + ], + [ + "0x491", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000000001" + } + ] + ], + "permitted_msrs": [ + "0x10", + "0x11", + "0x12", + "0x3a", + "0x3b", + "0x48", + "0x8b", + "0x10a", + "0x174", + "0x175", + "0x176", + "0x1a0", + "0x1c4", + "0x1c5", + "0x277", + "0x480", + "0x481", + "0x482", + "0x483", + "0x484", + "0x485", + "0x486", + "0x487", + "0x488", + 
"0x489", + "0x48a", + "0x48b", + "0x48c", + "0x48d", + "0x48e", + "0x48f", + "0x490", + "0x491", + "0x6e0", + "0x40000000", + "0x40000001", + "0x40000002", + "0x40000003", + "0x40000010", + "0x40000020", + "0x40000021", + "0x40000022", + "0x40000023", + "0x40000073", + "0x40000080", + "0x400000b0", + "0x400000f1", + "0x400000f2", + "0x400000f3", + "0x400000f4", + "0x400000f5", + "0x40000100", + "0x40000101", + "0x40000102", + "0x40000103", + "0x40000104", + "0x40000105", + "0x4b564d00", + "0x4b564d01", + "0x4b564d02", + "0x4b564d03", + "0x4b564d04", + "0x4b564d05", + "0x4b564d06", + "0x4b564d07", + "0xc0000081", + "0xc0000082", + "0xc0000083", + "0xc0000084", + "0xc0000102", + "0xc0000103" + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.json.license b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json.license similarity index 59% rename from arch/src/x86_64/cpu_profiles/skylake.json.license rename to arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json.license index 7f7e3b5e1b..579657c531 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.json.license +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json.license @@ -1,3 +1,3 @@ SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH -SPDX-License-Identifier: Apache-2.0 +SPDX-License-Identifier: Apache-2.0 diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json similarity index 94% rename from arch/src/x86_64/cpu_profiles/sapphire-rapids.json rename to arch/src/x86_64/cpu_profiles/skylake.cpuid.json index ecbf7bc28c..3cdd9966e1 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -12,7 +12,7 @@ "register": "EAX" }, { - "replacements": "0x00000020", + "replacements": "0x00000016", "mask": "0x00000000" } ], @@ -68,7 +68,7 @@ "register": "EAX" }, { - "replacements": "0x000806f8", + "replacements": "0x00050654", "mask": "0x00000000" } ], @@ -97,7 +97,7 @@ 
}, { "replacements": "0x76fa3223", - "mask": "0x88000000" + "mask": "0x89000000" } ], [ @@ -110,7 +110,7 @@ "register": "EDX" }, { - "replacements": "0x078bfbff", + "replacements": "0x078bbbff", "mask": "0x08000000" } ], @@ -628,7 +628,7 @@ "register": "EAX" }, { - "replacements": "0x00000002", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -642,7 +642,7 @@ "register": "EBX" }, { - "replacements": "0xf1bf07ab", + "replacements": "0xd19f07ab", "mask": "0x00002040" } ], @@ -656,8 +656,8 @@ "register": "ECX" }, { - "replacements": "0x1b415f6e", - "mask": "0x00000010" + "replacements": "0x00000004", + "mask": "0x00000000" } ], [ @@ -670,8 +670,8 @@ "register": "EDX" }, { - "replacements": "0xa7c04010", - "mask": "0x18000400" + "replacements": "0xbc000400", + "mask": "0x00000000" } ], [ @@ -684,7 +684,7 @@ "register": "EAX" }, { - "replacements": "0x00001c30", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -740,7 +740,7 @@ "register": "EDX" }, { - "replacements": "0x00000017", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -936,7 +936,7 @@ "register": "EAX" }, { - "replacements": "0x000602e7", + "replacements": "0x000002e7", "mask": "0x00000000" } ], @@ -992,7 +992,7 @@ "register": "EAX" }, { - "replacements": "0x0000001f", + "replacements": "0x0000000f", "mask": "0x00000000" } ], @@ -1085,6 +1085,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0x3", + "end": "0x3" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", "end": "0x4" }, "register": "EAX" @@ -1099,6 +1113,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0x3", + "end": "0x3" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", "end": "0x4" }, "register": "EBX" @@ -1113,6 +1141,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0x3", + "end": "0x3" + }, + "register": "ECX" + }, 
+ { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", "end": "0x4" }, "register": "ECX" @@ -1127,6 +1169,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0x3", + "end": "0x3" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x4", "end": "0x4" }, "register": "EDX" @@ -1211,7 +1267,7 @@ "leaf": "0xd", "sub_leaf": { "start": "0xa", - "end": "0x10" + "end": "0xa" }, "register": "EAX" }, @@ -1224,13 +1280,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x11", - "end": "0x11" + "start": "0x5", + "end": "0x5" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000040", + "replacements": "0x00000440", "mask": "0x00000000" } ], @@ -1238,13 +1294,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x12", - "end": "0x12" + "start": "0x6", + "end": "0x6" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00002000", + "replacements": "0x00000480", "mask": "0x00000000" } ], @@ -1252,10 +1308,52 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x13", - "end": "0x3f" + "start": "0x7", + "end": "0x7" }, - "register": "EAX" + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x8", + "end": "0x8" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x9", + "end": "0x9" + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EBX" }, { "replacements": "0x00000000", @@ -1269,10 +1367,10 @@ "start": "0x5", "end": "0x5" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000440", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1283,10 
+1381,10 @@ "start": "0x6", "end": "0x6" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000480", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1297,10 +1395,10 @@ "start": "0x7", "end": "0x7" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000680", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1311,7 +1409,7 @@ "start": "0x8", "end": "0x8" }, - "register": "EBX" + "register": "ECX" }, { "replacements": "0x00000000", @@ -1325,10 +1423,10 @@ "start": "0x9", "end": "0x9" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000a80", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1337,9 +1435,9 @@ "leaf": "0xd", "sub_leaf": { "start": "0xa", - "end": "0x10" + "end": "0xa" }, - "register": "EBX" + "register": "ECX" }, { "replacements": "0x00000000", @@ -1350,13 +1448,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x11", - "end": "0x11" + "start": "0xb", + "end": "0xc" }, - "register": "EBX" + "register": "EAX" }, { - "replacements": "0x00000ac0", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1364,13 +1462,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x12", - "end": "0x12" + "start": "0xb", + "end": "0xc" }, "register": "EBX" }, { - "replacements": "0x00000b00", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1378,10 +1476,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x13", - "end": "0x3f" + "start": "0xb", + "end": "0xc" }, - "register": "EBX" + "register": "ECX" }, { "replacements": "0x00000000", @@ -1392,10 +1490,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x5", - "end": "0x5" + "start": "0xb", + "end": "0xc" }, - "register": "ECX" + "register": "EDX" }, { "replacements": "0x00000000", @@ -1406,10 +1504,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x6", - "end": "0x6" + "start": "0xd", + "end": "0xd" }, - "register": "ECX" + "register": "EAX" }, { "replacements": "0x00000000", @@ -1420,10 +1518,10 @@ { "leaf": 
"0xd", "sub_leaf": { - "start": "0x7", - "end": "0x7" + "start": "0xd", + "end": "0xd" }, - "register": "ECX" + "register": "EBX" }, { "replacements": "0x00000000", @@ -1434,8 +1532,8 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x8", - "end": "0x8" + "start": "0xd", + "end": "0xd" }, "register": "ECX" }, @@ -1448,10 +1546,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0xe", + "end": "0xe" }, - "register": "ECX" + "register": "EAX" }, { "replacements": "0x00000000", @@ -1462,10 +1560,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", - "end": "0x10" + "start": "0xe", + "end": "0xe" }, - "register": "ECX" + "register": "EBX" }, { "replacements": "0x00000000", @@ -1476,13 +1574,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x11", - "end": "0x11" + "start": "0xe", + "end": "0xe" }, "register": "ECX" }, { - "replacements": "0x00000002", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1490,13 +1588,41 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x12", - "end": "0x12" + "start": "0xe", + "end": "0xe" }, - "register": "ECX" + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0x3f" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0x3f" + }, + "register": "EBX" }, { - "replacements": "0x00000006", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1504,7 +1630,7 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x13", + "start": "0xf", "end": "0x3f" }, "register": "ECX" @@ -1995,20 +2121,6 @@ "leaf": "0x18", "sub_leaf": { "start": "0x0", - "end": "0x0" - }, - "register": "EBX" - }, - { - "replacements": "0x00000000", - "mask": "0xffff070f" - } - ], - [ - { - "leaf": "0x18", - "sub_leaf": { - "start": "0x1", "end": "0xffffffff" }, "register": "EBX" @@ -2023,20 +2135,6 @@ "leaf": "0x18", 
"sub_leaf": { "start": "0x0", - "end": "0x0" - }, - "register": "ECX" - }, - { - "replacements": "0x00000000", - "mask": "0xffffffff" - } - ], - [ - { - "leaf": "0x18", - "sub_leaf": { - "start": "0x1", "end": "0xffffffff" }, "register": "ECX" @@ -2051,20 +2149,6 @@ "leaf": "0x18", "sub_leaf": { "start": "0x0", - "end": "0x0" - }, - "register": "EDX" - }, - { - "replacements": "0x00000000", - "mask": "0x03ffc1ff" - } - ], - [ - { - "leaf": "0x18", - "sub_leaf": { - "start": "0x1", "end": "0xffffffff" }, "register": "EDX" @@ -2126,7 +2210,7 @@ "register": "EAX" }, { - "replacements": "0x00000001", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -2140,7 +2224,7 @@ "register": "EAX" }, { - "replacements": "0x04002000", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -2154,7 +2238,7 @@ "register": "EBX" }, { - "replacements": "0x00080040", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -2168,7 +2252,7 @@ "register": "ECX" }, { - "replacements": "0x00000010", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -2196,7 +2280,7 @@ "register": "EBX" }, { - "replacements": "0x00004010", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -2219,20 +2303,6 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", - "end": "0x0" - }, - "register": "EAX" - }, - { - "replacements": "0x00000000", - "mask": "0x0000001f" - } - ], - [ - { - "leaf": "0x1f", - "sub_leaf": { - "start": "0x1", "end": "0xffffffff" }, "register": "EAX" @@ -2247,20 +2317,6 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", - "end": "0x0" - }, - "register": "EBX" - }, - { - "replacements": "0x00000000", - "mask": "0x0000ffff" - } - ], - [ - { - "leaf": "0x1f", - "sub_leaf": { - "start": "0x1", "end": "0xffffffff" }, "register": "EBX" @@ -2275,20 +2331,6 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", - "end": "0x0" - }, - "register": "ECX" - }, - { - "replacements": "0x00000000", - "mask": "0x0000ffff" - } - ], - [ - { - "leaf": "0x1f", - "sub_leaf": { - 
"start": "0x1", "end": "0xffffffff" }, "register": "ECX" @@ -2303,20 +2345,6 @@ "leaf": "0x1f", "sub_leaf": { "start": "0x0", - "end": "0x0" - }, - "register": "EDX" - }, - { - "replacements": "0x00000000", - "mask": "0xffffffff" - } - ], - [ - { - "leaf": "0x1f", - "sub_leaf": { - "start": "0x1", "end": "0xffffffff" }, "register": "EDX" @@ -2714,7 +2742,7 @@ "register": "EBX" }, { - "replacements": "0x6153206c", + "replacements": "0x6b53206c", "mask": "0x00000000" } ], @@ -2728,7 +2756,7 @@ "register": "ECX" }, { - "replacements": "0x69687070", + "replacements": "0x6b616c79", "mask": "0x00000000" } ], @@ -2742,7 +2770,7 @@ "register": "EDX" }, { - "replacements": "0x52206572", + "replacements": "0x00000065", "mask": "0x00000000" } ], @@ -2756,7 +2784,7 @@ "register": "EAX" }, { - "replacements": "0x64697061", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -2770,7 +2798,7 @@ "register": "EBX" }, { - "replacements": "0x00000073", + "replacements": "0x00000000", "mask": "0x00000000" } ], diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json.license b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json.license new file mode 100644 index 0000000000..579657c531 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 diff --git a/arch/src/x86_64/cpu_profiles/skylake.msr.json b/arch/src/x86_64/cpu_profiles/skylake.msr.json new file mode 100644 index 0000000000..d561dffc92 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.msr.json @@ -0,0 +1,175 @@ +{ + "cpu_vendor": "Intel", + "hypervisor_type": "Kvm", + "adjustments": [ + [ + "0x8b", + { + "mask": "0xffffffff00000000", + "replacements": "0x0000000000000000" + } + ], + [ + "0x10a", + { + "mask": "0x4000000000000000", + "replacements": "0x000000000c00004c" + } + ], + [ + "0x480", + { + "mask": "0x0000000000000000", + "replacements": "0x00d8100011e57ed0" + } + ], + [ + 
"0x485", + { + "mask": "0x000000000000001f", + "replacements": "0x0000000020000060" + } + ], + [ + "0x486", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000080000021" + } + ], + [ + "0x488", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000002000" + } + ], + [ + "0x48a", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000000032" + } + ], + [ + "0x48b", + { + "mask": "0x0000000000000000", + "replacements": "0x02137bff00000000" + } + ], + [ + "0x48c", + { + "mask": "0x0000000000000000", + "replacements": "0x00000f0106334041" + } + ], + [ + "0x48d", + { + "mask": "0x0000000000000000", + "replacements": "0x000000ff00000016" + } + ], + [ + "0x48e", + { + "mask": "0x0000000000000000", + "replacements": "0xfff9fffe04006172" + } + ], + [ + "0x48f", + { + "mask": "0x0000000000000000", + "replacements": "0x007fefff00036dfb" + } + ], + [ + "0x490", + { + "mask": "0x0000000000000000", + "replacements": "0x0000d3ff000011fb" + } + ], + [ + "0x491", + { + "mask": "0x0000000000000000", + "replacements": "0x0000000000000001" + } + ] + ], + "permitted_msrs": [ + "0x10", + "0x11", + "0x12", + "0x3a", + "0x3b", + "0x48", + "0x8b", + "0x10a", + "0x174", + "0x175", + "0x176", + "0x1a0", + "0x277", + "0x480", + "0x481", + "0x482", + "0x483", + "0x484", + "0x485", + "0x486", + "0x487", + "0x488", + "0x489", + "0x48a", + "0x48b", + "0x48c", + "0x48d", + "0x48e", + "0x48f", + "0x490", + "0x491", + "0x6e0", + "0x40000000", + "0x40000001", + "0x40000002", + "0x40000003", + "0x40000010", + "0x40000020", + "0x40000021", + "0x40000022", + "0x40000023", + "0x40000073", + "0x40000080", + "0x400000b0", + "0x400000f1", + "0x400000f2", + "0x400000f3", + "0x400000f4", + "0x400000f5", + "0x40000100", + "0x40000101", + "0x40000102", + "0x40000103", + "0x40000104", + "0x40000105", + "0x4b564d00", + "0x4b564d01", + "0x4b564d02", + "0x4b564d03", + "0x4b564d04", + "0x4b564d05", + "0x4b564d06", + "0x4b564d07", + "0xc0000081", + "0xc0000082", + "0xc0000083", + 
"0xc0000084", + "0xc0000102", + "0xc0000103" + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.msr.json.license b/arch/src/x86_64/cpu_profiles/skylake.msr.json.license new file mode 100644 index 0000000000..579657c531 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.msr.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2025 Cyberus Technology GmbH + +SPDX-License-Identifier: Apache-2.0 From 043bd6a39204fc52a9dd91052acd28c09f70de02 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 17 Mar 2026 16:28:40 +0100 Subject: [PATCH 161/178] arch: Inherit policy for MCA Windows server needs the machine check architecture (MCA) CPUID bit to be set in order to boot. Since Windows server is a use-case we want to support we need to revert our previous decision to disable MCA for non-host CPU profiles. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/cpuid_definitions/intel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 18cb6c8c01..ee944bbf9d 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -469,7 +469,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { short: "mca", description: "Machine Check Architecture", bits_range: (14, 14), - policy: ProfilePolicy::Static(0), + policy: ProfilePolicy::Inherit, }, ValueDefinition { short: "cmov", From 83619ba63d7e9e8145dfd496f392935e7ab0090e Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 17 Mar 2026 17:04:57 +0100 Subject: [PATCH 162/178] arch: Permit IA32_MCG_CAP and IA32_MCG_STATUS We permit these MSRs because they are expected to be available when the CPUID 0x1.EDX[14](MCA) feature bit is set. Recall that MCA is necessary in order to boot Windows Server which we want to support. 
We also do not list the error reporting banks as forbidden any longer. Aside: The previous implementation did not end up denying those MSRs anyway, because KVM does not report them via KVM_GET_MSR_INDEX_LIST. Now with MCA explicitly set, the guest will certainly expect the presence of error reporting banks, so we make sure not to indicate otherwise. Recall that by default KVM reports all (32) error banks as available and leaves all feature bits of IA32_MCG_CAP unset, hence the information displayed to the guest in these MSRs will remain consistent before and after a live migration in the absence of machine check errors. Note that as of today Cloud Hypervisor does not transfer the error reporting banks to the destination of a live migration, which can indeed lead to surprises, but on the other hand the information is likely to be inaccurate at the point of resume anyway. As a follow-up, we could try to mitigate the aforementioned problem by checking for MCEs during live migration and marking the migration as failed if any MCE occurred before or during the live migration. That should, however, be addressed in a separate PR. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../intel/architectural_msrs.rs | 566 +++++++++--------- 1 file changed, 282 insertions(+), 284 deletions(-) diff --git a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs index 0585c75296..9a217a3845 100644 --- a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs +++ b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs @@ -51,6 +51,8 @@ mod permitted_architectural_msrs { const IA32_FZM_RANGE_STARTADDR: u32 = 0x84; const IA32_FZM_RANGE_ENDADDR: u32 = 0x85; const IA32_FZM_RANGE_WRITESTATUS: u32 = 0x86; + // NOTE: This is permitted, but will be zeroed out for all non-host CPU profiles. 
+ const IA32_MCG_CAP: u32 = 0x179; /// DCA Capability (R) const IA32_PLATFORM_DCA_CAP: u32 = 0x1f8; @@ -98,13 +100,14 @@ mod permitted_architectural_msrs { const IA32_X2APIC_IRR7: u32 = 0x827; const IA32_X2APIC_CUR_COUNT: u32 = 0x839; - pub(super) const READ_ONLY_IA32_MSRS: [u32; 38] = [ + pub(super) const READ_ONLY_IA32_MSRS: [u32; 39] = [ IA32_BARRIER, IA32_MTRRCAP, IA32_FZM_DOMAIN_CONFIG, IA32_FZM_RANGE_STARTADDR, IA32_FZM_RANGE_ENDADDR, IA32_FZM_RANGE_WRITESTATUS, + IA32_MCG_CAP, IA32_PLATFORM_DCA_CAP, IA32_CPU_DCA_CAP, IA32_MCU_STAGING_MBOX_ADDR, @@ -183,6 +186,11 @@ mod permitted_architectural_msrs { /// SYSENTER_ESP_MSR const IA32_SYSENTER_EIP: u32 = 0x176; + // Technically permitted (as users will expect it given that MCA is available via CPUID), + // but probably not very useful since IA32_MCG_CAP will be zeroed out for all non-host + // CPU profiles + const IA32_MCG_STATUS: u32 = 0x17a; + // TODO: Does it really make sense to permit this MSR? const IA32_SMM_MONITOR_CTL: u32 = 0x9b; const _IA32_SMM_MONITOR_CTL_CPUID_CHECK: () = @@ -256,6 +264,146 @@ mod permitted_architectural_msrs { const IA32_MTRR_DEF_TYPE: u32 = 0x2ff; + // Error reporting banks. KVM always reports 32 + // of them by default. 
+ // TODO: Consider conditionally compiling this based + // on whether we are using KVM + const IA32_MC0_CTL: u32 = 0x400; + const IA32_MC0_STATUS: u32 = 0x401; + const IA32_MC0_ADDR: u32 = 0x402; + const IA32_MC0_MISC: u32 = 0x403; + const IA32_MC1_CTL: u32 = 0x404; + const IA32_MC1_STATUS: u32 = 0x405; + const IA32_MC1_ADDR: u32 = 0x406; + + const IA32_MC1_MISC: u32 = 0x407; + const IA32_MC2_CTL: u32 = 0x408; + const IA32_MC2_STATUS: u32 = 0x409; + const IA32_MC2_ADDR: u32 = 0x40a; + const IA32_MC2_MISC: u32 = 0x40b; + const IA32_MC3_CTL: u32 = 0x40c; + const IA32_MC3_STATUS: u32 = 0x40d; + const IA32_MC3_ADDR1: u32 = 0x40e; + const IA32_MC3_MISC: u32 = 0x40f; + const IA32_MC4_CTL: u32 = 0x410; + const IA32_MC4_STATUS: u32 = 0x411; + const IA32_MC4_ADDR: u32 = 0x412; + const IA32_MC4_MISC: u32 = 0x413; + const IA32_MC5_CTL: u32 = 0x414; + const IA32_MC5_STATUS: u32 = 0x415; + const IA32_MC5_ADDR: u32 = 0x416; + const IA32_MC5_MISC: u32 = 0x417; + const IA32_MC6_CTL: u32 = 0x418; + + const IA32_MC6_STATUS: u32 = 0x419; + const IA32_MC6_ADDR1: u32 = 0x41a; + const IA32_MC6_MISC: u32 = 0x41b; + const IA32_MC7_CTL: u32 = 0x41c; + const IA32_MC7_STATUS: u32 = 0x41d; + const IA32_MC7_ADDR: u32 = 0x41e; + const IA32_MC7_MISC: u32 = 0x41f; + const IA32_MC8_CTL: u32 = 0x420; + const IA32_MC8_STATUS: u32 = 0x421; + const IA32_MC8_ADDR: u32 = 0x422; + const IA32_MC8_MISC: u32 = 0x423; + const IA32_MC9_CTL: u32 = 0x424; + const IA32_MC9_STATUS: u32 = 0x425; + const IA32_MC9_ADDR: u32 = 0x426; + const IA32_MC9_MISC: u32 = 0x427; + const IA32_MC10_CTL: u32 = 0x428; + const IA32_MC10_STATUS: u32 = 0x429; + const IA32_MC10_ADDR: u32 = 0x42a; + const IA32_MC10_MISC: u32 = 0x42b; + + const IA32_MC11_CTL: u32 = 0x42c; + const IA32_MC11_STATUS: u32 = 0x42d; + const IA32_MC11_ADDR: u32 = 0x42e; + const IA32_MC11_MISC: u32 = 0x42f; + const IA32_MC12_CTL: u32 = 0x430; + const IA32_MC12_STATUS: u32 = 0x431; + const IA32_MC12_ADDR: u32 = 0x432; + const IA32_MC12_MISC: u32 = 0x433; + 
const IA32_MC13_CTL: u32 = 0x434; + const IA32_MC13_STATUS: u32 = 0x435; + const IA32_MC13_ADDR: u32 = 0x436; + const IA32_MC13_MISC: u32 = 0x437; + const IA32_MC14_CTL: u32 = 0x438; + const IA32_MC14_STATUS: u32 = 0x439; + const IA32_MC14_ADDR: u32 = 0x43a; + const IA32_MC14_MISC: u32 = 0x43b; + const IA32_MC15_CTL: u32 = 0x43c; + const IA32_MC15_STATUS: u32 = 0x43d; + + const IA32_MC15_ADDR: u32 = 0x43e; + const IA32_MC15_MISC: u32 = 0x43f; + const IA32_MC16_CTL: u32 = 0x440; + const IA32_MC16_STATUS: u32 = 0x441; + const IA32_MC16_ADDR: u32 = 0x442; + const IA32_MC16_MISC: u32 = 0x443; + const IA32_MC17_CTL: u32 = 0x444; + const IA32_MC17_STATUS: u32 = 0x445; + const IA32_MC17_ADDR: u32 = 0x446; + const IA32_MC17_MISC: u32 = 0x447; + const IA32_MC18_CTL: u32 = 0x448; + const IA32_MC18_STATUS: u32 = 0x449; + const IA32_MC18_ADDR: u32 = 0x44a; + const IA32_MC18_MISC: u32 = 0x44b; + const IA32_MC19_CTL: u32 = 0x44c; + const IA32_MC19_STATUS: u32 = 0x44d; + const IA32_MC19_ADDR: u32 = 0x44e; + const IA32_MC19_MISC: u32 = 0x44f; + const IA32_MC20_CTL: u32 = 0x450; + + const IA32_MC20_STATUS: u32 = 0x451; + const IA32_MC20_ADDR: u32 = 0x452; + const IA32_MC20_MISC: u32 = 0x453; + const IA32_MC21_CTL: u32 = 0x454; + const IA32_MC21_STATUS: u32 = 0x455; + const IA32_MC21_ADDR: u32 = 0x456; + const IA32_MC21_MISC: u32 = 0x457; + const IA32_MC22_CTL: u32 = 0x458; + const IA32_MC22_STATUS: u32 = 0x459; + const IA32_MC22_ADDR: u32 = 0x45a; + const IA32_MC22_MISC: u32 = 0x45b; + const IA32_MC23_CTL: u32 = 0x45c; + const IA32_MC23_STATUS: u32 = 0x45d; + const IA32_MC23_ADDR: u32 = 0x45e; + const IA32_MC23_MISC: u32 = 0x45f; + const IA32_MC24_CTL: u32 = 0x460; + const IA32_MC24_STATUS: u32 = 0x461; + const IA32_MC24_ADDR: u32 = 0x462; + + const IA32_MC24_MISC: u32 = 0x463; + const IA32_MC25_CTL: u32 = 0x464; + const IA32_MC25_STATUS: u32 = 0x465; + const IA32_MC25_ADDR: u32 = 0x466; + const IA32_MC25_MISC: u32 = 0x467; + const IA32_MC26_CTL: u32 = 0x468; + const 
IA32_MC26_STATUS: u32 = 0x469; + const IA32_MC26_ADDR: u32 = 0x46a; + const IA32_MC26_MISC: u32 = 0x46b; + const IA32_MC27_CTL: u32 = 0x46c; + const IA32_MC27_STATUS: u32 = 0x46d; + const IA32_MC27_ADDR: u32 = 0x46e; + const IA32_MC27_MISC: u32 = 0x46f; + const IA32_MC28_CTL: u32 = 0x470; + const IA32_MC28_STATUS: u32 = 0x471; + const IA32_MC28_ADDR: u32 = 0x472; + const IA32_MC28_MISC: u32 = 0x473; + const IA32_MC29_CTL: u32 = 0x474; + const IA32_MC29_STATUS: u32 = 0x475; + + const IA32_MC29_ADDR: u32 = 0x476; + const IA32_MC29_MISC: u32 = 0x477; + const IA32_MC30_CTL: u32 = 0x478; + const IA32_MC30_STATUS: u32 = 0x479; + const IA32_MC30_ADDR: u32 = 0x47a; + const IA32_MC30_MISC: u32 = 0x47b; + const IA32_MC31_CTL: u32 = 0x47c; + const IA32_MC31_STATUS: u32 = 0x47d; + const IA32_MC31_ADDR: u32 = 0x47e; + const IA32_MC31_MISC: u32 = 0x47f; + const IA32_U_CET: u32 = 0x6a0; const IA32_S_CET: u32 = 0x6a2; @@ -316,7 +464,7 @@ mod permitted_architectural_msrs { register: CpuidReg::ECX, }); - pub(super) const READ_WRITE_IA32_MSRS: [u32; 73] = [ + pub(super) const READ_WRITE_IA32_MSRS: [u32; 202] = [ IA32_TIME_STAMP_COUNTER, IA32_APIC_BASE, IA32_FEATURE_CONTROL, @@ -326,6 +474,7 @@ mod permitted_architectural_msrs { IA32_SYSENTER_CS, IA32_SYSENTER_ESP, IA32_SYSENTER_EIP, + IA32_MCG_STATUS, IA32_SMM_MONITOR_CTL, IA32_MISC_ENABLE, IA32_XFD, @@ -364,6 +513,134 @@ mod permitted_architectural_msrs { IA32_MTRR_FIX4K_F8000, IA32_PAT, IA32_MTRR_DEF_TYPE, + IA32_MC0_CTL, + IA32_MC0_STATUS, + IA32_MC0_ADDR, + IA32_MC0_MISC, + IA32_MC1_CTL, + IA32_MC1_STATUS, + IA32_MC1_ADDR, + IA32_MC1_MISC, + IA32_MC2_CTL, + IA32_MC2_STATUS, + IA32_MC2_ADDR, + IA32_MC2_MISC, + IA32_MC3_CTL, + IA32_MC3_STATUS, + IA32_MC3_ADDR1, + IA32_MC3_MISC, + IA32_MC4_CTL, + IA32_MC4_STATUS, + IA32_MC4_ADDR, + IA32_MC4_MISC, + IA32_MC5_CTL, + IA32_MC5_STATUS, + IA32_MC5_ADDR, + IA32_MC5_MISC, + IA32_MC6_CTL, + IA32_MC6_STATUS, + IA32_MC6_ADDR1, + IA32_MC6_MISC, + IA32_MC7_CTL, + IA32_MC7_STATUS, + 
IA32_MC7_ADDR, + IA32_MC7_MISC, + IA32_MC8_CTL, + IA32_MC8_STATUS, + IA32_MC8_ADDR, + IA32_MC8_MISC, + IA32_MC9_CTL, + IA32_MC9_STATUS, + IA32_MC9_ADDR, + IA32_MC9_MISC, + IA32_MC10_CTL, + IA32_MC10_STATUS, + IA32_MC10_ADDR, + IA32_MC10_MISC, + IA32_MC11_CTL, + IA32_MC11_STATUS, + IA32_MC11_ADDR, + IA32_MC11_MISC, + IA32_MC12_CTL, + IA32_MC12_STATUS, + IA32_MC12_ADDR, + IA32_MC12_MISC, + IA32_MC13_CTL, + IA32_MC13_STATUS, + IA32_MC13_ADDR, + IA32_MC13_MISC, + IA32_MC14_CTL, + IA32_MC14_STATUS, + IA32_MC14_ADDR, + IA32_MC14_MISC, + IA32_MC15_CTL, + IA32_MC15_STATUS, + IA32_MC15_ADDR, + IA32_MC15_MISC, + IA32_MC16_CTL, + IA32_MC16_STATUS, + IA32_MC16_ADDR, + IA32_MC16_MISC, + IA32_MC17_CTL, + IA32_MC17_STATUS, + IA32_MC17_ADDR, + IA32_MC17_MISC, + IA32_MC18_CTL, + IA32_MC18_STATUS, + IA32_MC18_ADDR, + IA32_MC18_MISC, + IA32_MC19_CTL, + IA32_MC19_STATUS, + IA32_MC19_ADDR, + IA32_MC19_MISC, + IA32_MC20_CTL, + IA32_MC20_STATUS, + IA32_MC20_ADDR, + IA32_MC20_MISC, + IA32_MC21_CTL, + IA32_MC21_STATUS, + IA32_MC21_ADDR, + IA32_MC21_MISC, + IA32_MC22_CTL, + IA32_MC22_STATUS, + IA32_MC22_ADDR, + IA32_MC22_MISC, + IA32_MC23_CTL, + IA32_MC23_STATUS, + IA32_MC23_ADDR, + IA32_MC23_MISC, + IA32_MC24_CTL, + IA32_MC24_STATUS, + IA32_MC24_ADDR, + IA32_MC24_MISC, + IA32_MC25_CTL, + IA32_MC25_STATUS, + IA32_MC25_ADDR, + IA32_MC25_MISC, + IA32_MC26_CTL, + IA32_MC26_STATUS, + IA32_MC26_ADDR, + IA32_MC26_MISC, + IA32_MC27_CTL, + IA32_MC27_STATUS, + IA32_MC27_ADDR, + IA32_MC27_MISC, + IA32_MC28_CTL, + IA32_MC28_STATUS, + IA32_MC28_ADDR, + IA32_MC28_MISC, + IA32_MC29_CTL, + IA32_MC29_STATUS, + IA32_MC29_ADDR, + IA32_MC29_MISC, + IA32_MC30_CTL, + IA32_MC30_STATUS, + IA32_MC30_ADDR, + IA32_MC30_MISC, + IA32_MC31_CTL, + IA32_MC31_STATUS, + IA32_MC31_ADDR, + IA32_MC31_MISC, IA32_U_CET, IA32_S_CET, IA32_TSC_DEADLINE, @@ -433,8 +710,8 @@ mod permitted_architectural_msrs { /// /// The MSRs listed here can be studied further in Table 2.2 in Section 2.1 of the Intel SDM /// Vol. 
4 from October 2025 - pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 115] = const { - let mut permitted = [0u32; 115]; + pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 245] = const { + let mut permitted = [0u32; 245]; let read_only_len = READ_ONLY_IA32_MSRS.len(); let write_only_len = WRITE_ONLY_IA32_MSRS.len(); let read_write_len = READ_WRITE_IA32_MSRS.len(); @@ -569,10 +846,6 @@ mod forbidden_architectural_msrs { // NOTE: IA32_MCU_OPT_CTRL must necessarily be available, due to // what we set in CPUID for some CPU profiles (inherit policy) - const IA32_MCG_CAP: (u32, u32) = (0x179, 0x179); - - const IA32_MCG_STATUS: (u32, u32) = (0x17a, 0x17a); - const IA32_MCG_CTL: (u32, u32) = (0x17b, 0x17b); // TODO: 0x180- 0x185 is reserved, we should not list these MSRS at all @@ -652,142 +925,6 @@ mod forbidden_architectural_msrs { // should be disabled for non-host CPU profiles. const IA32_PEBS_ENABLE: (u32, u32) = (0x3f1, 0x3f1); - const IA32_MC0_CTL: (u32, u32) = (0x400, 0x400); - const IA32_MC0_STATUS: (u32, u32) = (0x401, 0x401); - const IA32_MC0_ADDR: (u32, u32) = (0x402, 0x402); - const IA32_MC0_MISC: (u32, u32) = (0x403, 0x403); - const IA32_MC1_CTL: (u32, u32) = (0x404, 0x404); - const IA32_MC1_STATUS: (u32, u32) = (0x405, 0x405); - const IA32_MC1_ADDR: (u32, u32) = (0x406, 0x406); - - const IA32_MC1_MISC: (u32, u32) = (0x407, 0x407); - const IA32_MC2_CTL: (u32, u32) = (0x408, 0x408); - const IA32_MC2_STATUS: (u32, u32) = (0x409, 0x409); - const IA32_MC2_ADDR: (u32, u32) = (0x40a, 0x40a); - const IA32_MC2_MISC: (u32, u32) = (0x40b, 0x40b); - const IA32_MC3_CTL: (u32, u32) = (0x40c, 0x40c); - const IA32_MC3_STATUS: (u32, u32) = (0x40d, 0x40d); - const IA32_MC3_ADDR1: (u32, u32) = (0x40e, 0x40e); - const IA32_MC3_MISC: (u32, u32) = (0x40f, 0x40f); - const IA32_MC4_CTL: (u32, u32) = (0x410, 0x410); - const IA32_MC4_STATUS: (u32, u32) = (0x411, 0x411); - const IA32_MC4_ADDR: (u32, u32) = (0x412, 0x412); - const IA32_MC4_MISC: (u32, u32) = (0x413, 0x413); 
- const IA32_MC5_CTL: (u32, u32) = (0x414, 0x414); - const IA32_MC5_STATUS: (u32, u32) = (0x415, 0x415); - const IA32_MC5_ADDR: (u32, u32) = (0x416, 0x416); - const IA32_MC5_MISC: (u32, u32) = (0x417, 0x417); - const IA32_MC6_CTL: (u32, u32) = (0x418, 0x418); - - const IA32_MC6_STATUS: (u32, u32) = (0x419, 0x419); - const IA32_MC6_ADDR1: (u32, u32) = (0x41a, 0x41a); - const IA32_MC6_MISC: (u32, u32) = (0x41b, 0x41b); - const IA32_MC7_CTL: (u32, u32) = (0x41c, 0x41c); - const IA32_MC7_STATUS: (u32, u32) = (0x41d, 0x41d); - const IA32_MC7_ADDR: (u32, u32) = (0x41e, 0x41e); - const IA32_MC7_MISC: (u32, u32) = (0x41f, 0x41f); - const IA32_MC8_CTL: (u32, u32) = (0x420, 0x420); - const IA32_MC8_STATUS: (u32, u32) = (0x421, 0x421); - const IA32_MC8_ADDR: (u32, u32) = (0x422, 0x422); - const IA32_MC8_MISC: (u32, u32) = (0x423, 0x423); - const IA32_MC9_CTL: (u32, u32) = (0x424, 0x424); - const IA32_MC9_STATUS: (u32, u32) = (0x425, 0x425); - const IA32_MC9_ADDR: (u32, u32) = (0x426, 0x426); - const IA32_MC9_MISC: (u32, u32) = (0x427, 0x427); - const IA32_MC10_CTL: (u32, u32) = (0x428, 0x428); - const IA32_MC10_STATUS: (u32, u32) = (0x429, 0x429); - const IA32_MC10_ADDR: (u32, u32) = (0x42a, 0x42a); - const IA32_MC10_MISC: (u32, u32) = (0x42b, 0x42b); - - const IA32_MC11_CTL: (u32, u32) = (0x42c, 0x42c); - const IA32_MC11_STATUS: (u32, u32) = (0x42d, 0x42d); - const IA32_MC11_ADDR: (u32, u32) = (0x42e, 0x42e); - const IA32_MC11_MISC: (u32, u32) = (0x42f, 0x42f); - const IA32_MC12_CTL: (u32, u32) = (0x430, 0x430); - const IA32_MC12_STATUS: (u32, u32) = (0x431, 0x431); - const IA32_MC12_ADDR: (u32, u32) = (0x432, 0x432); - const IA32_MC12_MISC: (u32, u32) = (0x433, 0x433); - const IA32_MC13_CTL: (u32, u32) = (0x434, 0x434); - const IA32_MC13_STATUS: (u32, u32) = (0x435, 0x435); - const IA32_MC13_ADDR: (u32, u32) = (0x436, 0x436); - const IA32_MC13_MISC: (u32, u32) = (0x437, 0x437); - const IA32_MC14_CTL: (u32, u32) = (0x438, 0x438); - const IA32_MC14_STATUS: (u32, u32) = 
(0x439, 0x439); - const IA32_MC14_ADDR: (u32, u32) = (0x43a, 0x43a); - const IA32_MC14_MISC: (u32, u32) = (0x43b, 0x43b); - const IA32_MC15_CTL: (u32, u32) = (0x43c, 0x43c); - const IA32_MC15_STATUS: (u32, u32) = (0x43d, 0x43d); - - const IA32_MC15_ADDR: (u32, u32) = (0x43e, 0x43e); - const IA32_MC15_MISC: (u32, u32) = (0x43f, 0x43f); - const IA32_MC16_CTL: (u32, u32) = (0x440, 0x440); - const IA32_MC16_STATUS: (u32, u32) = (0x441, 0x441); - const IA32_MC16_ADDR: (u32, u32) = (0x442, 0x442); - const IA32_MC16_MISC: (u32, u32) = (0x443, 0x443); - const IA32_MC17_CTL: (u32, u32) = (0x444, 0x444); - const IA32_MC17_STATUS: (u32, u32) = (0x445, 0x445); - const IA32_MC17_ADDR: (u32, u32) = (0x446, 0x446); - const IA32_MC17_MISC: (u32, u32) = (0x447, 0x447); - const IA32_MC18_CTL: (u32, u32) = (0x448, 0x448); - const IA32_MC18_STATUS: (u32, u32) = (0x449, 0x449); - const IA32_MC18_ADDR: (u32, u32) = (0x44a, 0x44a); - const IA32_MC18_MISC: (u32, u32) = (0x44b, 0x44b); - const IA32_MC19_CTL: (u32, u32) = (0x44c, 0x44c); - const IA32_MC19_STATUS: (u32, u32) = (0x44d, 0x44d); - const IA32_MC19_ADDR: (u32, u32) = (0x44e, 0x44e); - const IA32_MC19_MISC: (u32, u32) = (0x44f, 0x44f); - const IA32_MC20_CTL: (u32, u32) = (0x450, 0x450); - - const IA32_MC20_STATUS: (u32, u32) = (0x451, 0x451); - const IA32_MC20_ADDR: (u32, u32) = (0x452, 0x452); - const IA32_MC20_MISC: (u32, u32) = (0x453, 0x453); - const IA32_MC21_CTL: (u32, u32) = (0x454, 0x454); - const IA32_MC21_STATUS: (u32, u32) = (0x455, 0x455); - const IA32_MC21_ADDR: (u32, u32) = (0x456, 0x456); - const IA32_MC21_MISC: (u32, u32) = (0x457, 0x457); - const IA32_MC22_CTL: (u32, u32) = (0x458, 0x458); - const IA32_MC22_STATUS: (u32, u32) = (0x459, 0x459); - const IA32_MC22_ADDR: (u32, u32) = (0x45a, 0x45a); - const IA32_MC22_MISC: (u32, u32) = (0x45b, 0x45b); - const IA32_MC23_CTL: (u32, u32) = (0x45c, 0x45c); - const IA32_MC23_STATUS: (u32, u32) = (0x45d, 0x45d); - const IA32_MC23_ADDR: (u32, u32) = (0x45e, 0x45e); - const 
IA32_MC23_MISC: (u32, u32) = (0x45f, 0x45f); - const IA32_MC24_CTL: (u32, u32) = (0x460, 0x460); - const IA32_MC24_STATUS: (u32, u32) = (0x461, 0x461); - const IA32_MC24_ADDR: (u32, u32) = (0x462, 0x462); - - const IA32_MC24_MISC: (u32, u32) = (0x463, 0x463); - const IA32_MC25_CTL: (u32, u32) = (0x464, 0x464); - const IA32_MC25_STATUS: (u32, u32) = (0x465, 0x465); - const IA32_MC25_ADDR: (u32, u32) = (0x466, 0x466); - const IA32_MC25_MISC: (u32, u32) = (0x467, 0x467); - const IA32_MC26_CTL: (u32, u32) = (0x468, 0x468); - const IA32_MC26_STATUS: (u32, u32) = (0x469, 0x469); - const IA32_MC26_ADDR: (u32, u32) = (0x46a, 0x46a); - const IA32_MC26_MISC: (u32, u32) = (0x46b, 0x46b); - const IA32_MC27_CTL: (u32, u32) = (0x46c, 0x46c); - const IA32_MC27_STATUS: (u32, u32) = (0x46d, 0x46d); - const IA32_MC27_ADDR: (u32, u32) = (0x46e, 0x46e); - const IA32_MC27_MISC: (u32, u32) = (0x46f, 0x46f); - const IA32_MC28_CTL: (u32, u32) = (0x470, 0x470); - const IA32_MC28_STATUS: (u32, u32) = (0x471, 0x471); - const IA32_MC28_ADDR: (u32, u32) = (0x472, 0x472); - const IA32_MC28_MISC: (u32, u32) = (0x473, 0x473); - const IA32_MC29_CTL: (u32, u32) = (0x474, 0x474); - const IA32_MC29_STATUS: (u32, u32) = (0x475, 0x475); - - const IA32_MC29_ADDR: (u32, u32) = (0x476, 0x476); - const IA32_MC29_MISC: (u32, u32) = (0x477, 0x477); - const IA32_MC30_CTL: (u32, u32) = (0x478, 0x478); - const IA32_MC30_STATUS: (u32, u32) = (0x479, 0x479); - const IA32_MC30_ADDR: (u32, u32) = (0x47a, 0x47a); - const IA32_MC30_MISC: (u32, u32) = (0x47b, 0x47b); - const IA32_MC31_CTL: (u32, u32) = (0x47c, 0x47c); - const IA32_MC31_STATUS: (u32, u32) = (0x47d, 0x47d); - const IA32_MC31_ADDR: (u32, u32) = (0x47e, 0x47e); - const IA32_MC31_MISC: (u32, u32) = (0x47f, 0x47f); - const IA32_A_PMC0: (u32, u32) = (0x4c1, 0x4c1); const IA32_A_PMC1: (u32, u32) = (0x4c2, 0x4c2); const IA32_A_PMC2: (u32, u32) = (0x4c3, 0x4c3); @@ -1133,7 +1270,7 @@ mod forbidden_architectural_msrs { const IA32_UARCH_MISC_CTL: (u32, u32) = 
(0x1b01, 0x1b01); /// A list of ARCHITECTURAL MSR register addresses that are forbidden for all non-host CPU profiles and also not /// considered MSR-based FEATURE indices by KVM. - pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 356] = [ + pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 226] = [ IA32_P5_MC_ADDR, IA32_P5_MC_TYPE, // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE @@ -1195,17 +1332,6 @@ mod forbidden_architectural_msrs { // TODO: Don't know about IA32_SYSENTER_CS, IA32_SYSENTER_ESP, // IA32_SYSENTER_EIP // - - // TODO: Not sure if we can/should deny this MSR, but - // it doesn't really make sense to have it available in - // a virtualized environment - // - // If we keep it denied we should document that - // even for 06_01H one cannot rely on the existence of this MSR - IA32_MCG_CAP, - // TODO: Also not sure if we may deny this MSR - IA32_MCG_STATUS, - // TODO: Can we deny this? IA32_MCG_CTL, // TODO: 0x180- 0x185 is reserved, we should not list these MSRS at all /// Disabled via CPUID for all non-host CPU profiles @@ -1263,134 +1389,6 @@ mod forbidden_architectural_msrs { // TODO: Not sure about this one, but seems to be related to performance monitoring which // should be disabled for non-host CPU profiles. 
IA32_PEBS_ENABLE, - IA32_MC0_CTL, - IA32_MC0_STATUS, - IA32_MC0_ADDR, - IA32_MC0_MISC, - IA32_MC1_CTL, - IA32_MC1_STATUS, - IA32_MC1_ADDR, - IA32_MC1_MISC, - IA32_MC2_CTL, - IA32_MC2_STATUS, - IA32_MC2_ADDR, - IA32_MC2_MISC, - IA32_MC3_CTL, - IA32_MC3_STATUS, - IA32_MC3_ADDR1, - IA32_MC3_MISC, - IA32_MC4_CTL, - IA32_MC4_STATUS, - IA32_MC4_ADDR, - IA32_MC4_MISC, - IA32_MC5_CTL, - IA32_MC5_STATUS, - IA32_MC5_ADDR, - IA32_MC5_MISC, - IA32_MC6_CTL, - IA32_MC6_STATUS, - IA32_MC6_ADDR1, - IA32_MC6_MISC, - IA32_MC7_CTL, - IA32_MC7_STATUS, - IA32_MC7_ADDR, - IA32_MC7_MISC, - IA32_MC8_CTL, - IA32_MC8_STATUS, - IA32_MC8_ADDR, - IA32_MC8_MISC, - IA32_MC9_CTL, - IA32_MC9_STATUS, - IA32_MC9_ADDR, - IA32_MC9_MISC, - IA32_MC10_CTL, - IA32_MC10_STATUS, - IA32_MC10_ADDR, - IA32_MC10_MISC, - IA32_MC11_CTL, - IA32_MC11_STATUS, - IA32_MC11_ADDR, - IA32_MC11_MISC, - IA32_MC12_CTL, - IA32_MC12_STATUS, - IA32_MC12_ADDR, - IA32_MC12_MISC, - IA32_MC13_CTL, - IA32_MC13_STATUS, - IA32_MC13_ADDR, - IA32_MC13_MISC, - IA32_MC14_CTL, - IA32_MC14_STATUS, - IA32_MC14_ADDR, - IA32_MC14_MISC, - IA32_MC15_CTL, - IA32_MC15_STATUS, - IA32_MC15_ADDR, - IA32_MC15_MISC, - IA32_MC16_CTL, - IA32_MC16_STATUS, - IA32_MC16_ADDR, - IA32_MC16_MISC, - IA32_MC17_CTL, - IA32_MC17_STATUS, - IA32_MC17_ADDR, - IA32_MC17_MISC, - IA32_MC18_CTL, - IA32_MC18_STATUS, - IA32_MC18_ADDR, - IA32_MC18_MISC, - IA32_MC19_CTL, - IA32_MC19_STATUS, - IA32_MC19_ADDR, - IA32_MC19_MISC, - IA32_MC20_CTL, - IA32_MC20_STATUS, - IA32_MC20_ADDR, - IA32_MC20_MISC, - IA32_MC21_CTL, - IA32_MC21_STATUS, - IA32_MC21_ADDR, - IA32_MC21_MISC, - IA32_MC22_CTL, - IA32_MC22_STATUS, - IA32_MC22_ADDR, - IA32_MC22_MISC, - IA32_MC23_CTL, - IA32_MC23_STATUS, - IA32_MC23_ADDR, - IA32_MC23_MISC, - IA32_MC24_CTL, - IA32_MC24_STATUS, - IA32_MC24_ADDR, - IA32_MC24_MISC, - IA32_MC25_CTL, - IA32_MC25_STATUS, - IA32_MC25_ADDR, - IA32_MC25_MISC, - IA32_MC26_CTL, - IA32_MC26_STATUS, - IA32_MC26_ADDR, - IA32_MC26_MISC, - IA32_MC27_CTL, - IA32_MC27_STATUS, - 
IA32_MC27_ADDR, - IA32_MC27_MISC, - IA32_MC28_CTL, - IA32_MC28_STATUS, - IA32_MC28_ADDR, - IA32_MC28_MISC, - IA32_MC29_CTL, - IA32_MC29_STATUS, - IA32_MC29_ADDR, - IA32_MC29_MISC, - IA32_MC30_CTL, - IA32_MC30_STATUS, - IA32_MC30_ADDR, - IA32_MC30_MISC, - IA32_MC31_CTL, - IA32_MC31_STATUS, - IA32_MC31_ADDR, - IA32_MC31_MISC, IA32_A_PMC0, IA32_A_PMC1, IA32_A_PMC2, From 1f1ddf5f5e99cd486ac4a19250ca6bc6273f8f03 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Wed, 18 Mar 2026 10:32:10 +0100 Subject: [PATCH 163/178] arch: Regenerate CPU profiles Regenerate CPU profiles in order to enable machine check architecture (MCA) for non-host CPU profiles which is required to boot Windows server. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../cpu_profiles/sapphire-rapids.cpuid.json | 58 ++++++++++++++++++- .../cpu_profiles/sapphire-rapids.msr.json | 31 +++++++++- .../x86_64/cpu_profiles/skylake.cpuid.json | 2 +- arch/src/x86_64/cpu_profiles/skylake.msr.json | 31 +++++++++- 4 files changed, 118 insertions(+), 4 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json index 5c857f2eee..b3389bf947 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -110,7 +110,7 @@ "register": "EDX" }, { - "replacements": "0x078bbbff", + "replacements": "0x078bfbff", "mask": "0x08000000" } ], @@ -1393,6 +1393,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0xb", + "end": "0xb" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", "end": "0xc" }, "register": "EAX" @@ -1407,6 +1421,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0xb", + "end": "0xb" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", 
"end": "0xc" }, "register": "EBX" @@ -1421,6 +1449,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0xb", + "end": "0xb" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", "end": "0xc" }, "register": "ECX" @@ -1435,6 +1477,20 @@ "leaf": "0xd", "sub_leaf": { "start": "0xb", + "end": "0xb" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xc", "end": "0xc" }, "register": "EDX" diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json index 0798ec9a43..cd2b7a5a97 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json @@ -113,10 +113,39 @@ "0x174", "0x175", "0x176", + "0x17a", "0x1a0", "0x1c4", "0x1c5", + "0x200", + "0x201", + "0x202", + "0x203", + "0x204", + "0x205", + "0x206", + "0x207", + "0x208", + "0x209", + "0x20a", + "0x20b", + "0x20c", + "0x20d", + "0x20e", + "0x20f", + "0x250", + "0x258", + "0x259", + "0x268", + "0x269", + "0x26a", + "0x26b", + "0x26c", + "0x26d", + "0x26e", + "0x26f", "0x277", + "0x2ff", "0x480", "0x481", "0x482", @@ -174,4 +203,4 @@ "0xc0000102", "0xc0000103" ] -} \ No newline at end of file +} diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json index 3cdd9966e1..48c3f94ccd 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -110,7 +110,7 @@ "register": "EDX" }, { - "replacements": "0x078bbbff", + "replacements": "0x078bfbff", "mask": "0x08000000" } ], diff --git a/arch/src/x86_64/cpu_profiles/skylake.msr.json b/arch/src/x86_64/cpu_profiles/skylake.msr.json index d561dffc92..5cc3398dc1 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.msr.json +++ 
b/arch/src/x86_64/cpu_profiles/skylake.msr.json @@ -113,8 +113,37 @@ "0x174", "0x175", "0x176", + "0x17a", "0x1a0", + "0x200", + "0x201", + "0x202", + "0x203", + "0x204", + "0x205", + "0x206", + "0x207", + "0x208", + "0x209", + "0x20a", + "0x20b", + "0x20c", + "0x20d", + "0x20e", + "0x20f", + "0x250", + "0x258", + "0x259", + "0x268", + "0x269", + "0x26a", + "0x26b", + "0x26c", + "0x26d", + "0x26e", + "0x26f", "0x277", + "0x2ff", "0x480", "0x481", "0x482", @@ -172,4 +201,4 @@ "0xc0000102", "0xc0000103" ] -} \ No newline at end of file +} From 683d613689ab4dc6ba5b3b39a08c74943cd6a617 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Mon, 23 Mar 2026 15:24:18 +0100 Subject: [PATCH 164/178] arch: Forbid IA32_U_CET and IA32_S_CET in CPU Profiles These are already displayed as not available to guests via CPUID for non-host CPU profiles, but we forgot to forbid the corresponding MSRs. The profiles we have generated are OK with respect to this oversight because KVM_GET_MSR_INDEX_LIST did not report those MSRs at the time they were generated, but it does now.
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../intel/architectural_msrs.rs | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs index 9a217a3845..6fd61cc3d2 100644 --- a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs +++ b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs @@ -404,9 +404,6 @@ mod permitted_architectural_msrs { const IA32_MC31_ADDR: u32 = 0x47e; const IA32_MC31_MISC: u32 = 0x47f; - const IA32_U_CET: u32 = 0x6a0; - const IA32_S_CET: u32 = 0x6a2; - const IA32_TSC_DEADLINE: u32 = 0x6e0; const _IA32_TSC_DEADLINE_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<24>(&Parameters { @@ -464,7 +461,7 @@ mod permitted_architectural_msrs { register: CpuidReg::ECX, }); - pub(super) const READ_WRITE_IA32_MSRS: [u32; 202] = [ + pub(super) const READ_WRITE_IA32_MSRS: [u32; 200] = [ IA32_TIME_STAMP_COUNTER, IA32_APIC_BASE, IA32_FEATURE_CONTROL, @@ -641,8 +638,6 @@ mod permitted_architectural_msrs { IA32_MC31_STATUS, IA32_MC31_ADDR, IA32_MC31_MISC, - IA32_U_CET, - IA32_S_CET, IA32_TSC_DEADLINE, IA32_X2APIC_TPR, IA32_X2APIC_SIVR, @@ -710,8 +705,8 @@ mod permitted_architectural_msrs { /// /// The MSRs listed here can be studied further in Table 2.2 in Section 2.1 of the Intel SDM /// Vol. 
4 from October 2025 - pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 245] = const { - let mut permitted = [0u32; 245]; + pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 243] = const { + let mut permitted = [0u32; 243]; let read_only_len = READ_ONLY_IA32_MSRS.len(); let write_only_len = WRITE_ONLY_IA32_MSRS.len(); let read_write_len = READ_WRITE_IA32_MSRS.len(); @@ -968,6 +963,11 @@ mod forbidden_architectural_msrs { // Disabled via CPUID for non-host CPU profiles const IA32_DS_AREA: (u32, u32) = (0x600, 0x600); + // U_CET and S_CET are disabled via CPUID + // TODO: Include compile time checks for that + const IA32_U_CET: (u32, u32) = (0x6a0, 0x6a0); + const IA32_S_CET: (u32, u32) = (0x6a2, 0x6a2); + // TODO: IA32_TSC_DEADLINE should be available because the TSC_DEADLINE CPUID bit // is set by CHV unconditionally. The availability of this MSR probably needs to be // handled by CHV itself and not the CPU profiles @@ -1270,7 +1270,7 @@ mod forbidden_architectural_msrs { const IA32_UARCH_MISC_CTL: (u32, u32) = (0x1b01, 0x1b01); /// A list of ARCHITECTURAL MSR register addresses that are forbidden for all non-host CPU profiles and also not /// considered MSR-based FEATURE indices by KVM. 
- pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 226] = [ + pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 228] = [ IA32_P5_MC_ADDR, IA32_P5_MC_TYPE, // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE @@ -1422,6 +1422,8 @@ mod forbidden_architectural_msrs { IA32_RTIT_ADDR3_B, // Disabled via CPUID for non-host CPU profiles IA32_DS_AREA, + IA32_U_CET, + IA32_S_CET, // Disabled via CPUID for non-host CPU profiles IA32_PKRS, // Disabled via CPUID for non-host CPU profiles From fa08c472e7b878d81b2dd625e9de21751dd9f04d Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 16:20:17 +0100 Subject: [PATCH 165/178] arch: Disable HDC state components for CPU profiles Hardware duty cycling (HDC) does not make sense in the virtualization setting and should thus not be displayed as available to guests. We have already disabled certain HDC aspects via CPUID 0x6 ECX[13], but we forgot to disable the state components which is what we do in this commit. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../cpu_profiles/sapphire-rapids.cpuid.json | 16 ++++- .../x86_64/cpu_profiles/skylake.cpuid.json | 14 +++++ arch/src/x86_64/cpuid_definitions/intel.rs | 60 +++++++++---------- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json index b3389bf947..2f8ce47814 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -1542,6 +1542,20 @@ "mask": "0x00000000" } ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], [ { "leaf": "0xd", @@ -3251,4 +3265,4 @@ } ] ] -} \ No newline at end of file +} diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json index 48c3f94ccd..9aaa83a1de 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -1542,6 +1542,20 @@ "mask": "0x00000000" } ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xd", + "end": "0xd" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], [ { "leaf": "0xd", diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index ee944bbf9d..7c173b9c16 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -39,7 +39,7 @@ use super::{ /// a few of the short names and descriptions to be more inline with what is written in the /// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every /// single [`ValueDefinition`] and manually appended those. 
-pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<168> = const { CpuidDefinitions([ // ========================================================================================= // Basic CPUID Information @@ -2409,10 +2409,10 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { policy: ProfilePolicy::Static(0), }, ValueDefinition { - short: "xcr0_ia32_xss_bits", + short: "xcr0_ia32_xss_hdc", description: "XCR0.IA32_XSS (bit 13) used for IA32_XSS", bits_range: (13, 13), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xcr0_ia32_xss_UINTR", @@ -2592,7 +2592,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { short: "xss_hdc", description: "HDC state, supported", bits_range: (13, 13), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xss_uintr", @@ -2855,6 +2855,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { policy: ProfilePolicy::Static(0), }]), ), + // Disable HDC for CPU profiles ( Parameters { leaf: 0xd, @@ -2862,10 +2863,10 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { register: CpuidReg::EAX, }, ValueDefinitions::new(&[ValueDefinition { - short: "xsave_sz", - description: "Size of save area for subleaf-N feature, in bytes", + short: "0xd-13-eax-edc-zero", + description: "This leaf has been zeroed out because CET state components are disabled", bits_range: (0, 31), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }]), ), ( @@ -2875,10 +2876,10 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { register: CpuidReg::EBX, }, ValueDefinitions::new(&[ValueDefinition { - short: "xsave_offset", - description: "Offset of save area for subleaf-N feature, in bytes", + short: "0xd-13-ebx-hdc-zero", + description: "This leaf has been zeroed out because CET state components are 
disabled", bits_range: (0, 31), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }]), ), ( @@ -2887,26 +2888,25 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<167> = const { sub_leaf: RangeInclusive::new(13, 13), register: CpuidReg::ECX, }, - ValueDefinitions::new(&[ - ValueDefinition { - short: "is_xss_bit", - description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", - bits_range: (0, 0), - policy: ProfilePolicy::Inherit, - }, - ValueDefinition { - short: "compacted_xsave_64byte_aligned", - description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", - bits_range: (1, 1), - policy: ProfilePolicy::Inherit, - }, - ValueDefinition { - short: "xfd_faulting", - description: "Indicates support for xfd faulting", - bits_range: (2, 2), - policy: ProfilePolicy::Inherit, - }, - ]), + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-13-ecx-hdc-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(13, 13), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-13-edx-hdc-zero", + description: "This leaf has been zeroed out because CET state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), ), // We decided to disable UINTR for CPU profiles, hence we zero out these sub-leaves ( From 52a392070f1aa733d5fbee60d116e1198d6a2957 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 16:52:05 +0100 Subject: [PATCH 166/178] arch: Disable LBR state components We have already disabled architectural LBR (last branch record) for CPU profiles, but we forgot to disable the corresponding state components. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../cpu_profiles/sapphire-rapids.cpuid.json | 62 +++++++++++++++- .../x86_64/cpu_profiles/skylake.cpuid.json | 60 +++++++++++++++- arch/src/x86_64/cpuid_definitions/intel.rs | 71 +++++++++++++++++-- 3 files changed, 182 insertions(+), 11 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json index 2f8ce47814..4ea61cfa91 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -1617,6 +1617,62 @@ "leaf": "0xd", "sub_leaf": { "start": "0xf", + "end": "0xf" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", "end": "0x10" }, "register": "EAX" @@ -1672,7 +1728,7 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xf", + "start": "0x10", "end": "0x10" }, "register": "EBX" @@ -1728,7 +1784,7 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xf", + "start": "0x10", "end": "0x10" }, "register": "ECX" @@ -3265,4 +3321,4 @@ } ] ] -} +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json index 9aaa83a1de..da9f6b967e 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -1617,7 +1617,7 @@ 
"leaf": "0xd", "sub_leaf": { "start": "0xf", - "end": "0x3f" + "end": "0xf" }, "register": "EAX" }, @@ -1631,7 +1631,7 @@ "leaf": "0xd", "sub_leaf": { "start": "0xf", - "end": "0x3f" + "end": "0xf" }, "register": "EBX" }, @@ -1645,6 +1645,62 @@ "leaf": "0xd", "sub_leaf": { "start": "0xf", + "end": "0xf" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xf", + "end": "0xf" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x3f" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x3f" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", "end": "0x3f" }, "register": "ECX" diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 7c173b9c16..e2502cb565 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -39,7 +39,7 @@ use super::{ /// a few of the short names and descriptions to be more inline with what is written in the /// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every /// single [`ValueDefinition`] and manually appended those. 
-pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<168> = const { +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<172> = const { CpuidDefinitions([ // ========================================================================================= // Basic CPUID Information @@ -2420,10 +2420,16 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<168> = const { bits_range: (14, 14), policy: ProfilePolicy::Static(0), }, + ValueDefinition { + short: "xcr0_ia32_xss_LBR", + description: "XCR0.IA32_XSS (bit 15) used for LBR in IA32_XSS", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, ValueDefinition { short: "xcr0_ia32_xss_bits_15_16", description: "XCR0.IA32_XSS (bit 15 - 16) used for IA32_XSS", - bits_range: (15, 16), + bits_range: (16, 16), policy: ProfilePolicy::Inherit, }, // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. @@ -2604,7 +2610,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<168> = const { short: "xss_lbr", description: "LBR state, supported", bits_range: (15, 15), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xss_hwp", @@ -2961,12 +2967,65 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<168> = const { policy: ProfilePolicy::Static(0), }]), ), + // Disable LBR for CPU Profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-eax-lbr-zero", + description: "This leaf has been zeroed out because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ebx-lbr-zero", + description: "This leaf has been zeroed out 
because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ecx-lbr-zero", + description: "This leaf has been zeroed out because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(15, 15), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-edx-lbr-zero", + description: "This leaf has been zeroed out because LBR state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), // NOTE: Sub-leaves 17 & 18 are AMX related and we will alter the adjustments corresponding to // the policy declared here at runtime for those values. ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(15, 63), + sub_leaf: RangeInclusive::new(16, 63), register: CpuidReg::EAX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2979,7 +3038,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<168> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(15, 63), + sub_leaf: RangeInclusive::new(16, 63), register: CpuidReg::EBX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2992,7 +3051,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<168> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(15, 63), + sub_leaf: RangeInclusive::new(16, 63), register: CpuidReg::ECX, }, ValueDefinitions::new(&[ From 6c31b6e0cd9fa64cbc9f47f9b60f90cb4f4a8d85 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 17:13:29 +0100 Subject: [PATCH 167/178] arch: Disable HWP state components Hardware P-states (HWP) is already disabled for non-host CPU profiles, but we forgot to also disable the associated state components. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../cpu_profiles/sapphire-rapids.cpuid.json | 68 ++++++++++-------- .../x86_64/cpu_profiles/skylake.cpuid.json | 60 +++++++++++++++- arch/src/x86_64/cpuid_definitions/intel.rs | 69 ++++++++++++++++--- 3 files changed, 160 insertions(+), 37 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json index 4ea61cfa91..522c5ce33c 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -1686,13 +1686,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x11", - "end": "0x11" + "start": "0x10", + "end": "0x10" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000040", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1700,13 +1700,41 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x12", - "end": "0x12" + "start": "0x10", + "end": "0x10" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x11" }, "register": "EAX" }, { - "replacements": "0x00002000", + "replacements": "0x00000040", "mask": "0x00000000" } ], @@ -1714,13 +1742,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x13", - "end": "0x3f" + "start": "0x12", + "end": "0x12" }, "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00002000", "mask": "0x00000000" } ], @@ -1728,10 +1756,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x10", - "end": "0x10" + "start": "0x13", + "end": "0x3f" }, - "register": "EBX" + "register": "EAX" }, { "replacements": "0x00000000", @@ -1780,20 +1808,6 @@ "mask": "0x00000000" } ], - [ - { - "leaf": "0xd", - 
"sub_leaf": { - "start": "0x10", - "end": "0x10" - }, - "register": "ECX" - }, - { - "replacements": "0x00000000", - "mask": "0x00000000" - } - ], [ { "leaf": "0xd", diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json index da9f6b967e..4c08a64595 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -1673,7 +1673,7 @@ "leaf": "0xd", "sub_leaf": { "start": "0x10", - "end": "0x3f" + "end": "0x10" }, "register": "EAX" }, @@ -1687,7 +1687,7 @@ "leaf": "0xd", "sub_leaf": { "start": "0x10", - "end": "0x3f" + "end": "0x10" }, "register": "EBX" }, @@ -1701,6 +1701,62 @@ "leaf": "0xd", "sub_leaf": { "start": "0x10", + "end": "0x10" + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x10", + "end": "0x10" + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x3f" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", + "end": "0x3f" + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0x11", "end": "0x3f" }, "register": "ECX" diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index e2502cb565..2fd575d252 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -39,7 +39,7 @@ use super::{ /// a few of the short names and descriptions to be more inline with what is written in the /// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every /// single [`ValueDefinition`] and manually appended those. 
-pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<172> = const { +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { CpuidDefinitions([ // ========================================================================================= // Basic CPUID Information @@ -2427,10 +2427,10 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<172> = const { policy: ProfilePolicy::Static(0), }, ValueDefinition { - short: "xcr0_ia32_xss_bits_15_16", - description: "XCR0.IA32_XSS (bit 15 - 16) used for IA32_XSS", + short: "xcr0_ia32_xss_bits_hwp", + description: "XCR0.IA32_XSS (bit 16) used for HWP in IA32_XSS", bits_range: (16, 16), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. ValueDefinition { @@ -2616,7 +2616,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<172> = const { short: "xss_hwp", description: "HWP state, supported", bits_range: (16, 16), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xcr0_bits", @@ -3020,12 +3020,65 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<172> = const { policy: ProfilePolicy::Static(0), }]), ), + // Disable HWP for CPU profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-eax-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ebx-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + 
policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-ecx-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(16, 16), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-edx-hwp-zero", + description: "This leaf has been zeroed out because HWP state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), // NOTE: Sub-leaves 17 & 18 are AMX related and we will alter the adjustments corresponding to // the policy declared here at runtime for those values. ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(16, 63), + sub_leaf: RangeInclusive::new(17, 63), register: CpuidReg::EAX, }, ValueDefinitions::new(&[ValueDefinition { @@ -3038,7 +3091,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<172> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(16, 63), + sub_leaf: RangeInclusive::new(17, 63), register: CpuidReg::EBX, }, ValueDefinitions::new(&[ValueDefinition { @@ -3051,7 +3104,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<172> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(16, 63), + sub_leaf: RangeInclusive::new(17, 63), register: CpuidReg::ECX, }, ValueDefinitions::new(&[ From 11cd2cc2b10fafbbd5ec17877af3a234135a4d8d Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 17:28:28 +0100 Subject: [PATCH 168/178] arch: Disable PT state components We already disabled Processor Trace (PT) for CPU profiles, but forgot to disable the associated state components. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../cpu_profiles/sapphire-rapids.cpuid.json | 90 +++++++------ .../x86_64/cpu_profiles/skylake.cpuid.json | 90 +++++++------ arch/src/x86_64/cpuid_definitions/intel.rs | 121 ++++++++++++++++-- 3 files changed, 217 insertions(+), 84 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json index 522c5ce33c..fc024bfe7e 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -1182,13 +1182,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x8", - "end": "0x8" + "start": "0x5", + "end": "0x5" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00000440", "mask": "0x00000000" } ], @@ -1196,13 +1196,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0x6", + "end": "0x6" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000008", + "replacements": "0x00000480", "mask": "0x00000000" } ], @@ -1210,13 +1210,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", - "end": "0xa" + "start": "0x7", + "end": "0x7" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00000680", "mask": "0x00000000" } ], @@ -1227,10 +1227,10 @@ "start": "0x5", "end": "0x5" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000440", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1241,10 +1241,10 @@ "start": "0x6", "end": "0x6" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000480", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1255,10 +1255,10 @@ "start": "0x7", "end": "0x7" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000680", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1269,7 +1269,7 @@ "start": "0x8", 
"end": "0x8" }, - "register": "EBX" + "register": "EAX" }, { "replacements": "0x00000000", @@ -1280,13 +1280,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0x8", + "end": "0x8" }, "register": "EBX" }, { - "replacements": "0x00000a80", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1294,10 +1294,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", - "end": "0xa" + "start": "0x8", + "end": "0x8" }, - "register": "EBX" + "register": "ECX" }, { "replacements": "0x00000000", @@ -1308,10 +1308,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x5", - "end": "0x5" + "start": "0x8", + "end": "0x8" }, - "register": "ECX" + "register": "EDX" }, { "replacements": "0x00000000", @@ -1322,13 +1322,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x6", - "end": "0x6" + "start": "0x9", + "end": "0x9" }, - "register": "ECX" + "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00000008", "mask": "0x00000000" } ], @@ -1336,10 +1336,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x7", - "end": "0x7" + "start": "0xa", + "end": "0xa" }, - "register": "ECX" + "register": "EAX" }, { "replacements": "0x00000000", @@ -1350,10 +1350,24 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x8", - "end": "0x8" + "start": "0x9", + "end": "0x9" }, - "register": "ECX" + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EBX" }, { "replacements": "0x00000000", diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json index 4c08a64595..4e2e58232a 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -1238,13 +1238,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x8", - "end": "0x8" + "start": "0x5", + "end": "0x5" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": 
"0x00000000", + "replacements": "0x00000440", "mask": "0x00000000" } ], @@ -1252,13 +1252,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0x6", + "end": "0x6" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000008", + "replacements": "0x00000480", "mask": "0x00000000" } ], @@ -1266,13 +1266,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", - "end": "0xa" + "start": "0x7", + "end": "0x7" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00000680", "mask": "0x00000000" } ], @@ -1283,10 +1283,10 @@ "start": "0x5", "end": "0x5" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000440", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1297,10 +1297,10 @@ "start": "0x6", "end": "0x6" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000480", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1311,10 +1311,10 @@ "start": "0x7", "end": "0x7" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000680", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1325,7 +1325,7 @@ "start": "0x8", "end": "0x8" }, - "register": "EBX" + "register": "EAX" }, { "replacements": "0x00000000", @@ -1336,13 +1336,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0x8", + "end": "0x8" }, "register": "EBX" }, { - "replacements": "0x00000a80", + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1350,10 +1350,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", - "end": "0xa" + "start": "0x8", + "end": "0x8" }, - "register": "EBX" + "register": "ECX" }, { "replacements": "0x00000000", @@ -1364,10 +1364,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x5", - "end": "0x5" + "start": "0x8", + "end": "0x8" }, - "register": "ECX" + "register": "EDX" }, { "replacements": "0x00000000", @@ -1378,13 +1378,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x6", 
- "end": "0x6" + "start": "0x9", + "end": "0x9" }, - "register": "ECX" + "register": "EAX" }, { - "replacements": "0x00000000", + "replacements": "0x00000008", "mask": "0x00000000" } ], @@ -1392,10 +1392,10 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x7", - "end": "0x7" + "start": "0xa", + "end": "0xa" }, - "register": "ECX" + "register": "EAX" }, { "replacements": "0x00000000", @@ -1406,10 +1406,24 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x8", - "end": "0x8" + "start": "0x9", + "end": "0x9" }, - "register": "ECX" + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EBX" }, { "replacements": "0x00000000", diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 2fd575d252..2c5f97b386 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -39,7 +39,7 @@ use super::{ /// a few of the short names and descriptions to be more inline with what is written in the /// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every /// single [`ValueDefinition`] and manually appended those. 
-pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { CpuidDefinitions([ // ========================================================================================= // Basic CPUID Information @@ -2386,9 +2386,9 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { // MSR related ValueDefinition { short: "xcr0_ia32_xss", - description: "XCR0.IA32_XSS (bit 8) used for IA32_XSS", + description: "XCR0.IA32_XSS (bit 8) used for PT in IA32_XSS", bits_range: (8, 8), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xcr0_pkru", @@ -2568,7 +2568,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { short: "xss_pt", description: "PT state, supported", bits_range: (8, 8), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xcr0_bit9", @@ -2758,7 +2758,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 10), + sub_leaf: RangeInclusive::new(5, 7), register: CpuidReg::EAX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2771,7 +2771,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 10), + sub_leaf: RangeInclusive::new(5, 7), register: CpuidReg::EBX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2784,7 +2784,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(5, 10), + sub_leaf: RangeInclusive::new(5, 7), register: CpuidReg::ECX, }, ValueDefinitions::new(&[ @@ -2808,7 +2808,112 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<176> = const { }, ]), ), - // We leave CET out of CPU profiles for the time being + // Disable PT for CPU profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + 
register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-eax-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-ebx-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-ecx-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(8, 8), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-8-edx-pt-zero", + description: "This leaf has been zeroed out because PT state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(9, 10), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(9, 10), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(9, 10), + register: CpuidReg::ECX, + }, + 
ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), // We leave CET out of CPU profiles for the time being ( Parameters { leaf: 0xd, From 8086bdf08127814440d37b0d99ca370209946e6b Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 17:39:24 +0100 Subject: [PATCH 169/178] arch: Disable PASID state components We have already forbidden IA32_PASID, an MSR related to process address space identifiers (PASID), but we forgot to disable the associated state components. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../cpu_profiles/sapphire-rapids.cpuid.json | 32 ++++++--- .../x86_64/cpu_profiles/skylake.cpuid.json | 32 ++++++--- arch/src/x86_64/cpuid_definitions/intel.rs | 72 ++++++++++++++++--- 3 files changed, 109 insertions(+), 27 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json index fc024bfe7e..b0790bb426 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.cpuid.json @@ -1336,13 +1336,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", - "end": "0xa" + "start": "0x9", + "end": "0x9" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00000a80", "mask": "0x00000000" } ], @@ -1353,10 +1353,24 @@ "start": "0x9", "end": "0x9" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000a80", + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1378,8 +1392,8 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0xa", + "end": "0xa" }, "register": "ECX" }, @@ -1395,7 +1409,7 @@ "start": "0xa", "end": "0xa" }, - "register": "ECX" + "register": "EDX" }, { "replacements": "0x00000000", diff --git a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json index 4e2e58232a..bbe3ec73a8 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.cpuid.json +++ b/arch/src/x86_64/cpu_profiles/skylake.cpuid.json @@ -1392,13 +1392,13 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0xa", - "end": "0xa" + "start": "0x9", + "end": "0x9" }, - "register": "EAX" + "register": "EBX" }, { - "replacements": "0x00000000", + "replacements": "0x00000a80", "mask": 
"0x00000000" } ], @@ -1409,10 +1409,24 @@ "start": "0x9", "end": "0x9" }, - "register": "EBX" + "register": "ECX" }, { - "replacements": "0x00000a80", + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0xd", + "sub_leaf": { + "start": "0xa", + "end": "0xa" + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", "mask": "0x00000000" } ], @@ -1434,8 +1448,8 @@ { "leaf": "0xd", "sub_leaf": { - "start": "0x9", - "end": "0x9" + "start": "0xa", + "end": "0xa" }, "register": "ECX" }, @@ -1451,7 +1465,7 @@ "start": "0xa", "end": "0xa" }, - "register": "ECX" + "register": "EDX" }, { "replacements": "0x00000000", diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs index 2c5f97b386..61517e7e1b 100644 --- a/arch/src/x86_64/cpuid_definitions/intel.rs +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -39,7 +39,7 @@ use super::{ /// a few of the short names and descriptions to be more inline with what is written in the /// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every /// single [`ValueDefinition`] and manually appended those. 
-pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { +pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<187> = const { CpuidDefinitions([ // ========================================================================================= // Basic CPUID Information @@ -2397,10 +2397,10 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { policy: ProfilePolicy::Inherit, }, ValueDefinition { - short: "xcr0_ia32_xss_bits", - description: "XCR0.IA32_XSS (bit 10) used for IA32_XSS", + short: "xcr0_ia32_xss_pasid", + description: "XCR0.IA32_XSS (bit 10) used for PASID in IA32_XSS", bits_range: (10, 10), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xcr0_ia32_xss_cet", @@ -2580,7 +2580,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { short: "xss_pasid", description: "PASID state, supported", bits_range: (10, 10), - policy: ProfilePolicy::Inherit, + policy: ProfilePolicy::Static(0), }, ValueDefinition { short: "xss_cet_u", @@ -2864,7 +2864,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(9, 10), + sub_leaf: RangeInclusive::new(9, 9), register: CpuidReg::EAX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2877,7 +2877,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(9, 10), + sub_leaf: RangeInclusive::new(9, 9), register: CpuidReg::EBX, }, ValueDefinitions::new(&[ValueDefinition { @@ -2890,7 +2890,7 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { ( Parameters { leaf: 0xd, - sub_leaf: RangeInclusive::new(9, 10), + sub_leaf: RangeInclusive::new(9, 9), register: CpuidReg::ECX, }, ValueDefinitions::new(&[ @@ -2913,7 +2913,61 @@ pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<183> = const { policy: ProfilePolicy::Inherit, }, ]), - ), // We leave CET out of CPU profiles for the time 
being + ), + // Disable PASID for CPU profiles + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-eax-pasid-zero", + description: "This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-ebx-pasid-zero", + description: "This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-ecx-pasid-zero", + description: "This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(10, 10), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-10-edx-pasid-zero", + description: "This leaf has been zeroed out because PASID state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // We leave CET out of CPU profiles for the time being ( Parameters { leaf: 0xd, From fa9dc5f30e8d4e7823ddbbc08d3ec18a423f140a Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 18:49:13 +0100 Subject: [PATCH 170/178] arch: Disable VM_ENTRY_HARDWARE_EXCEPTIONS for CPU profiles Bit 56 of VM_ENTRY_HARDWARE_EXCEPTIONS in IA32_VMX_BASIC is only set on rather recent KVM versions. 
Thus whenever a CPU profile is generated on a machine with a recent Linux kernel, the current inherit policy will lead to the CPU profile being incompatible with deployments running older Linux kernels. This may not be the intention of the person generating the CPU profile, thus we change the policy to `Static(0)` for the time being. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/src/x86_64/msr_definitions/intel/msr_based_features.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs index bc36eb8581..c268229677 100644 --- a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -370,11 +370,13 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { bits_range: (55,55), policy: ProfilePolicy::Inherit }, + // This is only available for relatively recent kernels + // TODO: Revisit this policy ValueDefinition { short: "VM_ENTRY_HARDWARE_EXCEPTIONS", description: "If 1, then software can use VM entry to deliver a hardware exception", bits_range: (56, 56), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) } ]) ), From 19a662d8c61bf6fd2de31418669f3c922928637b Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 19:18:28 +0100 Subject: [PATCH 171/178] arch: Forbid IA32_XSS for non-host CPU profiles IA32_XSS (Extended Supervisor State Mask) is only reported via KVM_GET_MSR_INDEX_LIST on rather recent kernels. This can cause CPU profiles generated on a machine with the latest Linux kernel to not work with deployments whose hosts run somewhat older kernels, which may be unintentional. We thus decide to forbid this MSR for now, even though CPUID 0xd.0x1.EAX[3] can inform the guest that the MSR is available.
We do not want to force the aforementioned feature bit to 0 because it is also used to report support for XSAVES/XRSTORS. Although not ideal, we consider denying access to IA32_XSS to be acceptable because the 0xd CPUID leaves report all IA32_XSS related state components to be unsupported. There is thus no reason for the guest to be interested in using this MSR. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../intel/architectural_msrs.rs | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs index 6fd61cc3d2..af7b4e7cc0 100644 --- a/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs +++ b/arch/src/x86_64/msr_definitions/intel/architectural_msrs.rs @@ -429,13 +429,6 @@ mod permitted_architectural_msrs { const IA32_X2APIC_INIT_COUNT: u32 = 0x838; const IA32_X2APIC_DIV_CONF: u32 = 0x83e; - const IA32_XSS: u32 = 0xda0; - const _IA32_XSS_CPUID_CHECK: () = assert_not_denied_cpuid_feature::<3>(&Parameters { - leaf: 0xd, - sub_leaf: 1..=1, - register: CpuidReg::EAX, - }); - /// Extended Feature Enable const IA32_EFER: u32 = 0xc0000080; @@ -461,7 +454,7 @@ mod permitted_architectural_msrs { register: CpuidReg::ECX, }); - pub(super) const READ_WRITE_IA32_MSRS: [u32; 200] = [ + pub(super) const READ_WRITE_IA32_MSRS: [u32; 199] = [ IA32_TIME_STAMP_COUNTER, IA32_APIC_BASE, IA32_FEATURE_CONTROL, @@ -652,7 +645,6 @@ mod permitted_architectural_msrs { IA32_X2APIC_LVT_ERROR, IA32_X2APIC_INIT_COUNT, IA32_X2APIC_DIV_CONF, - IA32_XSS, IA32_EFER, IA32_STAR, IA32_LSTAR, @@ -705,8 +697,8 @@ mod permitted_architectural_msrs { /// /// The MSRs listed here can be studied further in Table 2.2 in Section 2.1 of the Intel SDM /// Vol. 
4 from October 2025 - pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 243] = const { - let mut permitted = [0u32; 243]; + pub(in crate::x86_64) const PERMITTED_IA32_MSRS: [u32; 242] = const { + let mut permitted = [0u32; 242]; let read_only_len = READ_ONLY_IA32_MSRS.len(); let write_only_len = WRITE_ONLY_IA32_MSRS.len(); let read_write_len = READ_WRITE_IA32_MSRS.len(); @@ -1105,6 +1097,21 @@ mod forbidden_architectural_msrs { const IA32_COPY_PLATFORM_TO_LOCAL: (u32, u32) = (0xd92, 0xd92); const IA32_PASID: (u32, u32) = (0xd93, 0xd93); + + /* + IA32_XSS is a bit problematic: Only newer kernels will report it via + KVM_GET_MSR_INDEX_LIST, but CPUID 0xd.0x1.EAX[3] reports that this MSR + exists. + + In order for CPU profiles generated with recent kernels to work with + deployments operating with older kernels, we decide to forbid this MSR + for now even though CPUID indicates that it is available to the guest. + + We consider this OK because we have disabled every single IA32_XSS + related state component in the 0xd CPUID leaves, hence there is no + reason for the guest to want to use this. + */ + const IA32_XSS: (u32, u32) = (0xda0, 0xda0); // Disabled via CPUID for non-host CPU profiles const IA32_PKG_HDC_CTL: (u32, u32) = (0xdb0, 0xdb0); @@ -1270,7 +1277,7 @@ mod forbidden_architectural_msrs { const IA32_UARCH_MISC_CTL: (u32, u32) = (0x1b01, 0x1b01); /// A list of ARCHITECTURAL MSR register addresses that are forbidden for all non-host CPU profiles and also not /// considered MSR-based FEATURE indices by KVM.
- pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 228] = [ + pub(in crate::x86_64) const FORBIDDEN_IA32_MSR_RANGES: [(u32, u32); 229] = [ IA32_P5_MC_ADDR, IA32_P5_MC_TYPE, // TODO: Not sure about IA32_P5_MC_ADDR & IA32_P5_MC_TYPE @@ -1520,6 +1527,7 @@ mod forbidden_architectural_msrs { // Disabled via CPUID for non-host CPU profiles IA32_COPY_PLATFORM_TO_LOCAL, IA32_PASID, + IA32_XSS, // Disabled via CPUID for non-host CPU profiles IA32_PKG_HDC_CTL, // Disabled via CPUID for non-host CPU profiles From b3abc68a379ffc7c2544e73ac904f4842d56ef44 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Tue, 24 Mar 2026 19:43:44 +0100 Subject: [PATCH 172/178] arch: Clear LBR related bits in the VM-Exit and VM-Entry CTL MSRs We have disabled LBR for non-host CPU profiles, but forgot to also do so in the VM-Exit and VM-Entry control MSRs. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- .../x86_64/cpu_profiles/sapphire-rapids.msr.json | 2 +- arch/src/x86_64/cpu_profiles/skylake.msr.json | 2 +- .../msr_definitions/intel/msr_based_features.rs | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json index cd2b7a5a97..c9b5d42089 100644 --- a/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.msr.json @@ -203,4 +203,4 @@ "0xc0000102", "0xc0000103" ] -} +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.msr.json b/arch/src/x86_64/cpu_profiles/skylake.msr.json index 5cc3398dc1..eceb91fcda 100644 --- a/arch/src/x86_64/cpu_profiles/skylake.msr.json +++ b/arch/src/x86_64/cpu_profiles/skylake.msr.json @@ -201,4 +201,4 @@ "0xc0000102", "0xc0000103" ] -} +} \ No newline at end of file diff --git a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs index 
c268229677..e5cb7b214d 100644 --- a/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs +++ b/arch/src/x86_64/msr_definitions/intel/msr_based_features.rs @@ -941,7 +941,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ZERO_CLEAR_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", bits_range: (26, 26), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ZERO_CLEAR_UINV", @@ -1082,7 +1082,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ONE_CLEAR_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", bits_range: (58, 58), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ONE_CLEAR_UINV", @@ -1215,7 +1215,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ZERO_LOAD_GUEST_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", bits_range: (21, 21), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ZERO_LOAD_PKRS", @@ -1335,7 +1335,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ONE_LOAD_GUEST_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. 
(Definitions of VM-Entry Controls)", bits_range: (53, 53), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ONE_LOAD_PKRS", @@ -3126,7 +3126,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ZERO_CLEAR_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", bits_range: (26, 26), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ZERO_CLEAR_UINV", @@ -3266,7 +3266,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ONE_CLEAR_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.7.1 Table 26-14 (Definitions of Primary VM-Exit Controls)", bits_range: (58, 58), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ONE_CLEAR_UINV", @@ -3398,7 +3398,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ZERO_LOAD_GUEST_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", bits_range: (21, 21), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ZERO_LOAD_PKRS", @@ -3518,7 +3518,7 @@ pub static INTEL_MSR_FEATURE_DEFINITIONS: MsrDefinitions<24> = const { short:"ALLOWED_ONE_LOAD_GUEST_IA32_LBR_CTL", description: "See Intel SDM Vol.3C Section 26.8.1 Table 26-17. (Definitions of VM-Entry Controls)", bits_range: (53, 53), - policy: ProfilePolicy::Inherit + policy: ProfilePolicy::Static(0) }, ValueDefinition { short:"ALLOWED_ONE_LOAD_PKRS", From 8e5bc8fc5f80024e6ba1838f425eb92cf3a2656f Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Mon, 23 Mar 2026 18:40:03 +0100 Subject: [PATCH 173/178] docs: CPU Profile generation We add developer documentation on how to use the CPU profile generation tool. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- docs/cpu_profile_generation.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 docs/cpu_profile_generation.md diff --git a/docs/cpu_profile_generation.md b/docs/cpu_profile_generation.md new file mode 100644 index 0000000000..3f62872576 --- /dev/null +++ b/docs/cpu_profile_generation.md @@ -0,0 +1,27 @@ +# CPU Profile Generation + +## Generating a CPU profile for a new target + +To generate a new CPU profile you start by executing the following command + +```shell +$ cargo run --release -p arch --bin generate-cpu-profile --features="cpu_profile_generation" "<profile-name>" +``` +on the machine you want to create a CPU profile for. This creates four new files in the `arch/src/x86_64/cpu_profiles` directory: +- `<profile-name>.cpuid.json` +- `<profile-name>.msr.json` +- one license file for each of the two files listed above + +Check them into git and then extend the `arch::x86_64::CpuProfile` enum with a new variant for your freshly generated profile. + +The final step is then to adapt `arch::x86_64::CpuProfile::cpuid_data` and `arch::x86_64::CpuProfile::msr_data` to load the +cpuid and msr JSON files we created above. After doing this you will of course have to rebuild cloud hypervisor in order to +use the new CPU profile. + +## Can existing CPU profiles be updated? + +More recent KVM versions may introduce more support for already existing hardware features. When this happens it is of course +tempting to run the CPU profile generation tool again with the new KVM version as we then get a profile supporting more CPU +functionality. Doing this without giving the CPU profile a new name is however a breaking change and thus not permitted. +Such PRs will **not be accepted**. Instead we encourage you to add a `V2` (or higher number if `V<N>` already exists) suffix +when generating the profile.
From 81fbb61618c6143f45c16198217fab3ba7d0819e Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 10 Apr 2026 14:22:45 +0200 Subject: [PATCH 174/178] build: flate2 Workspace dependency We will later use flate2 in arch/build.rs to compress CPU profile JSON files at compile time and also later to decompress them at runtime. Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index ad0549de1b..e45dc39d19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,6 +103,8 @@ dhat = "0.3.3" dirs = "6.0.0" env_logger = "0.11.10" epoll = "4.4.0" +# Used for (de-) compressing CPU profiles +flate2 = "1.1.9" flume = "0.12.0" itertools = "0.14.0" jiff = { version = "0.2.23", default-features = false, features = [ From 65785e9d81ca26c20a5ae50787894668f3e56288 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Thu, 9 Apr 2026 16:09:59 +0200 Subject: [PATCH 175/178] arch: Code generation for CPU profiles We introduce a build.rs build script in the arch crate which automatically constructs the x86_64 CpuProfile enum with one variant per pre-generated CPU profile. In order to keep the binary size in check we also take the opportunity to compress the CPU profile JSON files into the binary which then get decompressed at runtime. 
We will adapt cpu_profile.rs in the next commit to use the output of build.rs Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- Cargo.lock | 4 + arch/Cargo.toml | 7 ++ arch/build.rs | 253 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 arch/build.rs diff --git a/Cargo.lock b/Cargo.lock index 759e0045ba..7dac00e030 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -116,13 +116,17 @@ dependencies = [ "byteorder", "clap", "fdt", + "flate2", "hypervisor", "libc", "linux-loader", "log", + "prettyplease", "proptest", + "quote", "serde", "serde_json", + "syn", "thiserror", "uuid", "vm-fdt", diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 8e1f5c0a2b..6b438e52d8 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -40,6 +40,13 @@ vmm-sys-util = { workspace = true, features = ["with-serde"] } fdt_parser = { version = "0.1.5", package = "fdt" } vm-fdt = { workspace = true } +[build-dependencies] +anyhow = { workspace = true } +flate2 = { workspace = true } +prettyplease = "0.2.37" +quote = "1.0.45" +syn = "2.0.117" + # Use this to test our custom serialization logic [dev-dependencies] proptest = "1.0.0" diff --git a/arch/build.rs b/arch/build.rs new file mode 100644 index 0000000000..314128142f --- /dev/null +++ b/arch/build.rs @@ -0,0 +1,253 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::BTreeSet; +use std::ffi::OsStr; +use std::io::{Read, Write}; +use std::path::Path; +use std::{env, fs}; + +use anyhow::Context; +use flate2::Compression; +use flate2::write::ZlibEncoder; +use quote::{format_ident, quote}; + +/// This is where the CPU profile generation tool writes the JSON files associated with +/// a CPU profile. 
+const X86_64_CPU_PROFILES_PATH: &str = "./src/x86_64/cpu_profiles"; + +fn main() -> anyhow::Result<()> { + let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH") + .context("Could not get env var CARGO_CFG_TARGET_ARCH")?; + + if target_arch == "x86_64" { + generate_code_for_x86_64_cpu_profiles().context("CPU profile code generation failed")?; + // We only want the build script to be rerun if new CPU profiles are generated, or the + // build script itself changes (see the final println! before this function returns). + println!("cargo::rerun-if-changed={X86_64_CPU_PROFILES_PATH}"); + } + + // Emitting an explicit rerun-if-changed replaces Cargo's default of rerunning on any package change. + // See: https://doc.rust-lang.org/cargo/reference/build-scripts.html#rerun-if-changed + println!("cargo::rerun-if-changed=build.rs"); + Ok(()) +} + +/// This function generates the `generated_cpu_profiles.rs` file which consists of the following: +/// +/// - A `CpuProfile` enum with a `Host` variant and one additional variant per pre-generated CPU profile. +/// - A function `compressed_cpuid_data` that takes a `&CpuProfile` and returns the compressed CPUID adjustment data required for the given CPU profile. +/// - A function `compressed_msr_data` that takes a `&CpuProfile` and returns the compressed MSR adjustment data required for the given CPU profile. +/// +/// This function works by traversing the JSON files in `X86_64_CPU_PROFILES_PATH` generated by +/// the CPU profile generation tool.
+fn generate_code_for_x86_64_cpu_profiles() -> anyhow::Result<()> { + let out_dir = env::var_os("OUT_DIR").unwrap(); + let profile_names = x86_64_cpu_profile_names() + .context("Failed to extract CPU profile names from pre-generated JSON files")?; + // Compress each CPUID and MSR JSON file + compress_json_files(&profile_names, &out_dir) + .context("Failed to create compressed CPU profile data files")?; + + let mut out = generate_cpu_profile_enum(&profile_names); + out.push('\n'); + out.push_str(&generate_compressed_data_fn( + &profile_names, + DataType::Cpuid, + )); + out.push('\n'); + out.push_str(&generate_compressed_data_fn(&profile_names, DataType::Msr)); + + let generated_file_path = Path::new(&out_dir).join("generated_cpu_profiles.rs"); + let mut f = fs::File::create(&generated_file_path) + .with_context(|| format!("Could not create file with path:={generated_file_path:#?}"))?; + f.write_all(out.as_bytes()) + .with_context(|| format!("Could not write to file with path:={generated_file_path:#?}"))?; + Ok(()) +} + +/// The name of a pre-generated CPU profile. +/// +/// Each CPU profile has two associated JSON files: +/// +/// 1. .cpuid.json +/// 2. .msr.json +/// +/// and each instance of `ProfileName` is extracted from +/// ``. +struct ProfileName { + /// The `kebab_case` name converted to camel case. + camel_case: String, + kebab_case: String, +} + +/// Each CPU profile has two associated JSON files: +/// +/// one for CPUID adjustment data and one for MSR adjustment data. +#[derive(Copy, Clone)] +enum DataType { + Cpuid, + Msr, +} + +impl DataType { + fn as_str(&self) -> &str { + match self { + Self::Cpuid => "cpuid", + Self::Msr => "msr", + } + } +} + +/// Traverse the `X86_64_CPU_PROFILES_PATH` and extract a `[ProfileName]` per encountered +/// pre-generated CPU profile. 
+fn x86_64_cpu_profile_names() -> anyhow::Result> { + let dir = fs::read_dir(X86_64_CPU_PROFILES_PATH) + .with_context(|| format!("Could not read directory:={X86_64_CPU_PROFILES_PATH}"))?; + + let mut profile_names_kebab_case = BTreeSet::new(); + for entry in dir { + let file = entry.with_context(|| { + format!("Encountered error while traversing directory:={X86_64_CPU_PROFILES_PATH}") + })?; + let file_name = file.file_name().into_string().unwrap(); + let profile_name_kebab_case = { + let dot_pos = file_name + .find('.') + .expect("all files in the cpu_profiles directory should contain a '.' character"); + file_name[..dot_pos].to_string() + }; + profile_names_kebab_case.insert(profile_name_kebab_case); + } + + let profile_name_iter = profile_names_kebab_case.into_iter().map(|kebab_case| { + let mut camel_case = String::new(); + for part in kebab_case.split('-') { + if let Some(first_char) = part.chars().next() { + camel_case.extend(first_char.to_uppercase()); + let rest = &part[first_char.len_utf8()..]; + camel_case.push_str(rest); + } + } + ProfileName { + camel_case, + kebab_case, + } + }); + Ok(profile_name_iter.collect()) +} + +/// Compresses the CPUID and MSR related JSON files per CPU profile +/// that are found in `X86_64_CPU_PROFILES_PATH`. 
+fn compress_json_files(names: &[ProfileName], out_dir: &OsStr) -> anyhow::Result<()> { + for ProfileName { + kebab_case, + camel_case: _, + } in names + { + let file_bytes = |data_type: &str| -> anyhow::Result> { + let path = + Path::new(X86_64_CPU_PROFILES_PATH).join(format!("{kebab_case}.{data_type}.json")); + let mut file = fs::File::open(&path) + .with_context(|| format!("Could not open file with path:={path:#?}"))?; + let mut v = Vec::new(); + file.read_to_end(&mut v) + .with_context(|| format!("Could not read contents of file with path:={path:#?}"))?; + Ok(v) + }; + let cpuid_bytes = file_bytes("cpuid")?; + let msr_bytes = file_bytes("msr")?; + let compress_to_file = |data_type: &str, data: &[u8]| -> anyhow::Result<()> { + let path = Path::new(&out_dir).join(format!("{kebab_case}.{data_type}.zz")); + let file = fs::File::create(&path) + .with_context(|| format!("Could not create file with path:={path:#?}"))?; + let mut encoder = ZlibEncoder::new(file, Compression::best()); + encoder.write_all(data).with_context(|| { + format!("Could not write compressed bytes to file with path:={path:#?}") + })?; + encoder + .flush() + .with_context(|| format!("Could not flush to file with path:={path:#?}"))?; + Ok(()) + }; + compress_to_file(DataType::Cpuid.as_str(), &cpuid_bytes)?; + compress_to_file(DataType::Msr.as_str(), &msr_bytes)?; + } + + Ok(()) +} + +/// Generates Rust code as a String defining a `CpuProfile` enum with a `Host` variant +/// together with a variant per entry in `profile_names`. +fn generate_cpu_profile_enum(profile_names: &[ProfileName]) -> String { + // Obtain a vector of the non-host CPU profile enum variants from the previously parsed camel case names + let non_host_enum_variants = non_host_cpu_profile_variants(profile_names); + + // Use the quote crate to build the CpuProfile enum as a TokenStream. + let tokens = quote! 
{ + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize, Default)] + pub enum CpuProfile { + #[default] + Host, + #(#non_host_enum_variants),* + } + }; + + // Parse this to a syntax tree and return it and convert it to a pretty printed string of Rust code + let syntax_tree = syn::parse2(tokens).unwrap(); + prettyplease::unparse(&syntax_tree) +} + +/// Generates the function that extracts the compressed bytes for `data_type` corresponding to the user's +/// selected CPU profile. +fn generate_compressed_data_fn(profile_names: &[ProfileName], data_type: DataType) -> String { + let data_type_str = data_type.as_str(); + let doc_str = format!( + "Extract compressed {data_type_str} CPU profile data corresponding to the given profile" + ); + let non_host_enum_variants = non_host_cpu_profile_variants(profile_names); + let compressed_file_names: Vec = profile_names + .iter() + .map( + |ProfileName { + kebab_case, + camel_case: _, + }| format!("/{kebab_case}.{data_type_str}.zz"), + ) + .collect(); + + // Workaround to interpolate `data_type_str` in the function name within a `quote!` invocation. + let fn_name_ident = format_ident!("compressed_{data_type_str}_data"); + + // We now use quote! to produce our function that matches against each enum variant and returns the compressed file as a byte slice. + // + // Note that the compressed bytes are no longer stand alone files after compiling since we will use `include_bytes!` to compile them + // into the final binary. + let tokens = quote! 
{ + #[doc=#doc_str] + fn #fn_name_ident (profile: &CpuProfile) -> Option<&'static [u8]> { + use CpuProfile::*; + match profile { + Host => None, + #(#non_host_enum_variants => Some(&include_bytes!(concat!(env!("OUT_DIR"), #compressed_file_names))[..])),* + } + } + }; + + // Parse this to a syntax tree and return it and convert it to a pretty printed string of Rust code + let syntax_tree = syn::parse2(tokens).unwrap(); + prettyplease::unparse(&syntax_tree) +} + +/// Converts the parsed CPU profile names to enum variants that may be placed into a token stream. +fn non_host_cpu_profile_variants(names: &[ProfileName]) -> Vec { + names + .iter() + .map(|name| { + syn::parse_str(name.camel_case.as_str()) + .expect("Should be able to parse camelcase name to syn::Variant") + }) + .collect() +} From 37239a651f5908cab79e7bacbc2f3f2d45d54bde Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 10 Apr 2026 09:37:44 +0200 Subject: [PATCH 176/178] arch: Update cpu_profile.rs to include code generation from build.rs Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/Cargo.toml | 3 + arch/src/x86_64/cpu_profile.rs | 170 +++++++++++++++++++-------------- 2 files changed, 103 insertions(+), 70 deletions(-) diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 6b438e52d8..a0b8fdb5df 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -36,6 +36,9 @@ uuid = { workspace = true } vm-memory = { workspace = true, features = ["backend-bitmap", "backend-mmap"] } vmm-sys-util = { workspace = true, features = ["with-serde"] } +[target.'cfg(target_arch = "x86_64")'.dependencies] +flate2 = { workspace = true } + [target.'cfg(any(target_arch = "aarch64", target_arch = "riscv64"))'.dependencies] fdt_parser = { version = "0.1.5", package = "fdt" } vm-fdt = { workspace = true } diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs index de9fcc908d..820c296b32 100644 --- a/arch/src/x86_64/cpu_profile.rs +++ b/arch/src/x86_64/cpu_profile.rs
@@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::Write; +use std::io::{Read, Write}; +use flate2::read::ZlibDecoder; use hypervisor::arch::x86::{CpuIdEntry, MsrEntry}; use hypervisor::{CpuVendor, HypervisorType}; use log::error; @@ -17,18 +18,14 @@ use crate::x86_64::CpuidReg; use crate::x86_64::cpuid_definitions::Parameters; use crate::x86_64::msr_definitions::RegisterAddress; -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] -#[serde(rename_all = "kebab-case")] -/// A [`CpuProfile`] is a mechanism for ensuring live migration compatibility -/// between host's with potentially different CPU models. -pub enum CpuProfile { - #[default] - Host, - #[cfg(feature = "kvm")] - Skylake, - #[cfg(feature = "kvm")] - SapphireRapids, -} +// build.rs generates a `CpuProfile` enum with one variant for each +// CPU profile in arch/src/x86_64/cpu_profiles, plus the default +// `Host` variant. +// +// Furthermore, the build script generates the functions +// `compressed_cpuid_data` and `compressed_msr_data` for obtaining the +// compressed JSON data associated with the given CPU profile. +include!(concat!(env!("OUT_DIR"), "/generated_cpu_profiles.rs")); impl CpuProfile { /// Loads pre-generated CPUID data associated with a CPU profile. @@ -39,26 +36,18 @@ impl CpuProfile { /// obtained from the hypervisor. // // We can only generate CPU profiles for the KVM hypervisor for the time being. - #[cfg(feature = "kvm")] pub(in crate::x86_64) fn cpuid_data(&self, amx: bool) -> Option { - let mut data: CpuIdProfileData = match self { - Self::Host => None, - Self::Skylake => Some( - serde_json::from_slice(include_bytes!("cpu_profiles/skylake.cpuid.json")) - .inspect_err(|e| { - error!("BUG: could not deserialize CPU profile.
Got error: {e:?}"); - }) - .expect("should be able to deserialize pre-generated data"), - ), - Self::SapphireRapids => Some( - serde_json::from_slice(include_bytes!("cpu_profiles/sapphire-rapids.cpuid.json")) - .inspect_err(|e| { - error!("BUG: could not deserialize CPU profile. Got error: {e:?}"); - }) - .expect("should be able to deserialize pre-generated data"), - ), - }?; - + const ESTIMATED_CPUID_CPU_PROFILE_DATA_COMPRESSION_RATIO: usize = 32; + + // The compressed_cpuid_data function is generated by build.rs + let compressed: &[u8] = compressed_cpuid_data(self)?; + let mut data: CpuIdProfileData = { + serde_json::from_slice(&Self::decompress_cpu_profile_data( + compressed, + ESTIMATED_CPUID_CPU_PROFILE_DATA_COMPRESSION_RATIO, + )) + .expect("Should be able to deserialize CPU profile CPUID data") + }; if !amx { // In this case we will need to wipe out the AMX tile state components (if they are included in the profile) for adj in data.adjustments.iter_mut() { @@ -84,45 +73,34 @@ impl CpuProfile { Some(data) } - #[cfg(not(feature = "kvm"))] - pub(in crate::x86_64) fn cpuid_data(&self, _amx: bool) -> Option { - if matches!(*self, Self::Host) { - return None; - } - // This will need to be addressed before upstreaming. - // We will probably need one profile per hypervisor. - unreachable!() - } - /// Loads pre-generated MSR data associated with a CPU profile. - #[cfg(feature = "kvm")] pub(in crate::x86_64) fn msr_data(&self) -> Option { - match self { - Self::Host => None, - Self::Skylake => Some( - serde_json::from_slice(include_bytes!("cpu_profiles/skylake.msr.json")) - .inspect_err(|e| { - error!("BUG: could not deserialize CPU profile. Got error: {e:?}"); - }) - .expect("should be able to deserialize pre-generated data"), - ), - Self::SapphireRapids => Some( - serde_json::from_slice(include_bytes!("cpu_profiles/sapphire-rapids.msr.json")) - .inspect_err(|e| { - error!("BUG: could not deserialize CPU profile. 
Got error: {e:?}"); - }) - .expect("should be able to deserialize pre-generated data"), - ), - } + const ESTIMATED_MSR_CPU_PROFILE_DATA_COMPRESSION_RATIO: usize = 4; + + // compressed_msr_data is created by build.rs + let compressed: &[u8] = compressed_msr_data(self)?; + serde_json::from_slice(&Self::decompress_cpu_profile_data( + compressed, + ESTIMATED_MSR_CPU_PROFILE_DATA_COMPRESSION_RATIO, + )) + .expect("Should be able to deserialize CPU profile MSR data") } - #[cfg(not(feature = "kvm"))] - pub(in crate::x86_64) fn msr_data(&self) -> Option { - if matches!(*self, Self::Host) { - return None; - } - // CPU profiles are currently only available when using KVM as the hypervisor. - unreachable!() + /// Decompress the `compressed` byte slice. + /// + /// The estimated compression ratio is only used to pre-size the output buffer (reducing reallocations) + /// and does not have to be accurate. + fn decompress_cpu_profile_data( + compressed: &[u8], + estimated_compression_ratip: usize, + ) -> Vec { + let mut decoder = ZlibDecoder::new(compressed); + // Pre-size the buffer from the estimate; `read_to_end` grows it if the estimate is too low + let mut v = Vec::with_capacity(compressed.len() * estimated_compression_ratip); + decoder + .read_to_end(&mut v) + .expect("Should be able to decompress CPU profile data"); + v } } @@ -132,7 +110,7 @@ impl CpuProfile { /// the `cpu-profile-generation` feature) which other hosts may then attempt to load in order to /// increase the likelihood of successful live migrations among all hosts that opted in to the given /// CPU profile. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] #[allow(dead_code)] pub struct CpuIdProfileData { /// The hypervisor used when generating this CPU profile.
@@ -295,7 +273,7 @@ impl CpuidOutputRegisterAdjustments { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Eq, PartialEq)] pub(in crate::x86_64) struct FeatureMsrAdjustment { pub(in crate::x86_64) mask: u64, pub(in crate::x86_64) replacements: u64, @@ -403,7 +381,7 @@ pub struct RequiredMsrUpdates { /// the `cpu-profile-generation` feature) which other hosts may then attempt to load in order to /// increase the likelihood of successful live migrations among all hosts that opted in to the given /// CPU profile. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub(in crate::x86_64) struct MsrProfileData { pub(in crate::x86_64) cpu_vendor: CpuVendor, pub(in crate::x86_64) hypervisor_type: HypervisorType, @@ -424,6 +402,9 @@ mod tests { use proptest::prelude::*; use super::CpuidOutputRegisterAdjustments; + use crate::CpuProfile; + #[cfg(feature = "kvm")] + use crate::x86_64::cpu_profile::{CpuIdProfileData, MsrProfileData}; // Check that serializing and then deserializing `CpuidOutputResiterAdjustments` results in the same value we started with. // @@ -460,4 +441,53 @@ mod tests { check_str_invariants(mask_str)?; } } + + #[test] + fn cpu_profile_host_loads_no_data() { + assert_eq!(CpuProfile::Host.cpuid_data(true), None); + assert_eq!(CpuProfile::Host.cpuid_data(false), None); + assert_eq!(CpuProfile::Host.msr_data(), None); + } + + /// Check that the `CpuProfile::cpuid_data` and `CpuProfile::msr_data` methods + /// coincide with direct deserialization for the `sapphire-rapids` profile. + #[cfg(feature = "kvm")] + #[test] + fn cpu_profile_loading_sapphire_rapids() { + // Now check that the methods coincide with direct deserialization. For the + // Sapphire Rapids profile this should be the case when `amx` is enabled. 
+ let profile = CpuProfile::SapphireRapids; + let cpuid_data = profile.cpuid_data(true).unwrap(); + let deserialized_cpuid_data: CpuIdProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/sapphire-rapids.cpuid.json")) + .unwrap(); + + assert_eq!(cpuid_data, deserialized_cpuid_data); + + let msr_data = profile.msr_data().unwrap(); + let deserialized_msr_data: MsrProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/sapphire-rapids.msr.json")) + .unwrap(); + assert_eq!(msr_data, deserialized_msr_data); + } + + /// Check that the `CpuProfile::cpuid_data` and `CpuProfile::msr_data` methods + /// coincide with direct deserialization for the `skylake` profile. + #[cfg(feature = "kvm")] + #[test] + fn cpu_profile_loading_skylake() { + // Check that the methods coincide with direct deserialization. Passing + // `amx = true` skips the AMX adjustment step, so the data is returned unmodified. + let profile = CpuProfile::Skylake; + let cpuid_data = profile.cpuid_data(true).unwrap(); + let deserialized_cpuid_data: CpuIdProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/skylake.cpuid.json")).unwrap(); + + assert_eq!(cpuid_data, deserialized_cpuid_data); + + let msr_data = profile.msr_data().unwrap(); + let deserialized_msr_data: MsrProfileData = + serde_json::from_slice(include_bytes!("./cpu_profiles/skylake.msr.json")).unwrap(); + assert_eq!(msr_data, deserialized_msr_data); + } } From b88d1b9ab82b7799e0e07b086ac40d7a9a174c4f Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Fri, 10 Apr 2026 08:55:54 +0200 Subject: [PATCH 177/178] docs: Update CPU profile generation developer documentation Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- docs/cpu_profile_generation.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/cpu_profile_generation.md b/docs/cpu_profile_generation.md index 3f62872576..ccd7b47f70 100644 --- a/docs/cpu_profile_generation.md +++
b/docs/cpu_profile_generation.md @@ -2,21 +2,20 @@ ## Generating a CPU profile for a new target -To generate a new CPU profile you start by executing the following command +To generate a new CPU profile you execute the following command ```shell $ cargo run --release -p arch --bin generate-cpu-profile --features="cpu_profile_generation" "" ``` on the machine you want to create a CPU profile for. This creates four new files in the `arch/src/x86_64/cpu_profiles` directory: -- `.cpuid.json` -- `.msr.json` +- `.cpuid.json` +- `.msr.json` - one license file for each of the two files listed above -check them in to git and then extend the `arch::x86_64::CpuProfile` enum with a new variant for your freshly generated profile. +check them in to git and then simply rebuild cloud-hypervisor `cargo build --release --bin cloud-hypervisor`. -The final step is then to adapt `arch::x86_64::CpuProfile::cpuid_data` and `arch::x86_64::CpuProfile::msr_data` to load the -cpuid and msr JSON files we created above. After doing this you will of course have to rebuild cloud hypervisor in order to -use the new CPU profile. +You can now use the new profile by adding `,profile=` to the list of `--cpus` configuration +options on the command line. ## Can existing CPU profiles be updated? From 8a112737e90a9c5d0e99ae98c8134e5573a37172 Mon Sep 17 00:00:00 2001 From: Oliver Anderson Date: Mon, 13 Apr 2026 14:53:37 +0200 Subject: [PATCH 178/178] arch: Deserialize CPU profiles in kebab-case When we introduced our build script we forgot to tell `serde` to (de-) serialize the `CpuProfile` enum in kebab-case which is a breaking change. 
Signed-off-by: Oliver Anderson On-behalf-of: SAP oliver.anderson@sap.com --- arch/build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/build.rs b/arch/build.rs index 314128142f..01170f7289 100644 --- a/arch/build.rs +++ b/arch/build.rs @@ -188,6 +188,7 @@ fn generate_cpu_profile_enum(profile_names: &[ProfileName]) -> String { // Use the quote crate to build the CpuProfile enum as a TokenStream. let tokens = quote! { #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize, Default)] + #[serde(rename_all = "kebab-case")] pub enum CpuProfile { #[default] Host,