diff --git a/openvmm/membacking/src/mapping_manager/manager.rs b/openvmm/membacking/src/mapping_manager/manager.rs index 38cef87dd6..4ea083a33b 100644 --- a/openvmm/membacking/src/mapping_manager/manager.rs +++ b/openvmm/membacking/src/mapping_manager/manager.rs @@ -218,6 +218,8 @@ pub struct MappingParams { /// that external consumers (vhost-user backends, etc.) can share the /// backing memory. pub dma_target: bool, + /// Host NUMA node for this mapping. `None` means OS default placement. + pub numa_node: Option, } struct Mappers { @@ -440,6 +442,7 @@ mod tests { file_offset: 0, writable: true, dma_target: true, + numa_node: None, }) .await; @@ -450,6 +453,7 @@ mod tests { file_offset: 0, writable: true, dma_target: false, + numa_node: None, }) .await; @@ -481,6 +485,7 @@ mod tests { file_offset: 0, writable: true, dma_target: false, + numa_node: None, }) .await; diff --git a/openvmm/membacking/src/mapping_manager/va_mapper.rs b/openvmm/membacking/src/mapping_manager/va_mapper.rs index d3c8a7ea85..087712f12e 100644 --- a/openvmm/membacking/src/mapping_manager/va_mapper.rs +++ b/openvmm/membacking/src/mapping_manager/va_mapper.rs @@ -120,22 +120,63 @@ impl MapperTask { mappable, writable, file_offset, + numa_node, .. }) => { tracing::debug!(%range, "mapping received for range"); - self.inner - .mapping - .map_file( - range.start() as usize, - range.len() as usize, - &mappable, - file_offset, - writable, - ) - .expect("oom mapping file"); - - self.wake_waiters(range, Some(writable)); + let map_result = { + #[cfg(unix)] + { + self.inner.mapping.map_file( + range.start() as usize, + range.len() as usize, + &mappable, + file_offset, + writable, + ) + } + #[cfg(windows)] + { + self.inner.mapping.map_file_numa( + range.start() as usize, + range.len() as usize, + &mappable, + file_offset, + writable, + numa_node, + ) + } + }; + + match map_result { + Ok(()) => { + #[cfg(target_os = "linux")] + if let Some(node) = numa_node { + if let Err(e) = self.inner.mapping.mbind_at( + range.start() as usize, + range.len() as usize, + node, + ) { + tracing::error!( + error = &e as &dyn std::error::Error, + %range, + node, + "NUMA binding failed, using default placement" + ); + } + } + self.wake_waiters(range, Some(writable)); + } + Err(e) => { + tracing::error!( + error = &e as &dyn std::error::Error, + %range, + "failed to map file for range" + ); + self.wake_waiters(range, None); + } + } } MapperRequest::NoMapping(range) => { // Wake up waiters. They'll see a failure when they try to @@ -290,12 +331,30 @@ impl VaMapper { self.process.as_ref() } - /// Allocates private anonymous memory for a range within the mapping. + /// Allocates private anonymous memory for a range within the mapping, + /// optionally bound to a specific host NUMA node. /// /// This replaces the placeholder at the given offset with committed /// anonymous memory. - pub(crate) fn alloc_range(&self, offset: usize, len: usize) -> Result<(), std::io::Error> { - self.inner.mapping.alloc(offset, len) + pub(crate) fn alloc_range( + &self, + offset: usize, + len: usize, + numa_node: Option, + ) -> Result<(), std::io::Error> { + #[cfg(windows)] + { + self.inner.mapping.alloc_numa(offset, len, numa_node) + } + #[cfg(unix)] + { + self.inner.mapping.alloc(offset, len)?; + #[cfg(target_os = "linux")] + if let Some(node) = numa_node { + self.inner.mapping.mbind_at(offset, len, node)?; + } + Ok(()) + } } /// Names a range within the mapping for debugging (visible in smaps). diff --git a/openvmm/membacking/src/memory_manager/device_memory.rs b/openvmm/membacking/src/memory_manager/device_memory.rs index 46dd7fb40b..8e0f7643e5 100644 --- a/openvmm/membacking/src/memory_manager/device_memory.rs +++ b/openvmm/membacking/src/memory_manager/device_memory.rs @@ -116,6 +116,7 @@ impl MappedMemoryRegion for DeviceMemoryRegion { new_mapping.mappable.clone(), new_mapping.file_offset, new_mapping.writable, + None, )); } state.mappings.push(new_mapping); @@ -173,6 +174,7 @@ impl MappableGuestMemory for DeviceMemoryControl { mapping.mappable.clone(), mapping.file_offset, mapping.writable, + None, ) .await; } diff --git a/openvmm/membacking/src/memory_manager/mod.rs b/openvmm/membacking/src/memory_manager/mod.rs index 6db181adc1..0297ccf274 100644 --- a/openvmm/membacking/src/memory_manager/mod.rs +++ b/openvmm/membacking/src/memory_manager/mod.rs @@ -68,6 +68,8 @@ struct RamBacking { /// THP is enabled for this backing. #[cfg_attr(not(target_os = "linux"), expect(dead_code))] transparent_hugepages: bool, + /// Host NUMA node for this backing. `None` means OS default placement. + host_numa_node: Option, } #[derive(Debug)] @@ -193,6 +195,7 @@ pub struct RamBackingRequest { hugepages: bool, hugepage_size: Option, existing_mappable: Option, + host_numa_node: Option, } impl RamBackingRequest { @@ -209,6 +212,7 @@ impl RamBackingRequest { hugepages: false, hugepage_size: None, existing_mappable: None, + host_numa_node: None, } } @@ -244,6 +248,13 @@ impl RamBackingRequest { self.existing_mappable = Some(mappable); self } + + /// Bind this backing's memory to a specific host NUMA node + /// (Linux: `mbind(MPOL_BIND)`, Windows: `MemExtendedParameterNumaNode`). + pub fn host_numa_node(mut self, node: Option) -> Self { + self.host_numa_node = node; + self + } } fn validate_hugepage_size(size: u64) -> Result { @@ -441,6 +452,7 @@ impl GuestMemoryBuilder { ranges: req.ranges, prefetch: req.prefetch, transparent_hugepages: req.transparent_hugepages, + host_numa_node: req.host_numa_node, }); continue; } @@ -479,11 +491,13 @@ impl GuestMemoryBuilder { .into() } }; + backings.push(RamBacking { mappable: Some(mappable), ranges: req.ranges, prefetch: req.prefetch, transparent_hugepages: false, + host_numa_node: req.host_numa_node, }); } @@ -549,11 +563,20 @@ impl GuestMemoryBuilder { mappable.clone(), file_offset, true, + backing.host_numa_node, ) .await; + // TODO: file-backed RAM mappings are established lazily + // via page faults, so NUMA binding errors are not + // caught here. Replace lazy mapping with eager push + // model to propagate errors at build time. } else { va_mapper - .alloc_range(sub_range.start() as usize, sub_range.len() as usize) + .alloc_range( + sub_range.start() as usize, + sub_range.len() as usize, + backing.host_numa_node, + ) .map_err(|e| MemoryBuildError::PrivateRamAlloc(e, *sub_range))?; va_mapper.set_range_name( sub_range.start() as usize, diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 61dfecf0fe..595c4c16c4 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -271,6 +271,7 @@ struct RegionMappingParams { mappable: Mappable, file_offset: u64, writable: bool, + numa_node: Option, } fn range_within(outer: MemoryRange, inner: MemoryRange) -> MemoryRange { @@ -589,6 +590,7 @@ impl RegionManagerTask { file_offset: params.file_offset, writable: params.writable, dma_target: region.params.dma_target, + numa_node: params.numa_node, }) .await; @@ -684,6 +686,7 @@ impl RegionManagerTaskInner { file_offset: mapping.params.file_offset, writable: mapping.params.writable && map_params.writable, dma_target: region.params.dma_target, + numa_node: mapping.params.numa_node, }) .await; } @@ -917,6 +920,7 @@ impl RegionHandle { mappable: Mappable, file_offset: u64, writable: bool, + numa_node: Option, ) { let _ = self .req_send @@ -929,6 +933,7 @@ impl RegionHandle { mappable, file_offset, writable, + numa_node, }, ), ) @@ -1127,6 +1132,7 @@ mod tests { mappable: self.mappable.clone(), file_offset: 0, writable: true, + numa_node: None, }, ) .await; diff --git a/support/sparse_mmap/src/lib.rs b/support/sparse_mmap/src/lib.rs index 1506195017..a2b9feb115 100644 --- a/support/sparse_mmap/src/lib.rs +++ b/support/sparse_mmap/src/lib.rs @@ -378,4 +378,75 @@ mod tests { ); } } + + #[test] + #[cfg(any(target_os = "linux", windows))] + fn test_alloc_numa_node0() { + let page_size = SparseMapping::page_size(); + let size = 4 * page_size; + let mapping = SparseMapping::new(size).unwrap(); + + // Allocate with NUMA node 0 (always present). + #[cfg(unix)] + { + mapping.alloc(0, size).unwrap(); + mapping.mbind_at(0, size, 0).unwrap(); + } + #[cfg(windows)] + mapping.alloc_numa(0, size, Some(0)).unwrap(); + + // Memory should be accessible and writable. + let pattern = vec![0xABu8; page_size]; + mapping.write_at(0, &pattern).unwrap(); + let mut buf = vec![0u8; page_size]; + mapping.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, pattern); + } + + #[test] + #[cfg(any(target_os = "linux", windows))] + fn test_map_file_numa_node0() { + let page_size = SparseMapping::page_size(); + let size = 4 * page_size; + let mapping = SparseMapping::new(size).unwrap(); + let shmem = alloc_shared_memory(size, "test-numa").unwrap(); + + // Map with NUMA node 0 (always present). + #[cfg(unix)] + { + mapping.map_file(0, size, &shmem, 0, true).unwrap(); + mapping.mbind_at(0, size, 0).unwrap(); + } + #[cfg(windows)] + mapping + .map_file_numa(0, size, &shmem, 0, true, Some(0)) + .unwrap(); + + // Memory should be accessible and writable. + let pattern = vec![0xCDu8; page_size]; + mapping.write_at(0, &pattern).unwrap(); + let mut buf = vec![0u8; page_size]; + mapping.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, pattern); + } + + #[test] + #[cfg(any(target_os = "linux", windows))] + fn test_alloc_numa_invalid_node() { + let page_size = SparseMapping::page_size(); + let mapping = SparseMapping::new(page_size).unwrap(); + + // A very large NUMA node number should fail with an error (not panic). + #[cfg(unix)] + { + mapping.alloc(0, page_size).unwrap(); + let result = mapping.mbind_at(0, page_size, 99999); + assert!(result.is_err()); + } + #[cfg(windows)] + { + let result = mapping.alloc_numa(0, page_size, Some(99999)); + assert!(result.is_err()); + } + } } diff --git a/support/sparse_mmap/src/unix.rs b/support/sparse_mmap/src/unix.rs index 88374fe1d5..b0605b7ee7 100644 --- a/support/sparse_mmap/src/unix.rs +++ b/support/sparse_mmap/src/unix.rs @@ -279,6 +279,18 @@ impl SparseMapping { } } + /// Calls `mbind(MPOL_BIND)` on a range within this mapping, binding + /// pages to a specific host NUMA node. + /// + /// The range at `offset..offset+len` must already be mapped (via + /// `alloc`, `map_file`, etc.) before calling this. + #[cfg(target_os = "linux")] + pub fn mbind_at(&self, offset: usize, len: usize, numa_node: u32) -> Result<(), Error> { + // SAFETY: the caller has mapped this range within the + // SparseMapping, so `self.address + offset` is valid for `len` bytes. + unsafe { mbind_range(self.address.add(offset), len, numa_node) } + } + /// Maps memory into the mapping, passing parameters through to the mmap /// syscall. /// @@ -562,3 +574,46 @@ pub fn alloc_shared_memory_hugetlb( "hugetlb shared memory is only supported on Linux", )) } + +/// Calls `mbind(MPOL_BIND)` on an already-mapped virtual address range, +/// binding it to a specific host NUMA node. +/// +/// # Safety +/// +/// `addr` must point to a valid mapped region of at least `len` bytes. +#[cfg(target_os = "linux")] +unsafe fn mbind_range(addr: *mut c_void, len: usize, numa_node: u32) -> io::Result<()> { + // Build nodemask bitmask. The kernel expects an array of unsigned long with + // bit `numa_node` set. + // + // maxnode should be the number of bits in the nodemask, but the kernel's + // get_nodes() has an off-by-one: it decrements maxnode before use, so we + // must pass numa_node + 2 instead of numa_node + 1. This is a known kernel + // bug since 2004 that will not be fixed (ABI). See + // + let maxnode = numa_node as usize + 2; + let word_bits = libc::c_ulong::BITS as usize; + let num_words = maxnode.div_ceil(word_bits); + let mut nodemask = vec![0 as libc::c_ulong; num_words]; + nodemask[numa_node as usize / word_bits] = 1 << (numa_node as usize % word_bits); + + // Use flags = 0: just set the NUMA policy for future page faults. + // The memory was just mapped, so there are no resident pages to move. + let result = unsafe { + libc::syscall( + libc::SYS_mbind, + addr, + len, + libc::MPOL_BIND, + nodemask.as_ptr(), + maxnode, + 0, + ) + }; + + if result == -1 { + return Err(Error::last_os_error()); + } + + Ok(()) +} diff --git a/support/sparse_mmap/src/windows.rs b/support/sparse_mmap/src/windows.rs index bb234926bc..c0b39dfb67 100644 --- a/support/sparse_mmap/src/windows.rs +++ b/support/sparse_mmap/src/windows.rs @@ -71,9 +71,16 @@ unsafe fn virtual_alloc( size: usize, allocation_type: u32, page_protection: u32, - extended_parameters: *mut Memory::MEM_EXTENDED_PARAMETER, - parameter_count: u32, + extended_parameters: &mut [Memory::MEM_EXTENDED_PARAMETER], ) -> Result<*mut c_void, Error> { + let (params_ptr, params_count) = if extended_parameters.is_empty() { + (null_mut(), 0) + } else { + ( + extended_parameters.as_mut_ptr(), + extended_parameters.len() as u32, + ) + }; let address = unsafe { VirtualAlloc2( process.handle(), @@ -81,8 +88,8 @@ unsafe fn virtual_alloc( size, allocation_type, page_protection, - extended_parameters, - parameter_count, + params_ptr, + params_count, ) }; if address.is_null() { @@ -111,7 +118,16 @@ unsafe fn map_view_of_file( view_size: usize, allocation_type: u32, page_protection: u32, + extended_parameters: &mut [Memory::MEM_EXTENDED_PARAMETER], ) -> Result<*mut c_void, Error> { + let (params_ptr, params_count) = if extended_parameters.is_empty() { + (null_mut(), 0) + } else { + ( + extended_parameters.as_mut_ptr(), + extended_parameters.len() as u32, + ) + }; let address = unsafe { MapViewOfFile3( file_mapping, @@ -121,8 +137,8 @@ unsafe fn map_view_of_file( view_size, allocation_type, page_protection, - null_mut(), - 0, + params_ptr, + params_count, ) } .Value; @@ -132,6 +148,16 @@ unsafe fn map_view_of_file( Ok(address) } +/// Returns a NUMA node `MEM_EXTENDED_PARAMETER`, if a node is specified. +fn numa_extended_param(numa_node: Option) -> Option { + let node = numa_node?; + // SAFETY: MEM_EXTENDED_PARAMETER is a C union struct; zeroing is valid. + let mut param: Memory::MEM_EXTENDED_PARAMETER = unsafe { std::mem::zeroed() }; + param.Anonymous1._bitfield = Memory::MemExtendedParameterNumaNode as u64 & 0xff; + param.Anonymous2.ULong = node; + Some(param) +} + unsafe fn unmap_view_of_file( process: Option<&Process>, address: *mut c_void, @@ -339,8 +365,7 @@ impl SparseMapping { len, MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, PAGE_NOACCESS, - null_mut(), - 0, + &mut [], )?; Ok(Self { address, @@ -388,12 +413,23 @@ impl SparseMapping { /// Allocates private, writable memory at the given offset within the /// mapping. pub fn alloc(&self, offset: usize, len: usize) -> Result<(), Error> { - self.virtual_alloc(offset, len, PAGE_READWRITE) + self.virtual_alloc(offset, len, PAGE_READWRITE, None) + } + + /// Allocates private, writable memory at the given offset, optionally + /// bound to a specific host NUMA node. + pub fn alloc_numa( + &self, + offset: usize, + len: usize, + numa_node: Option, + ) -> Result<(), Error> { + self.virtual_alloc(offset, len, PAGE_READWRITE, numa_node) } /// Maps read-only zero pages at the given offset within the mapping. pub fn map_zero(&self, offset: usize, len: usize) -> Result<(), Error> { - self.virtual_alloc(offset, len, PAGE_READONLY) + self.virtual_alloc(offset, len, PAGE_READONLY, None) } fn validate_offset_len(&self, offset: usize, len: usize) -> io::Result { @@ -450,17 +486,23 @@ impl SparseMapping { } /// Allocates private memory at the given offset with memory protection - /// `protect`. - pub fn virtual_alloc(&self, offset: usize, len: usize, protect: u32) -> Result<(), Error> { + /// `protect`, optionally bound to a specific host NUMA node. + pub fn virtual_alloc( + &self, + offset: usize, + len: usize, + protect: u32, + numa_node: Option, + ) -> Result<(), Error> { self.map(offset, len, |addr| unsafe { + let mut param = numa_extended_param(numa_node); virtual_alloc( self.process.as_ref(), addr, len, MEM_RESERVE | MEM_COMMIT | MEM_REPLACE_PLACEHOLDER, protect, - null_mut(), - 0, + param.as_mut_slice(), )?; Ok(MappingInfo::Anonymous) }) @@ -480,10 +522,44 @@ impl SparseMapping { } else { PAGE_READONLY }; - self.map_view_of_file(offset, len, file_mapping.as_handle(), file_offset, protect) + self.map_view_of_file( + offset, + len, + file_mapping.as_handle(), + file_offset, + protect, + None, + ) + } + + /// Maps a portion of a file mapping at `offset`, optionally bound to a + /// specific host NUMA node. + pub fn map_file_numa( + &self, + offset: usize, + len: usize, + file_mapping: impl AsHandle, + file_offset: u64, + writable: bool, + numa_node: Option, + ) -> Result<(), Error> { + let protect = if writable { + PAGE_READWRITE + } else { + PAGE_READONLY + }; + self.map_view_of_file( + offset, + len, + file_mapping.as_handle(), + file_offset, + protect, + numa_node, + ) } - /// Maps a portion of a file mapping at `offset` with protection `protect`. + /// Maps a portion of a file mapping at `offset` with protection `protect`, + /// optionally bound to a specific host NUMA node. pub fn map_view_of_file( &self, offset: usize, @@ -491,6 +567,7 @@ impl SparseMapping { file_mapping: impl AsHandle, file_offset: u64, protect: u32, + numa_node: Option, ) -> Result<(), Error> { assert_ne!(len, 0); self.map(offset, len, |addr| unsafe { @@ -505,6 +582,7 @@ impl SparseMapping { p => panic!("unknown protection {:#x}", p), }; let section = file_mapping.as_handle().duplicate(false, Some(access))?; + let mut param = numa_extended_param(numa_node); map_view_of_file( self.process.as_ref(), file_mapping.as_handle().as_raw_handle(), @@ -513,6 +591,7 @@ impl SparseMapping { len, MEM_REPLACE_PLACEHOLDER, protect, + param.as_mut_slice(), )?; Ok(MappingInfo::Section { handle: section, @@ -570,6 +649,7 @@ impl SparseMapping { len, MEM_REPLACE_PLACEHOLDER, *protection, + &mut [], ) .expect("remap failed"); } @@ -594,6 +674,7 @@ impl SparseMapping { len, MEM_REPLACE_PLACEHOLDER, *protection, + &mut [], ) .expect("remap failed"); } @@ -708,8 +789,7 @@ impl SparseMapping { len, MEM_COMMIT, PAGE_READWRITE, - null_mut(), - 0, + &mut [], )?; } Ok(()) @@ -784,7 +864,7 @@ mod tests { let shmem = alloc_shared_memory(0x100000, "test").unwrap(); let sparse = SparseMapping::new(0x100000).unwrap(); sparse - .map_view_of_file(0, 0x100000, &shmem, 0, PAGE_READWRITE) + .map_view_of_file(0, 0x100000, &shmem, 0, PAGE_READWRITE, None) .unwrap(); let data: &mut [u32] = unsafe { std::slice::from_raw_parts_mut(sparse.as_ptr().cast(), sparse.len() / 4) };