From 1e0ab2e8a89a1c74293282e09336bba66915a9a4 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 18 Feb 2026 17:46:41 +0000 Subject: [PATCH 1/3] mode: auto-detect mode from hardware topology, remove nvrc.mode Replace the nvrc.mode kernel parameter with automatic hardware topology detection from PCI and InfiniBand sysfs. The mode is now determined by scanning for NVIDIA GPUs, NVSwitches, and SW_MNG InfiniBand devices. - Remove mode field from NVRC struct - Remove nvrc.mode kernel parameter parsing - Pass detected mode directly to configure_fabricmanager - Simplify mode_gpu bare-metal NVSwitch path - Rename nvswitch-nvl{4,5} modes to servicevm-nvl{4,5} Signed-off-by: Zvonko Kaiser --- src/daemon.rs | 152 ++++++++--------- src/infiniband.rs | 91 ++-------- src/kernel_params.rs | 155 +---------------- src/lib.rs | 1 + src/main.rs | 61 +++---- src/mode.rs | 395 +++++++++++++++++++++++++++++++++++++++++++ src/nvrc.rs | 8 +- 7 files changed, 523 insertions(+), 340 deletions(-) create mode 100644 src/mode.rs diff --git a/src/daemon.rs b/src/daemon.rs index 78a1493..51d0617 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -6,6 +6,7 @@ use crate::execute::background; use crate::macros::ResultExt; use crate::nvrc::NVRC; use std::fs; +use std::os::unix::fs::PermissionsExt; /// UVM persistence mode keeps unified memory mappings alive between kernel launches, /// avoiding expensive page migrations. Enabled by default for ML workloads. @@ -30,8 +31,14 @@ fn dcgm_exporter_args() -> &'static [&'static str] { } const FM_CONFIG: &str = "/usr/share/nvidia/nvswitch/fabricmanager.cfg"; +const FM_RUNTIME_CONFIG: &str = "/run/fabricmanager.cfg"; const NVLSM_CONFIG: &str = "/usr/share/nvidia/nvlsm/nvlsm.conf"; +/// FABRIC_MODE=0: full GPU passthrough, FM manages NVSwitches directly. +pub const FABRIC_MODE_FULL: u8 = 0; +/// FABRIC_MODE=1: shared NVSwitch virtualization, GPUs in tenant VMs. +pub const FABRIC_MODE_SHARED: u8 = 1; + /// Configurable path parameters allow testing with /bin/true instead of real /// NVIDIA binaries that don't exist in the test environment. impl NVRC { @@ -80,16 +87,17 @@ impl NVRC { /// NVSwitch fabric manager is only needed for multi-GPU NVLink topologies. /// Disabled by default since most VMs have single GPUs. - pub fn nv_fabricmanager(&mut self) { - self.configure_fabricmanager(FM_CONFIG); + pub fn nv_fabricmanager(&mut self, fabric_mode: u8, rail_policy: &str) { + fs::copy(FM_CONFIG, FM_RUNTIME_CONFIG) + .or_panic(format_args!("copy {FM_CONFIG} to {FM_RUNTIME_CONFIG}")); + self.configure_fabricmanager(FM_RUNTIME_CONFIG, fabric_mode, rail_policy); + fs::set_permissions(FM_RUNTIME_CONFIG, fs::Permissions::from_mode(0o400)) + .or_panic(format_args!("set permissions {FM_RUNTIME_CONFIG}")); self.spawn_fabricmanager("/bin/nv-fabricmanager") } fn spawn_fabricmanager(&mut self, bin: &str) { - if self.fabric_mode.is_none() { - return; - } - let mut args = vec!["-c", FM_CONFIG]; + let mut args = vec!["-c", FM_RUNTIME_CONFIG]; let guid_owned: String; if let Some(ref guid) = self.port_guid { guid_owned = guid.clone(); @@ -102,7 +110,7 @@ impl NVRC { /// CX7 bridges require NVLSM to manage NVLink subnet before FM can initialize the fabric. pub fn nv_nvlsm(&mut self) { - self.spawn_nvlsm("/opt/nvidia/nvlsm/sbin/nvlsm") + self.spawn_nvlsm("/sbin/nvlsm") } fn spawn_nvlsm(&mut self, bin: &str) { @@ -110,28 +118,17 @@ impl NVRC { return; }; let guid_owned = guid.clone(); - let args = vec!["-F", NVLSM_CONFIG, "-g", &guid_owned]; + let args = vec!["-F", NVLSM_CONFIG, "-g", &guid_owned, "-f", "stdout"]; let child = background(bin, &args); self.track_daemon("nvlsm", child); } - /// Write FABRIC_MODE, FABRIC_MODE_RESTART, and PARTITION_RAIL_POLICY to fabricmanager.cfg. - /// ServiceVM (mode 1) requires FABRIC_MODE_RESTART=1 for resiliency. - fn configure_fabricmanager(&self, cfg_path: &str) { - let Some(mode) = self.fabric_mode else { - return; - }; - - let mode_str = mode.to_string(); - let restart = if mode == 1 { "1" } else { "0" }; - let policy = self.rail_policy.as_deref().unwrap_or("greedy"); - - let updates = &[ - ("FABRIC_MODE", mode_str.as_str()), - ("FABRIC_MODE_RESTART", restart), - ("PARTITION_RAIL_POLICY", policy), - ]; - + /// Write FABRIC_MODE and PARTITION_RAIL_POLICY to fabricmanager.cfg. + /// FABRIC_MODE: 0 = bare metal (GPUs local), 1 = service VM (GPUs in tenant VMs) + /// PARTITION_RAIL_POLICY: "greedy" (NVL4) or "symmetric" (NVL5, required for CC on Blackwell) + fn configure_fabricmanager(&self, cfg_path: &str, fabric_mode: u8, rail_policy: &str) { + let fm = &fabric_mode.to_string(); + let updates = &[("FABRIC_MODE", fm.as_str()), ("PARTITION_RAIL_POLICY", rail_policy)]; update_config_file(cfg_path, updates); } } @@ -190,9 +187,20 @@ mod tests { } #[test] - fn test_nv_fabricmanager_skipped_by_default() { + fn test_nv_fabricmanager_gpu_mode() { + use tempfile::NamedTempFile; + + let tmpfile = NamedTempFile::new().unwrap(); + let cfg = tmpfile.path().to_str().unwrap(); + fs::write(cfg, "FABRIC_MODE=0\n").unwrap(); + let mut nvrc = NVRC::default(); - nvrc.nv_fabricmanager(); + nvrc.configure_fabricmanager(cfg, FABRIC_MODE_FULL, "greedy"); + nvrc.spawn_fabricmanager("/bin/true"); + + let content = fs::read_to_string(cfg).unwrap(); + assert!(content.contains("FABRIC_MODE=0")); + nvrc.health_checks(); } #[test] @@ -238,14 +246,12 @@ mod tests { #[test] fn test_spawn_fabricmanager_success() { let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); nvrc.spawn_fabricmanager("/bin/true"); } #[test] fn test_spawn_fabricmanager_with_port_guid() { let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); nvrc.port_guid = Some("0xdeadbeef".to_string()); nvrc.spawn_fabricmanager("/bin/true"); nvrc.health_checks(); @@ -289,37 +295,48 @@ mod tests { // === Fabricmanager configuration tests === #[test] - fn test_configure_fabricmanager_mode_0_bare_metal() { + fn test_configure_fabricmanager_bare_metal() { use tempfile::NamedTempFile; let tmpfile = NamedTempFile::new().unwrap(); let path = tmpfile.path().to_str().unwrap(); fs::write(path, "").unwrap(); - let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(0); - nvrc.configure_fabricmanager(path); + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_FULL, "greedy"); let content = fs::read_to_string(path).unwrap(); assert!(content.contains("FABRIC_MODE=0")); - assert!(content.contains("FABRIC_MODE_RESTART=0")); } #[test] - fn test_configure_fabricmanager_mode_1_servicevm() { + fn test_configure_fabricmanager_servicevm_nvl4() { use tempfile::NamedTempFile; let tmpfile = NamedTempFile::new().unwrap(); let path = tmpfile.path().to_str().unwrap(); fs::write(path, "").unwrap(); - let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); - nvrc.configure_fabricmanager(path); + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_SHARED, "greedy"); + + let content = fs::read_to_string(path).unwrap(); + assert!(content.contains("FABRIC_MODE=1")); + } + + #[test] + fn test_configure_fabricmanager_servicevm_nvl5() { + use tempfile::NamedTempFile; + + let tmpfile = NamedTempFile::new().unwrap(); + let path = tmpfile.path().to_str().unwrap(); + fs::write(path, "").unwrap(); + + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_SHARED, "symmetric"); let content = fs::read_to_string(path).unwrap(); assert!(content.contains("FABRIC_MODE=1")); - assert!(content.contains("FABRIC_MODE_RESTART=1")); } #[test] @@ -328,16 +345,13 @@ mod tests { let tmpfile = NamedTempFile::new().unwrap(); let path = tmpfile.path().to_str().unwrap(); - fs::write(path, "FABRIC_MODE=0\nFABRIC_MODE_RESTART=0\n").unwrap(); + fs::write(path, "FABRIC_MODE=0\n").unwrap(); - let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); - nvrc.configure_fabricmanager(path); + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_SHARED, "greedy"); let content = fs::read_to_string(path).unwrap(); assert!(content.contains("FABRIC_MODE=1")); - assert!(content.contains("FABRIC_MODE_RESTART=1")); - // Should not have old values let lines: Vec<&str> = content.lines().collect(); assert_eq!( lines @@ -346,13 +360,6 @@ mod tests { .count(), 1 ); - assert_eq!( - lines - .iter() - .filter(|l| l.starts_with("FABRIC_MODE_RESTART=")) - .count(), - 1 - ); } #[test] @@ -363,9 +370,8 @@ mod tests { let path = tmpfile.path().to_str().unwrap(); fs::write(path, "# Comment\nOTHER_SETTING=value\nFABRIC_MODE=0\n").unwrap(); - let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); - nvrc.configure_fabricmanager(path); + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_SHARED, "greedy"); let content = fs::read_to_string(path).unwrap(); assert!(content.contains("# Comment")); @@ -374,53 +380,48 @@ mod tests { } #[test] - fn test_configure_fabricmanager_no_fabric_mode() { + fn test_configure_fabricmanager_nvl4_greedy_rail_policy() { use tempfile::NamedTempFile; let tmpfile = NamedTempFile::new().unwrap(); let path = tmpfile.path().to_str().unwrap(); - fs::write(path, "ORIGINAL=content\n").unwrap(); + fs::write(path, "").unwrap(); let nvrc = NVRC::default(); - // fabric_mode is None - nvrc.configure_fabricmanager(path); + nvrc.configure_fabricmanager(path, FABRIC_MODE_SHARED, "greedy"); - // File should be unchanged let content = fs::read_to_string(path).unwrap(); - assert_eq!(content, "ORIGINAL=content\n"); + assert!(content.contains("PARTITION_RAIL_POLICY=greedy")); } #[test] - fn test_configure_fabricmanager_default_rail_policy() { + fn test_configure_fabricmanager_nvl5_symmetric_rail_policy() { use tempfile::NamedTempFile; let tmpfile = NamedTempFile::new().unwrap(); let path = tmpfile.path().to_str().unwrap(); fs::write(path, "").unwrap(); - let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); - // rail_policy is None, should default to greedy - nvrc.configure_fabricmanager(path); + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_SHARED, "symmetric"); let content = fs::read_to_string(path).unwrap(); - assert!(content.contains("PARTITION_RAIL_POLICY=greedy")); + assert!(content.contains("PARTITION_RAIL_POLICY=symmetric")); } #[test] - fn test_configure_fabricmanager_symmetric_rail_policy() { + fn test_configure_fabricmanager_gpu_nvl5_symmetric_rail_policy() { use tempfile::NamedTempFile; let tmpfile = NamedTempFile::new().unwrap(); let path = tmpfile.path().to_str().unwrap(); fs::write(path, "").unwrap(); - let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); - nvrc.rail_policy = Some("symmetric".to_owned()); - nvrc.configure_fabricmanager(path); + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_FULL, "symmetric"); let content = fs::read_to_string(path).unwrap(); + assert!(content.contains("FABRIC_MODE=0")); assert!(content.contains("PARTITION_RAIL_POLICY=symmetric")); } @@ -432,14 +433,11 @@ mod tests { let path = tmpfile.path().to_str().unwrap(); fs::write(path, "").unwrap(); - let mut nvrc = NVRC::default(); - nvrc.fabric_mode = Some(1); - nvrc.rail_policy = Some("symmetric".to_owned()); - nvrc.configure_fabricmanager(path); + let nvrc = NVRC::default(); + nvrc.configure_fabricmanager(path, FABRIC_MODE_SHARED, "symmetric"); let content = fs::read_to_string(path).unwrap(); assert!(content.contains("FABRIC_MODE=1")); - assert!(content.contains("FABRIC_MODE_RESTART=1")); assert!(content.contains("PARTITION_RAIL_POLICY=symmetric")); } } diff --git a/src/infiniband.rs b/src/infiniband.rs index d3ae68f..346b222 100644 --- a/src/infiniband.rs +++ b/src/infiniband.rs @@ -18,10 +18,19 @@ pub fn detect_port_guid() -> Option { } fn detect_port_guid_from(ib_class_path: &str) -> Option { + if !Path::new(ib_class_path).is_dir() { + panic!("{ib_class_path} not found — mlx5_ib module not loaded"); + } + let mut entries: Vec<_> = fs::read_dir(ib_class_path) .or_panic(format_args!("read {ib_class_path}")) .flatten() .collect(); + + if entries.is_empty() { + panic!("{ib_class_path} is empty — mlx5_ib loaded but no IB devices registered"); + } + // Deterministic selection: mlx5_0 before mlx5_1, so first valid SW_MNG device wins. entries.sort_by_key(|e| e.file_name()); @@ -29,11 +38,6 @@ fn detect_port_guid_from(ib_class_path: &str) -> Option { let device_name = entry.file_name().to_string_lossy().to_string(); let device_path = entry.path(); - if !is_sw_mng_device(&device_path.join("device/vpd")) { - continue; - } - debug!("{}: SW_MNG device", device_name); - if !is_sm_enabled(&device_path.join("ports/1/cap_mask")) { debug!("{}: SM disabled, skipping", device_name); continue; @@ -48,13 +52,6 @@ fn detect_port_guid_from(ib_class_path: &str) -> Option { None } -/// SW_MNG in VPD identifies CX7 bridges vs regular IB HCAs. -fn is_sw_mng_device(vpd_path: &Path) -> bool { - fs::read(vpd_path) - .map(|data| data.windows(6).any(|w| w == b"SW_MNG")) - .unwrap_or(false) -} - /// NVLSM cannot manage a port with SM disabled. fn is_sm_enabled(cap_mask_path: &Path) -> bool { let Ok(content) = fs::read_to_string(cap_mask_path) else { @@ -84,22 +81,13 @@ mod tests { use std::fs; use tempfile::TempDir; - fn create_ib_device( - tmpdir: &TempDir, - name: &str, - vpd_content: &[u8], - cap_mask: &str, - gid: &str, - ) { + fn create_ib_device(tmpdir: &TempDir, name: &str, cap_mask: &str, gid: &str) { let dev_path = tmpdir.path().join(name); - let vpd_path = dev_path.join("device/vpd"); let cap_path = dev_path.join("ports/1/cap_mask"); let gid_path = dev_path.join("ports/1/gids/0"); - fs::create_dir_all(vpd_path.parent().unwrap()).unwrap(); fs::create_dir_all(gid_path.parent().unwrap()).unwrap(); - fs::write(&vpd_path, vpd_content).unwrap(); fs::write(&cap_path, cap_mask).unwrap(); fs::write(&gid_path, gid).unwrap(); } @@ -110,7 +98,6 @@ mod tests { create_ib_device( &tmpdir, "mlx5_0", - b"some data SW_MNG more data", "0x00000200\n", // bit 10 unset, SM enabled "fe80:0000:0000:0000:0002:c903:0029:7de1\n", ); @@ -119,28 +106,12 @@ mod tests { assert_eq!(guid, Some("0x0002c90300297de1".to_owned())); } - #[test] - fn test_detect_port_guid_no_sw_mng() { - let tmpdir = TempDir::new().unwrap(); - create_ib_device( - &tmpdir, - "mlx5_0", - b"some other data", // no SW_MNG - "0x00000200\n", - "fe80:0000:0000:0000:0002:c903:0029:7de1\n", - ); - - let guid = detect_port_guid_from(tmpdir.path().to_str().unwrap()); - assert!(guid.is_none()); - } - #[test] fn test_detect_port_guid_sm_disabled() { let tmpdir = TempDir::new().unwrap(); create_ib_device( &tmpdir, "mlx5_0", - b"SW_MNG", "0x00000400\n", // bit 10 set, SM disabled "fe80:0000:0000:0000:0002:c903:0029:7de1\n", ); @@ -150,23 +121,21 @@ mod tests { } #[test] - fn test_detect_port_guid_multiple_devices_first_valid() { + fn test_detect_port_guid_skips_sm_disabled() { let tmpdir = TempDir::new().unwrap(); - // First device: no SW_MNG + // First device: SM disabled create_ib_device( &tmpdir, "mlx5_0", - b"no marker", - "0x00000200\n", + "0x00000400\n", "fe80:0000:0000:0000:aaaa:bbbb:cccc:dddd\n", ); - // Second device: valid + // Second device: SM enabled create_ib_device( &tmpdir, "mlx5_1", - b"SW_MNG", "0x00000200\n", "fe80:0000:0000:0000:1111:2222:3333:4444\n", ); @@ -176,44 +145,18 @@ mod tests { } #[test] + #[should_panic(expected = "is empty")] fn test_detect_port_guid_empty_dir() { let tmpdir = TempDir::new().unwrap(); - let guid = detect_port_guid_from(tmpdir.path().to_str().unwrap()); - assert!(guid.is_none()); + detect_port_guid_from(tmpdir.path().to_str().unwrap()); } #[test] - #[should_panic(expected = "read /nonexistent/path")] + #[should_panic(expected = "/nonexistent/path not found")] fn test_detect_port_guid_nonexistent_dir() { detect_port_guid_from("/nonexistent/path"); } - #[test] - fn test_is_sw_mng_device_found() { - let tmpdir = TempDir::new().unwrap(); - let vpd_path = tmpdir.path().join("vpd"); - fs::write(&vpd_path, b"some data SW_MNG more data").unwrap(); - - assert!(is_sw_mng_device(&vpd_path)); - } - - #[test] - fn test_is_sw_mng_device_not_found() { - let tmpdir = TempDir::new().unwrap(); - let vpd_path = tmpdir.path().join("vpd"); - fs::write(&vpd_path, b"some other data").unwrap(); - - assert!(!is_sw_mng_device(&vpd_path)); - } - - #[test] - fn test_is_sw_mng_device_no_file() { - let tmpdir = TempDir::new().unwrap(); - let vpd_path = tmpdir.path().join("nonexistent"); - - assert!(!is_sw_mng_device(&vpd_path)); - } - #[test] fn test_is_sm_enabled_bit_unset() { let tmpdir = TempDir::new().unwrap(); diff --git a/src/kernel_params.rs b/src/kernel_params.rs index 3e2ac85..01e70fc 100644 --- a/src/kernel_params.rs +++ b/src/kernel_params.rs @@ -28,12 +28,10 @@ impl NVRC { for (k, v) in content.split_whitespace().filter_map(|p| p.split_once('=')) { match k { - "nvrc.mode" => nvrc_mode(v, self), "nvrc.log" => nvrc_log(v, self), "nvrc.uvm.persistence.mode" => uvm_persistenced_mode(v, self), "nvrc.dcgm" => nvrc_dcgm(v, self), - "nvrc.fm.mode" => nvrc_fm_mode(v, self), - "nvrc.fm.rail.policy" => nvrc_fm_rail_policy(v, self), + "nvrc.smi.srs" => nvidia_smi_srs(v, self), "nvrc.smi.lgc" => nvidia_smi_lgc(v, self), "nvrc.smi.lmc" => nvidia_smi_lmc(v, self), @@ -44,13 +42,6 @@ impl NVRC { } } -/// Operation mode: "gpu" (default) or "cpu" (skip GPU management). -/// Use nvrc.mode=cpu for CPU-only workloads that don't need GPU initialization. -fn nvrc_mode(value: &str, ctx: &mut NVRC) { - ctx.mode = Some(value.to_lowercase()); - debug!("nvrc.mode: {}", value); -} - /// DCGM (Data Center GPU Manager) provides telemetry and health monitoring. /// Off by default—only enable when observability infrastructure expects it. fn nvrc_dcgm(value: &str, ctx: &mut NVRC) { @@ -59,31 +50,6 @@ fn nvrc_dcgm(value: &str, ctx: &mut NVRC) { debug!("nvrc.dcgm: {dcgm}"); } -/// Fabric Manager mode: 0=bare metal, 1=servicevm (shared nvswitch). -/// ServiceVM mode enables FABRIC_MODE_RESTART for resiliency. -fn nvrc_fm_mode(value: &str, ctx: &mut NVRC) { - let mode: u8 = value.parse().expect("nvrc.fm.mode: must be 0 or 1"); - if mode > 1 { - panic!("nvrc.fm.mode: invalid mode {}, must be 0 or 1", mode); - } - ctx.fabric_mode = Some(mode); - debug!("nvrc.fm.mode: {mode}"); -} - -/// Partition rail policy: greedy maximizes bandwidth, symmetric ensures isolation. -/// Symmetric required for Confidential Computing on Blackwell. -fn nvrc_fm_rail_policy(value: &str, ctx: &mut NVRC) { - let policy = value.to_lowercase(); - if policy != "greedy" && policy != "symmetric" { - panic!( - "nvrc.fm.rail.policy: invalid policy '{}', must be 'greedy' or 'symmetric'", - value - ); - } - ctx.rail_policy = Some(policy); - debug!("nvrc.fm.rail.policy: {}", ctx.rail_policy.as_ref().unwrap()); -} - /// Control log verbosity at runtime. Defaults to off to minimize noise. /// Enabling devkmsg allows kernel log output even in minimal init environments. fn nvrc_log(value: &str, _ctx: &mut NVRC) { @@ -270,57 +236,6 @@ mod tests { assert_eq!(c.dcgm_enabled, Some(false)); } - #[test] - fn test_nvrc_fm_mode() { - let mut c = NVRC::default(); - - nvrc_fm_mode("0", &mut c); - assert_eq!(c.fabric_mode, Some(0)); - - nvrc_fm_mode("1", &mut c); - assert_eq!(c.fabric_mode, Some(1)); - - // Invalid mode should panic - let result = panic::catch_unwind(|| { - nvrc_fm_mode("2", &mut NVRC::default()); - }); - assert!(result.is_err()); - - let result = panic::catch_unwind(|| { - nvrc_fm_mode("3", &mut NVRC::default()); - }); - assert!(result.is_err()); - - let result = panic::catch_unwind(|| { - nvrc_fm_mode("invalid", &mut NVRC::default()); - }); - assert!(result.is_err()); - } - - #[test] - fn test_nvrc_fm_rail_policy() { - let mut c = NVRC::default(); - - nvrc_fm_rail_policy("greedy", &mut c); - assert_eq!(c.rail_policy, Some("greedy".to_owned())); - - nvrc_fm_rail_policy("symmetric", &mut c); - assert_eq!(c.rail_policy, Some("symmetric".to_owned())); - - // Case insensitive - nvrc_fm_rail_policy("GREEDY", &mut c); - assert_eq!(c.rail_policy, Some("greedy".to_owned())); - - nvrc_fm_rail_policy("Symmetric", &mut c); - assert_eq!(c.rail_policy, Some("symmetric".to_owned())); - - // Invalid policy should panic - let result = panic::catch_unwind(|| { - nvrc_fm_rail_policy("invalid", &mut NVRC::default()); - }); - assert!(result.is_err()); - } - #[test] fn test_nvidia_smi_srs() { let mut c = NVRC::default(); @@ -448,76 +363,12 @@ mod tests { } #[test] - fn test_process_kernel_params_with_fm_mode_and_uvm() { + fn test_process_kernel_params_with_uvm_and_srs() { let mut c = NVRC::default(); - c.process_kernel_params(Some( - "nvrc.fm.mode=1 nvrc.uvm.persistence.mode=true nvrc.smi.srs=enabled", - )); + c.process_kernel_params(Some("nvrc.uvm.persistence.mode=true nvrc.smi.srs=enabled")); - assert_eq!(c.fabric_mode, Some(1)); assert_eq!(c.uvm_persistence_mode, Some(true)); assert_eq!(c.nvidia_smi_srs, Some("enabled".to_owned())); } - - #[test] - fn test_nvrc_mode() { - let mut c = NVRC::default(); - - nvrc_mode("cpu", &mut c); - assert_eq!(c.mode, Some("cpu".to_owned())); - - nvrc_mode("GPU", &mut c); - assert_eq!(c.mode, Some("gpu".to_owned())); // normalized to lowercase - - nvrc_mode("nvswitch-nvl4", &mut c); - assert_eq!(c.mode, Some("nvswitch-nvl4".to_owned())); - - nvrc_mode("NVSWITCH-NVL4", &mut c); - assert_eq!(c.mode, Some("nvswitch-nvl4".to_owned())); // normalized to lowercase - - nvrc_mode("nvswitch-nvl5", &mut c); - assert_eq!(c.mode, Some("nvswitch-nvl5".to_owned())); - - nvrc_mode("NVSWITCH-NVL5", &mut c); - assert_eq!(c.mode, Some("nvswitch-nvl5".to_owned())); // normalized to lowercase - } - - #[test] - fn test_process_kernel_params_with_mode() { - let mut c = NVRC::default(); - - c.process_kernel_params(Some("nvrc.mode=cpu nvrc.dcgm=on")); - - assert_eq!(c.mode, Some("cpu".to_owned())); - assert_eq!(c.dcgm_enabled, Some(true)); - } - - #[test] - fn test_process_kernel_params_nvswitch_nvl4_mode() { - let mut c = NVRC::default(); - - c.process_kernel_params(Some("nvrc.mode=nvswitch-nvl4")); - - assert_eq!(c.mode, Some("nvswitch-nvl4".to_owned())); - } - - #[test] - fn test_process_kernel_params_nvswitch_nvl5_mode() { - let mut c = NVRC::default(); - - c.process_kernel_params(Some("nvrc.mode=nvswitch-nvl5")); - - assert_eq!(c.mode, Some("nvswitch-nvl5".to_owned())); - } - - #[test] - fn test_process_kernel_params_with_rail_policy() { - let mut c = NVRC::default(); - - c.process_kernel_params(Some("nvrc.fm.mode=1 nvrc.fm.rail.policy=symmetric")); - - assert_eq!(c.fabric_mode, Some(1)); - assert_eq!(c.rail_policy, Some("symmetric".to_owned())); - } } diff --git a/src/lib.rs b/src/lib.rs index f412286..8225c1a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,7 @@ pub mod kmsg; pub mod lockdown; #[macro_use] pub mod macros; +pub mod mode; pub mod modprobe; pub mod mount; pub mod nvrc; diff --git a/src/main.rs b/src/main.rs index cb1b6ee..1f38820 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,6 +10,7 @@ mod kernel_params; mod kmsg; mod lockdown; mod macros; +mod mode; mod modprobe; mod mount; mod nvrc; @@ -26,17 +27,17 @@ mod test_utils; extern crate log; extern crate kernlog; -use std::collections::HashMap; - use kata_agent::SYSLOG_POLL_FOREVER as POLL_FOREVER; +use daemon::FABRIC_MODE_FULL as FABRIC_MODE_FULL; +use daemon::FABRIC_MODE_SHARED as FABRIC_MODE_SHARED; use nvrc::NVRC; use toolkit::nvidia_ctk_cdi; -type ModeFn = fn(&mut NVRC); - /// VMs with GPU passthrough need driver setup, clock tuning, /// and monitoring daemons before workloads can use the GPU. -fn mode_gpu(init: &mut NVRC) { +/// On bare metal HGX systems (GPUs + NVSwitches), also starts +/// the fabric manager via the appropriate NVSwitch mode. +fn mode_gpu(init: &mut NVRC, nvswitch: Option<&str>) { modprobe::load("nvidia"); modprobe::load("nvidia-uvm"); @@ -48,54 +49,51 @@ fn mode_gpu(init: &mut NVRC) { init.nv_hostengine(); init.dcgm_exporter(); - init.nv_fabricmanager(); nvidia_ctk_cdi(); init.nvidia_smi_srs(); + + nvswitch.inspect(|&nv| { + let policy = match nv { + "nvl5" => "symmetric", + _ => "greedy", + }; + init.nv_fabricmanager(FABRIC_MODE_FULL, policy); + }); + init.health_checks(); } /// NVSwitch NVL4 mode for HGX H100/H200/H800 systems (third-gen NVSwitch). /// Service VM mode for NVLink 4.0 topologies in shared virtualization. /// Loads NVIDIA driver and starts fabric manager. GPUs are assigned to service VM. -fn mode_nvswitch_nvl4(init: &mut NVRC) { - // Service VM mode requires FABRIC_MODE=1 (shared nvswitch) - init.fabric_mode = Some(1); - +fn mode_servicevm_nvl4(init: &mut NVRC) { modprobe::load("nvidia"); - init.nv_fabricmanager(); + init.nv_fabricmanager(FABRIC_MODE_SHARED, "greedy"); init.health_checks(); } /// HGX Bx00 systems use CX7 bridges for NVLink management instead of direct GPU access. /// GPUs are passed to tenant VMs; only the CX7 IB devices are visible here. -fn mode_nvswitch_nvl5(init: &mut NVRC) { - init.fabric_mode = Some(1); - - // CX7 bridges expose management interface via InfiniBand MAD protocol +fn mode_servicevm_nvl5(init: &mut NVRC) { + // ib_umad exposes /dev/umad* for InfiniBand MAD protocol access; + // mlx5_ib creates /sys/class/infiniband/mlx5_* entries for the CX7 bridges. modprobe::load("ib_umad"); + modprobe::load("mlx5_ib"); // CX7 port GUID identifies which bridge to use for fabric management init.port_guid = Some( infiniband::detect_port_guid() - .expect("nvswitch-nvl5 requires SW_MNG IB device with valid port GUID"), + .expect("servicevm-nvl5 requires SW_MNG IB device with valid port GUID"), ); // NVLSM must initialize the NVLink subnet before FM can manage the fabric init.nv_nvlsm(); init.health_checks(); - init.nv_fabricmanager(); + init.nv_fabricmanager(FABRIC_MODE_SHARED, "symmetric"); init.health_checks(); } fn main() { - // Dispatch table allows adding new modes without touching control flow. - let modes: HashMap<&str, ModeFn> = HashMap::from([ - ("gpu", mode_gpu as ModeFn), - ("cpu", (|_| {}) as ModeFn), - ("nvswitch-nvl4", mode_nvswitch_nvl4 as ModeFn), - ("nvswitch-nvl5", mode_nvswitch_nvl5 as ModeFn), - ]); - lockdown::set_panic_hook(); let mut init = NVRC::default(); mount::setup(); @@ -103,11 +101,14 @@ fn main() { syslog::poll(); init.process_kernel_params(None); - // Kernel param nvrc.mode selects runtime behavior; GPU is the safe default - // since most users expect full GPU functionality. - let mode = init.mode.as_deref().unwrap_or("gpu"); - let setup = modes.get(mode).copied().unwrap_or(mode_gpu); - setup(&mut init); + let detected = mode::detect(); + match detected.mode { + "cpu" => info!("executing cpu mode"), + "gpu" => mode_gpu(&mut init, detected.nvswitch), + "servicevm-nvl4" => mode_servicevm_nvl4(&mut init), + "servicevm-nvl5" => mode_servicevm_nvl5(&mut init), + unknown => panic!("unknown mode: {unknown}"), + } mount::readonly("/"); lockdown::disable_modules_loading(); diff --git a/src/mode.rs b/src/mode.rs new file mode 100644 index 0000000..8580211 --- /dev/null +++ b/src/mode.rs @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) NVIDIA CORPORATION + +//! Auto-detect NVRC mode from PCI hardware topology. +//! +//! Scans `/sys/bus/pci/devices` for NVIDIA GPUs, NVSwitches, and +//! Mellanox CX7 bridge LPFs (SW_MNG marker in PCI VPD), to determine +//! the correct operating mode and fabric manager configuration. +//! +//! NVL5 CX7 bridges expose 2 LPF (SW_MNG) + 2 FC PF per baseboard. +//! VPD is read directly from PCI sysfs to avoid dependency on IB drivers. + +use log::debug; +use std::fs; + +const PCI_DEVICES: &str = "/sys/bus/pci/devices"; + +/// Result of hardware topology detection. +pub struct Detection { + /// Operating mode: "cpu", "gpu", "servicevm-nvl4", or "servicevm-nvl5" + pub mode: &'static str, + /// NVSwitch generation when present: "nvl4" or "nvl5" + pub nvswitch: Option<&'static str>, +} + +/// Detect NVRC mode from real sysfs paths. +pub fn detect() -> Detection { + detect_from(PCI_DEVICES) +} + +fn detect_from(pci_path: &str) -> Detection { + let nvswitches = count_nvswitches_from(pci_path); + let gpus = count_gpus_from(pci_path); + let sw_mng = count_sw_mng_from(pci_path); + + debug!( + "topology: {} GPU, {} NVSWITCH, {} PCI_SW_MNG", + gpus, nvswitches, sw_mng + ); + + match (nvswitches, gpus, sw_mng) { + (0, 0, 0) => { + debug!("mode: cpu"); + Detection { + mode: "cpu", + nvswitch: None, + } + } + (0, _, 0) => { + debug!("mode: gpu {} GPU", gpus); + Detection { + mode: "gpu", + nvswitch: None, + } + } + (4, 8, 0) => { + debug!( + "mode: gpu FABRIC_MODE=0, {} GPU + {} NVSWITCH", + gpus, nvswitches + ); + Detection { + mode: "gpu", + nvswitch: Some("nvl4"), + } + } + (4, 0, 0) => { + debug!("mode: servicevm-nvl4 FABRIC_MODE=1"); + Detection { + mode: "servicevm-nvl4", + nvswitch: Some("nvl4"), + } + } + (0, 8, 4) => { + debug!( + "mode: gpu FABRIC_MODE=0, {} GPU + {} PCI_SW_MNG", + gpus, sw_mng + ); + Detection { + mode: "gpu", + nvswitch: Some("nvl5"), + } + } + (0, 0, 4) => { + debug!("mode: servicevm-nvl5 FABRIC_MODE=1"); + Detection { + mode: "servicevm-nvl5", + nvswitch: Some("nvl5"), + } + } + _ => { + panic!( + "unexpected topology: {} NVSWITCH, {} GPU, {} PCI_SW_MNG — cannot determine mode", + nvswitches, gpus, sw_mng + ); + } + } +} + +fn count_nvswitches_from(pci_path: &str) -> usize { + let Ok(entries) = fs::read_dir(pci_path) else { + return 0; + }; + entries + .flatten() + .filter(|e| { + let vendor = fs::read_to_string(e.path().join("vendor")).unwrap_or_default(); + let class = fs::read_to_string(e.path().join("class")).unwrap_or_default(); + vendor.trim() == "0x10de" && class.trim().starts_with("0x0680") + }) + .count() +} + +fn count_gpus_from(pci_path: &str) -> usize { + let Ok(entries) = fs::read_dir(pci_path) else { + return 0; + }; + entries + .flatten() + .filter(|e| { + let vendor = fs::read_to_string(e.path().join("vendor")).unwrap_or_default(); + let class = fs::read_to_string(e.path().join("class")).unwrap_or_default(); + vendor.trim() == "0x10de" && class.trim().starts_with("0x03") + }) + .count() +} + +/// Count NVLink management NICs (SW_MNG marker in PCI VPD). +/// Scans Mellanox (0x15b3) PCI devices and checks VPD directly, +/// avoiding dependency on IB drivers being loaded. +fn count_sw_mng_from(pci_path: &str) -> usize { + let Ok(entries) = fs::read_dir(pci_path) else { + return 0; + }; + entries + .flatten() + .filter(|e| { + let vendor = fs::read_to_string(e.path().join("vendor")).unwrap_or_default(); + if vendor.trim() != "0x15b3" { + return false; + } + fs::read(e.path().join("vpd")) + .map(|data| data.windows(6).any(|w| w == b"SW_MNG")) + .unwrap_or(false) + }) + .count() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::panic; + use tempfile::TempDir; + + fn create_pci_device(dir: &TempDir, name: &str, vendor: &str, class: &str) { + let dev = dir.path().join(name); + fs::create_dir_all(&dev).unwrap(); + fs::write(dev.join("vendor"), vendor).unwrap(); + fs::write(dev.join("class"), class).unwrap(); + } + + fn create_mlx_pci_device(dir: &TempDir, name: &str, vpd_content: &[u8]) { + let dev = dir.path().join(name); + fs::create_dir_all(&dev).unwrap(); + fs::write(dev.join("vendor"), "0x15b3\n").unwrap(); + fs::write(dev.join("vpd"), vpd_content).unwrap(); + } + + // --- NVSwitch counting --- + + #[test] + fn test_count_nvswitches_single() { + let tmpdir = TempDir::new().unwrap(); + create_pci_device(&tmpdir, "0000:00:00.0", "0x10de\n", "0x068000\n"); + assert_eq!(count_nvswitches_from(tmpdir.path().to_str().unwrap()), 1); + } + + #[test] + fn test_count_nvswitches_four() { + let tmpdir = TempDir::new().unwrap(); + for i in 0..4 { + create_pci_device( + &tmpdir, + &format!("0000:0{}:00.0", i), + "0x10de\n", + "0x068000\n", + ); + } + assert_eq!(count_nvswitches_from(tmpdir.path().to_str().unwrap()), 4); + } + + #[test] + fn test_count_nvswitches_skips_gpus() { + let tmpdir = TempDir::new().unwrap(); + create_pci_device(&tmpdir, "0000:00:00.0", "0x10de\n", "0x068000\n"); + create_pci_device(&tmpdir, "0000:41:00.0", "0x10de\n", "0x030200\n"); + assert_eq!(count_nvswitches_from(tmpdir.path().to_str().unwrap()), 1); + } + + #[test] + fn test_count_nvswitches_skips_non_nvidia() { + let tmpdir = TempDir::new().unwrap(); + create_pci_device(&tmpdir, "0000:00:00.0", "0x10de\n", "0x068000\n"); + create_pci_device(&tmpdir, "0000:01:00.0", "0x8086\n", "0x068000\n"); + assert_eq!(count_nvswitches_from(tmpdir.path().to_str().unwrap()), 1); + } + + #[test] + fn test_count_nvswitches_empty() { + let tmpdir = TempDir::new().unwrap(); + assert_eq!(count_nvswitches_from(tmpdir.path().to_str().unwrap()), 0); + } + + #[test] + fn test_count_nvswitches_nonexistent() { + assert_eq!(count_nvswitches_from("/nonexistent/path"), 0); + } + + // --- GPU counting --- + + #[test] + fn test_count_gpus_single() { + let tmpdir = TempDir::new().unwrap(); + create_pci_device(&tmpdir, "0000:41:00.0", "0x10de\n", "0x030200\n"); + assert_eq!(count_gpus_from(tmpdir.path().to_str().unwrap()), 1); + } + + #[test] + fn test_count_gpus_multiple() { + let tmpdir = TempDir::new().unwrap(); + for i in 0..8 { + create_pci_device( + &tmpdir, + &format!("0000:4{}:00.0", i), + "0x10de\n", + "0x030200\n", + ); + } + assert_eq!(count_gpus_from(tmpdir.path().to_str().unwrap()), 8); + } + + #[test] + fn test_count_gpus_skips_nvswitches() { + let tmpdir = TempDir::new().unwrap(); + create_pci_device(&tmpdir, "0000:41:00.0", "0x10de\n", "0x030200\n"); + create_pci_device(&tmpdir, "0000:00:00.0", "0x10de\n", "0x068000\n"); + assert_eq!(count_gpus_from(tmpdir.path().to_str().unwrap()), 1); + } + + // --- SW_MNG device counting (PCI-based) --- + + #[test] + fn test_count_sw_mng_single() { + let tmpdir = TempDir::new().unwrap(); + create_mlx_pci_device(&tmpdir, "0000:b1:00.0", b"some data SW_MNG more data"); + assert_eq!(count_sw_mng_from(tmpdir.path().to_str().unwrap()), 1); + } + + #[test] + fn test_count_sw_mng_four() { + let tmpdir = TempDir::new().unwrap(); + for i in 0..4 { + create_mlx_pci_device(&tmpdir, &format!("0000:b{}:00.0", i), b"SW_MNG"); + } + assert_eq!(count_sw_mng_from(tmpdir.path().to_str().unwrap()), 4); + } + + #[test] + fn test_count_sw_mng_skips_non_sw_mng() { + let tmpdir = TempDir::new().unwrap(); + create_mlx_pci_device(&tmpdir, "0000:b1:00.0", b"SW_MNG"); + create_mlx_pci_device(&tmpdir, "0000:b2:00.0", b"some other data"); + assert_eq!(count_sw_mng_from(tmpdir.path().to_str().unwrap()), 1); + } + + #[test] + fn test_count_sw_mng_skips_non_mellanox() { + let tmpdir = TempDir::new().unwrap(); + create_mlx_pci_device(&tmpdir, "0000:b1:00.0", b"SW_MNG"); + // Non-Mellanox device with SW_MNG in VPD + let dev = tmpdir.path().join("0000:b2:00.0"); + fs::create_dir_all(&dev).unwrap(); + fs::write(dev.join("vendor"), "0x10de\n").unwrap(); + fs::write(dev.join("vpd"), b"SW_MNG").unwrap(); + assert_eq!(count_sw_mng_from(tmpdir.path().to_str().unwrap()), 1); + } + + #[test] + fn test_count_sw_mng_no_vpd_file() { + let tmpdir = TempDir::new().unwrap(); + let dev = tmpdir.path().join("0000:b1:00.0"); + fs::create_dir_all(&dev).unwrap(); + fs::write(dev.join("vendor"), "0x15b3\n").unwrap(); + // No vpd file + assert_eq!(count_sw_mng_from(tmpdir.path().to_str().unwrap()), 0); + } + + #[test] + fn test_count_sw_mng_no_pci_dir() { + assert_eq!(count_sw_mng_from("/nonexistent/path"), 0); + } + + #[test] + fn test_count_sw_mng_empty_dir() { + let tmpdir = TempDir::new().unwrap(); + assert_eq!(count_sw_mng_from(tmpdir.path().to_str().unwrap()), 0); + } + + // --- Mode detection --- + + #[test] + fn test_detect_cpu_mode() { + let pci = TempDir::new().unwrap(); + let d = detect_from(pci.path().to_str().unwrap()); + assert_eq!(d.mode, "cpu"); + assert!(d.nvswitch.is_none()); + } + + #[test] + fn test_detect_gpu_mode() { + let pci = TempDir::new().unwrap(); + create_pci_device(&pci, "0000:41:00.0", "0x10de\n", "0x030200\n"); + let d = detect_from(pci.path().to_str().unwrap()); + assert_eq!(d.mode, "gpu"); + assert!(d.nvswitch.is_none()); + } + + #[test] + fn test_detect_gpu_bare_metal_nvl4() { + let pci = TempDir::new().unwrap(); + for i in 0..4 { + create_pci_device(&pci, &format!("0000:0{}:00.0", i), "0x10de\n", "0x068000\n"); + } + for i in 0..8 { + create_pci_device(&pci, &format!("0000:4{}:00.0", i), "0x10de\n", "0x030200\n"); + } + let d = detect_from(pci.path().to_str().unwrap()); + assert_eq!(d.mode, "gpu"); + assert_eq!(d.nvswitch, Some("nvl4")); + } + + #[test] + fn test_detect_gpu_bare_metal_nvl5() { + let pci = TempDir::new().unwrap(); + // 8 GPUs + 4 CX7 PFs (all SW_MNG) on PCIe, no NVSwitches + for i in 0..8 { + create_pci_device(&pci, &format!("0000:4{}:00.0", i), "0x10de\n", "0x030200\n"); + } + for i in 0..4 { + create_mlx_pci_device(&pci, &format!("0000:ab:00.{}", i), b"SW_MNG"); + } + let d = detect_from(pci.path().to_str().unwrap()); + assert_eq!(d.mode, "gpu"); + assert_eq!(d.nvswitch, Some("nvl5")); + } + + #[test] + fn test_detect_servicevm_nvl4() { + let pci = TempDir::new().unwrap(); + for i in 0..4 { + create_pci_device(&pci, &format!("0000:0{}:00.0", i), "0x10de\n", "0x068000\n"); + } + let d = detect_from(pci.path().to_str().unwrap()); + assert_eq!(d.mode, "servicevm-nvl4"); + assert_eq!(d.nvswitch, Some("nvl4")); + } + + #[test] + fn test_detect_servicevm_nvl5() { + let pci = TempDir::new().unwrap(); + // NVL5: no NVSwitches or GPUs on PCIe, only 4 CX7 PFs (all SW_MNG) + for i in 0..4 { + create_mlx_pci_device(&pci, &format!("0000:ab:00.{}", i), b"SW_MNG"); + } + let d = detect_from(pci.path().to_str().unwrap()); + assert_eq!(d.mode, "servicevm-nvl5"); + assert_eq!(d.nvswitch, Some("nvl5")); + } + + #[test] + fn test_detect_unexpected_topology_panics() { + let pci = TempDir::new().unwrap(); + // 2 NVSwitches + 3 GPUs — not a known topology + for i in 0..2 { + create_pci_device(&pci, &format!("0000:0{}:00.0", i), "0x10de\n", "0x068000\n"); + } + for i in 0..3 { + create_pci_device(&pci, &format!("0000:4{}:00.0", i), "0x10de\n", "0x030200\n"); + } + let result = panic::catch_unwind(|| { + detect_from(pci.path().to_str().unwrap()); + }); + assert!(result.is_err()); + } +} diff --git a/src/nvrc.rs b/src/nvrc.rs index 16b7058..136442c 100644 --- a/src/nvrc.rs +++ b/src/nvrc.rs @@ -11,8 +11,6 @@ use std::process::Child; #[derive(Default)] #[allow(clippy::upper_case_acronyms)] pub struct NVRC { - /// Operation mode: "gpu" (default) or "cpu" (skip GPU management) - pub mode: Option, /// Set/unset ready state pub nvidia_smi_srs: Option, /// Lock GPU clocks to specific frequency @@ -25,10 +23,7 @@ pub struct NVRC { pub uvm_persistence_mode: Option, /// Enable DCGM exporter for GPU metrics pub dcgm_enabled: Option, - /// Fabric Manager mode: 0=bare metal, 1=servicevm - pub fabric_mode: Option, - /// Fabric Manager rail policy: "greedy" (default) or "symmetric" - pub rail_policy: Option, + /// Port GUID for NVL5+ systems (0x-prefixed hex string) pub port_guid: Option, /// Tracked background daemons for health monitoring @@ -66,7 +61,6 @@ mod tests { #[test] fn test_default() { let nvrc = NVRC::default(); - assert!(nvrc.mode.is_none()); assert!(nvrc.nvidia_smi_srs.is_none()); assert!(nvrc.nvidia_smi_lgc.is_none()); assert!(nvrc.children.is_empty()); From f0a9c76751bcab738cddc211e549f30fef1f0a0c Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 25 Feb 2026 22:28:51 +0000 Subject: [PATCH 2/3] mount: remove /dev mount, kernel does it Signed-off-by: Zvonko Kaiser --- src/daemon.rs | 5 ++++- src/main.rs | 4 ++-- src/mount.rs | 23 ++++++++++------------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/daemon.rs b/src/daemon.rs index 51d0617..4c43606 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -128,7 +128,10 @@ impl NVRC { /// PARTITION_RAIL_POLICY: "greedy" (NVL4) or "symmetric" (NVL5, required for CC on Blackwell) fn configure_fabricmanager(&self, cfg_path: &str, fabric_mode: u8, rail_policy: &str) { let fm = &fabric_mode.to_string(); - let updates = &[("FABRIC_MODE", fm.as_str()), ("PARTITION_RAIL_POLICY", rail_policy)]; + let updates = &[ + ("FABRIC_MODE", fm.as_str()), + ("PARTITION_RAIL_POLICY", rail_policy), + ]; update_config_file(cfg_path, updates); } } diff --git a/src/main.rs b/src/main.rs index 1f38820..be092db 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,9 +27,9 @@ mod test_utils; extern crate log; extern crate kernlog; +use daemon::FABRIC_MODE_FULL; +use daemon::FABRIC_MODE_SHARED; use kata_agent::SYSLOG_POLL_FOREVER as POLL_FOREVER; -use daemon::FABRIC_MODE_FULL as FABRIC_MODE_FULL; -use daemon::FABRIC_MODE_SHARED as FABRIC_MODE_SHARED; use nvrc::NVRC; use toolkit::nvidia_ctk_cdi; diff --git a/src/mount.rs b/src/mount.rs index 858c83a..7c805ec 100644 --- a/src/mount.rs +++ b/src/mount.rs @@ -50,18 +50,6 @@ fn setup_at(root: &str) { let common = MsFlags::MS_NOSUID | MsFlags::MS_NOEXEC | MsFlags::MS_NODEV | MsFlags::MS_RELATIME; mount("proc", &format!("{root}/proc"), "proc", common, None); - - // devtmpfs automatically creates /dev/null, /dev/zero, /dev/random, /dev/urandom - // Symlinks (/dev/stdin, /dev/stdout, /dev/stderr, /dev/fd, /dev/core) are created by kata-agent - let dev_flags = MsFlags::MS_NOSUID | MsFlags::MS_NOEXEC | MsFlags::MS_RELATIME; - mount( - "dev", - &format!("{root}/dev"), - "devtmpfs", - dev_flags, - Some("mode=0755"), - ); - mount("sysfs", &format!("{root}/sys"), "sysfs", common, None); mount( "run", @@ -239,9 +227,18 @@ mod tests { fs::create_dir_all(format!("{root}/{dir}")).unwrap(); } + // Kernel mounts devtmpfs on /dev before init runs; simulate that here + mount( + "devtmpfs", + &format!("{root}/dev"), + "devtmpfs", + MsFlags::MS_NOSUID | MsFlags::MS_RELATIME, + None, + ); + setup_at(root); - // devtmpfs auto-creates device nodes to avoid manual mknod calls + // devtmpfs auto-creates device nodes assert!(Path::new(&format!("{root}/dev/null")).exists()); assert!(Path::new(&format!("{root}/dev/zero")).exists()); assert!(Path::new(&format!("{root}/dev/random")).exists()); From 3b4f6b896be69907e339fba7d2059fdfcb7ec691 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 26 Feb 2026 21:55:09 +0000 Subject: [PATCH 3/3] mount: remove readonly remount, Kata/CoCo runs a read-only image CoCo/Kata runs on a read-only root image so the readonly("/") remount after init is redundant. Remove mount::readonly() and its call from main. We used the readonly for initrd which will be deprecated for production. Signed-off-by: Zvonko Kaiser --- src/main.rs | 1 - src/mount.rs | 19 ------------------- 2 files changed, 20 deletions(-) diff --git a/src/main.rs b/src/main.rs index be092db..64e5dc6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -110,7 +110,6 @@ fn main() { unknown => panic!("unknown mode: {unknown}"), } - mount::readonly("/"); lockdown::disable_modules_loading(); kata_agent::fork_agent(POLL_FOREVER); } diff --git a/src/mount.rs b/src/mount.rs index 7c805ec..a8e0da0 100644 --- a/src/mount.rs +++ b/src/mount.rs @@ -14,15 +14,6 @@ fn mount(source: &str, target: &str, fstype: &str, flags: MsFlags, data: Option< .or_panic(format_args!("mount {source} on {target}")); } -/// Remount a filesystem as read-only. -/// Security hardening: prevents writes to the root filesystem after init, -/// reducing attack surface in the confidential VM. -pub fn readonly(target: &str) { - let flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_RDONLY | MsFlags::MS_REMOUNT; - nix::mount::mount(None::<&str>, target, None::<&str>, flags, None::<&str>) - .or_panic(format_args!("remount {target} readonly")); -} - /// Check if a filesystem type is available in the kernel. fn fs_available(filesystems: &str, fstype: &str) -> bool { filesystems.lines().any(|line| line.contains(fstype)) @@ -193,16 +184,6 @@ mod tests { assert!(result.is_err()); } - #[test] - fn test_readonly_fails_nonexistent() { - use std::panic; - - let result = panic::catch_unwind(|| { - readonly("/nonexistent/path"); - }); - assert!(result.is_err()); - } - // === setup_at() tests with temp directory === #[test]