Skip to content

Commit fe5e4f8

Browse files
committed
nvswitch: Add fm modes
Replace boolean nvrc.fabricmanager with nvrc.fm.mode (0=bare metal, 1=servicevm, 2=vgpu) to properly configure FABRIC_MODE and FABRIC_MODE_RESTART in fabricmanager.cfg. Add nvrc.fm.rail.policy (greedy|symmetric) for PARTITION_RAIL_POLICY. Symmetric policy required for Confidential Computing on Blackwell to ensure memory isolation boundaries during attestation. Introduce generic update_config_file() helper in src/config.rs to eliminate repetitive KEY=VALUE file manipulation and enable easy addition of future configuration parameters. Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
1 parent 570dbc5 commit fe5e4f8

7 files changed

Lines changed: 499 additions & 58 deletions

File tree

README.md

Lines changed: 35 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,36 +21,36 @@ recovery mechanisms—if GPU initialization fails, the VM powers off. This
2121
## Architecture
2222

2323
```text
24-
┌────────────────────────────────────────────────────────────────
25-
│ NVRC (PID 1)
26-
27-
│ 1. Set panic hook (power off VM on panic)
28-
│ 2. Mount filesystems (/proc, /dev, /sys, /dev/shm)
29-
│ 3. Initialize kernel message logging
30-
│ 4. Start syslog daemon
31-
│ 5. Remount / as read-only (security hardening)
32-
│ 6. Parse kernel parameters (/proc/cmdline)
33-
34-
│ ┌─────────────────────────────────────────────────────────────────────────────┐
35-
│ │ Mode Selection (nvrc.mode)
36-
│ │ ┌────────────────┐ ┌─────────────┐ ┌─────────────┌──────────────┐ │
37-
│ │ │ GPU (default) │ │ CPU Mode │ │ NVSwitch-NVL4│ │ NVSwitch-NVL5│ │
38-
│ │ │ • nvidia.ko │ │ • Skip GPU │ │ (H100/H200) │ │ (B200/B300)
39-
│ │ │ • nvidia-uvm │ │ • Jump to │ │ • nvidia.ko │ │ • ib_umad │ │
40-
│ │ │ • Lock clocks │ kata-agent │ │ • fabric-mgr │ │ • fabric-mgr │
41-
│ │ │ • Lock memory │ │ │ │ • Check daemons│• NVLSM auto │ │
42-
│ │ │ • Power limit │ │ │ │ • Jump agent │ │ • Jump agent │ │
43-
│ │ │ • Daemons │ │ │ │ │ │
44-
│ │ │ • CDI spec │ │ │ │ │ │
45-
│ │ │ • SRS config │ │ │ │ │ │
46-
│ │ └────────────────┘ └─────────────┘ └─────────────└──────────────┘ │
47-
│ └─────────────────────────────────────────────────────────────────────────────┘
48-
49-
│ 7. Check daemon health (fail if any crashed)
50-
│ 8. Disable kernel module loading (lockdown)
51-
│ 9. Fork kata-agent (handoff control)
52-
│ 10. Poll syslog forever (keep PID 1 alive)
53-
└────────────────────────────────────────────────────────────────
24+
┌────────────────────────────────────────────────────────────────┐
25+
│ NVRC (PID 1) │
26+
│ │
27+
│ 1. Set panic hook (power off VM on panic) │
28+
│ 2. Mount filesystems (/proc, /dev, /sys, /dev/shm) │
29+
│ 3. Initialize kernel message logging │
30+
│ 4. Start syslog daemon │
31+
│ 5. Remount / as read-only (security hardening) │
32+
│ 6. Parse kernel parameters (/proc/cmdline) │
33+
│ │
34+
│ ┌──────────────────────────────────────────────────────────
35+
│ │ Mode Selection (nvrc.mode)
36+
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
37+
│ │ │GPU (default)│ │ CPU Mode │ │NVSwitch-NVL4│ ... │
38+
│ │ │• nvidia.ko │ │• Skip GPU │ │(H100/H200) │ │ │
39+
│ │ │• nvidia-uvm │ │• exec agent │ │• nvidia.ko │
40+
│ │ │• Lock clocks • fabric-mgr │
41+
│ │ │• Lock memory│ │ │ │• exec agent
42+
│ │ │• Power limit│ │ │ │ │ │
43+
│ │ │• Daemons │ │ │ │ │
44+
│ │ │• CDI spec │ │ │ │ │
45+
│ │ │• SRS config │ │ │ │ │
46+
│ │ └─────────────┘ └─────────────┘ └─────────────┘
47+
│ └──────────────────────────────────────────────────────────
48+
│ │
49+
│ 7. Check daemon health (fail if any crashed) │
50+
│ 8. Disable kernel module loading (lockdown) │
51+
│ 9. Fork kata-agent (handoff control) │
52+
│ 10. Poll syslog forever (keep PID 1 alive) │
53+
└────────────────────────────────────────────────────────────────┘
5454
```
5555

5656
## Kernel Parameters
@@ -79,9 +79,10 @@ configuration doesn't exist yet.
7979

8080
| Parameter | Values | Default | Description |
8181
| --------------------------- | --------------------------------------- | ------- | ------------------------------------------------------------------------------ |
82-
| `nvrc.uvm.persistence.mode` | `on/off`, `true/false`, `1/0`, `yes/no` | `true` | UVM persistence mode keeps unified memory state across CUDA context teardowns. |
83-
| `nvrc.dcgm` | `on/off`, `true/false`, `1/0`, `yes/no` | `false` | Enable DCGM (Data Center GPU Manager) for telemetry and health monitoring. |
84-
| `nvrc.fabricmanager` | `on/off`, `true/false`, `1/0`, `yes/no` | `false` | Enable Fabric Manager for NVLink/NVSwitch multi-GPU communication. |
82+
| `nvrc.uvm.persistence.mode` | `on/off`, `true/false`, `1/0`, `yes/no` | `true` | UVM persistence mode keeps unified memory state across CUDA context teardowns. |
83+
| `nvrc.dcgm` | `on/off`, `true/false`, `1/0`, `yes/no` | `false` | Enable DCGM (Data Center GPU Manager) for telemetry and health monitoring. |
84+
| `nvrc.fm.mode` | `0`, `1`, `2` | - | Fabric Manager mode: 0=bare metal, 1=servicevm (shared nvswitch), 2=vgpu. Auto-set in nvswitch modes. |
85+
| `nvrc.fm.rail.policy` | `greedy`, `symmetric` | `greedy` | Partition rail policy. Symmetric required for Confidential Computing on Blackwell. |
8586

8687
### Example Configurations
8788

@@ -124,7 +125,7 @@ nvrc.mode=gpu nvrc.dcgm=on nvrc.log=info
124125
**Multi-GPU with NVLink:**
125126

126127
```text
127-
nvrc.mode=gpu nvrc.fabricmanager=on nvrc.log=debug
128+
nvrc.mode=gpu nvrc.fm.mode=0 nvrc.log=debug
128129
```
129130

130131
## Build

src/config.rs

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// Copyright (c) NVIDIA CORPORATION
3+
4+
//! Generic KEY=VALUE configuration file utilities.
5+
6+
use crate::macros::ResultExt;
7+
use log::debug;
8+
use std::collections::HashSet;
9+
use std::fs;
10+
11+
/// Updates KEY=VALUE pairs in a config file, adding them if missing.
12+
/// Existing keys are updated in place, new keys are appended to the end.
13+
pub fn update_config_file(path: &str, updates: &[(&str, &str)]) {
14+
let content = fs::read_to_string(path).or_panic(format_args!("read {path}"));
15+
16+
let mut lines: Vec<String> = content.lines().map(String::from).collect();
17+
let mut found_keys: HashSet<&str> = HashSet::new();
18+
19+
// Update existing lines
20+
for line in &mut lines {
21+
let trimmed = line.trim();
22+
for (key, value) in updates {
23+
if trimmed.starts_with(&format!("{}=", key)) {
24+
*line = format!("{}={}", key, value);
25+
found_keys.insert(key);
26+
debug!("{}: {}={}", path, key, value);
27+
break;
28+
}
29+
}
30+
}
31+
32+
// Add missing keys
33+
for (key, value) in updates {
34+
if !found_keys.contains(key) {
35+
lines.push(format!("{}={}", key, value));
36+
debug!("{}: {}={}", path, key, value);
37+
}
38+
}
39+
40+
let updated = lines.join("\n") + "\n";
41+
fs::write(path, updated).or_panic(format_args!("write {path}"));
42+
}
43+
44+
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::NamedTempFile;

    /// Writes `initial` to a fresh temp file, applies `updates` with
    /// `update_config_file`, and returns the resulting file content.
    fn apply(initial: &str, updates: &[(&str, &str)]) -> String {
        let tmpfile = NamedTempFile::new().unwrap();
        let path = tmpfile.path().to_str().unwrap();

        fs::write(path, initial).unwrap();
        update_config_file(path, updates);

        fs::read_to_string(path).unwrap()
    }

    #[test]
    fn test_update_config_file_add_new_keys() {
        // Start with empty file: both keys are appended in the order given.
        let content = apply("", &[("KEY1", "value1"), ("KEY2", "value2")]);
        assert_eq!(content, "KEY1=value1\nKEY2=value2\n");
    }

    #[test]
    fn test_update_config_file_update_existing_keys() {
        let content = apply(
            "KEY1=oldvalue\nKEY2=oldvalue\n",
            &[("KEY1", "newvalue"), ("KEY2", "newvalue")],
        );
        assert_eq!(content, "KEY1=newvalue\nKEY2=newvalue\n");
    }

    #[test]
    fn test_update_config_file_mixed_update_and_add() {
        // One key already exists, the other has to be appended.
        let content = apply("KEY1=oldvalue\n", &[("KEY1", "updated"), ("KEY2", "new")]);
        assert_eq!(content, "KEY1=updated\nKEY2=new\n");
    }

    #[test]
    fn test_update_config_file_preserves_other_lines() {
        let content = apply("# Comment\nKEY1=old\nOTHER=unchanged\n", &[("KEY1", "new")]);
        assert_eq!(content, "# Comment\nKEY1=new\nOTHER=unchanged\n");
    }

    #[test]
    fn test_update_config_file_with_spaces() {
        // Surrounding whitespace must not cause a duplicate entry to be appended.
        let content = apply("  KEY1=old  \n", &[("KEY1", "new")]);
        assert_eq!(content, "KEY1=new\n");
    }

    #[test]
    fn test_update_config_file_empty_value() {
        let content = apply("", &[("KEY1", "")]);
        assert_eq!(content, "KEY1=\n");
    }

    #[test]
    fn test_update_config_file_multiple_updates_same_key() {
        let tmpfile = NamedTempFile::new().unwrap();
        let path = tmpfile.path().to_str().unwrap();

        fs::write(path, "KEY1=old\n").unwrap();

        // Update twice: the second call must overwrite the first, not append.
        update_config_file(path, &[("KEY1", "first")]);
        update_config_file(path, &[("KEY1", "second")]);

        let content = fs::read_to_string(path).unwrap();
        assert_eq!(content, "KEY1=second\n");
    }

    #[test]
    fn test_update_config_file_similar_key_names() {
        // Test that FABRIC_MODE_RESTART doesn't match FABRIC_MODE
        let content = apply(
            "FABRIC_MODE=0\nFABRIC_MODE_RESTART=0\n",
            &[("FABRIC_MODE", "1")],
        );
        assert_eq!(content, "FABRIC_MODE=1\nFABRIC_MODE_RESTART=0\n");

        // ...and that FABRIC_MODE doesn't match FABRIC_MODE_RESTART
        let content = apply(
            "FABRIC_MODE=0\nFABRIC_MODE_RESTART=0\n",
            &[("FABRIC_MODE_RESTART", "1")],
        );
        assert_eq!(content, "FABRIC_MODE=0\nFABRIC_MODE_RESTART=1\n");
    }

    #[test]
    #[should_panic(expected = "read")]
    fn test_update_config_file_nonexistent_file() {
        update_config_file("/nonexistent/path/file.cfg", &[("KEY", "value")]);
    }
}

0 commit comments

Comments
 (0)