feat(cli): support explicit CDI device names via --gpu

elezar · elezar · commit 85b370d7510c · 2026-03-20T09:05:55.000+01:00
Explicit CDI device IDs can now be passed:

  --gpu=nvidia.com/gpu=all        single CDI device
  --gpu=nvidia.com/gpu=0 --gpu=nvidia.com/gpu=1  multiple CDI devices

parse_gpu_flag validates the input: it rejects mixing legacy/auto with
CDI device names, and rejects specifying legacy or auto more than once.
CDI device names themselves may be repeated.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md
@@ -324,6 +324,9 @@ The `--gpu` flag on `gateway start` accepts an optional value that overrides the
 |---|---|
 | `--gpu` | Auto-select: CDI on Docker >= 28.2.0, `--gpus all` otherwise |
 | `--gpu=legacy` | Force `--gpus all` |
+| `--gpu=<cdi-device>` | Inject a specific CDI device (e.g. `nvidia.com/gpu=all`). May be repeated for multiple devices. Note: because the cluster container runs privileged, device-level isolation may not work as expected. |
+
+Mixing `legacy` or auto-select with explicit CDI device names in the same invocation is an error.
 
 The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` with `runtimeClassName: nvidia` and running `nvidia-smi`.
 
diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs
@@ -790,10 +790,19 @@ enum GatewayCommands {
         ///
         /// An optional argument controls the injection mode:
         ///
-        ///   --gpu            Auto-select: CDI on Docker >= 28.2.0, legacy otherwise
-        ///   --gpu=legacy     Force legacy nvidia DeviceRequest
-        #[arg(long = "gpu", num_args = 0..=1, default_missing_value = "auto", value_name = "MODE")]
-        gpu: Option<String>,
+        ///   --gpu              Auto-select: CDI on Docker >= 28.2.0, legacy otherwise
+        ///   --gpu=legacy       Force legacy nvidia DeviceRequest (specify once only)
+        ///   --gpu=<cdi-id>     Use explicit CDI device name (repeatable)
+        ///
+        /// Example CDI device names: `nvidia.com/gpu=all`, `nvidia.com/gpu=0`
+        #[arg(
+            long = "gpu",
+            num_args = 0..=1,
+            default_missing_value = "auto",
+            action = clap::ArgAction::Append,
+            value_name = "MODE",
+        )]
+        gpu: Vec<String>,
     },
 
     /// Stop the gateway (preserves state).
@@ -1408,6 +1417,29 @@ enum ForwardCommands {
     List,
 }
 
+/// Validate and normalise the raw values collected from `--gpu`.
+///
+/// | Input             | Output                          |
+/// |-------------------|---------------------------------|
+/// | `[]`              | `[]`  — no GPU                  |
+/// | `["auto"]`        | `["auto"]`  — resolve at deploy |
+/// | `["legacy"]`      | `["legacy"]`                    |
+/// | `[cdi-ids…]`      | `[cdi-ids…]`                    |
+///
+/// # Errors
+///
+/// Returns an error when:
+///
+/// - `legacy` or `auto` is mixed with other values, or appears more than once;
+/// - any other value is not shaped like a fully-qualified CDI device name
+///   (`<vendor>/<class>=<device>`), e.g. a misspelt `legacy` or an empty
+///   `--gpu=`.
+fn parse_gpu_flag(values: &[String]) -> Result<Vec<String>> {
+    /// The two reserved mode keywords that are not CDI device names.
+    fn is_mode(value: &str) -> bool {
+        matches!(value, "auto" | "legacy")
+    }
+
+    match values {
+        [] => Ok(vec![]),
+        [v] if is_mode(v) => Ok(values.to_vec()),
+        // More than one value and at least one of them is a mode keyword.
+        ids if ids.iter().any(|v| is_mode(v)) => Err(miette::miette!(
+            "--gpu=legacy and --gpu=auto can only be specified once \
+             and cannot be mixed with CDI device names"
+        )),
+        // Everything left must look like a CDI device name; otherwise a typo
+        // would be forwarded as a device request and only fail much later.
+        ids => match ids.iter().find(|v| !is_qualified_cdi_name(v)) {
+            Some(bad) => Err(miette::miette!(
+                "unknown --gpu value: {bad:?}; expected `legacy` or a CDI device \
+                 name of the form `<vendor>/<class>=<device>` (e.g. `nvidia.com/gpu=all`)"
+            )),
+            None => Ok(ids.to_vec()),
+        },
+    }
+}
+
+/// Return `true` when `value` has the shape `<vendor>/<class>=<device>` with
+/// no empty component.
+///
+/// This is a shape check only; whether the device exists is not verified here.
+fn is_qualified_cdi_name(value: &str) -> bool {
+    match value.split_once('=') {
+        Some((kind, device)) if !device.is_empty() => matches!(
+            kind.split_once('/'),
+            Some((vendor, class)) if !vendor.is_empty() && !class.is_empty()
+        ),
+        _ => false,
+    }
+}
+
 #[tokio::main]
 async fn main() -> Result<()> {
     // Install the rustls crypto provider before completion runs — completers may
@@ -1456,16 +1488,7 @@ async fn main() -> Result<()> {
                 registry_token,
                 gpu,
             } => {
-                let gpu = match gpu.as_deref() {
-                    None => vec![],
-                    Some("auto") => vec!["auto".to_string()],
-                    Some("legacy") => vec!["legacy".to_string()],
-                    Some(other) => {
-                        return Err(miette::miette!(
-                            "unknown --gpu value: {other:?}; expected `legacy`"
-                        ));
-                    }
-                };
+                let gpu = parse_gpu_flag(&gpu)?;
                 run::gateway_admin_deploy(
                     &name,
                     remote.as_deref(),
@@ -2818,4 +2841,55 @@ mod tests {
             other => panic!("expected SshProxy, got: {other:?}"),
         }
     }
+
+    // --- parse_gpu_flag ---
+
+    #[test]
+    fn parse_gpu_empty_returns_empty() {
+        // No values collected from `--gpu`: the normalised list is empty too.
+        let parsed = parse_gpu_flag(&[]).unwrap();
+        assert!(parsed.is_empty(), "expected no GPU values, got: {parsed:?}");
+    }
+
+    #[test]
+    fn parse_gpu_auto_accepted() {
+        // A lone "auto" is passed through unchanged.
+        let input = ["auto".to_string()];
+        let parsed = parse_gpu_flag(&input).unwrap();
+        assert_eq!(parsed, input);
+    }
+
+    #[test]
+    fn parse_gpu_legacy_accepted() {
+        // A lone "legacy" is passed through unchanged.
+        let input = ["legacy".to_string()];
+        let parsed = parse_gpu_flag(&input).unwrap();
+        assert_eq!(parsed, input);
+    }
+
+    #[test]
+    fn parse_gpu_cdi_device_ids_accepted() {
+        // One CDI device name is returned as-is.
+        let single = ["nvidia.com/gpu=all".to_string()];
+        assert_eq!(parse_gpu_flag(&single).unwrap(), single);
+
+        // Several CDI device names are returned as-is, in the order given.
+        let several = ["nvidia.com/gpu=0".to_string(), "nvidia.com/gpu=1".to_string()];
+        assert_eq!(parse_gpu_flag(&several).unwrap(), several);
+    }
+
+    #[test]
+    fn parse_gpu_legacy_mixed_with_cdi_is_error() {
+        // "legacy" alongside a CDI device name must be rejected.
+        let mixed = ["legacy".to_string(), "nvidia.com/gpu=all".to_string()];
+        let outcome = parse_gpu_flag(&mixed);
+        assert!(outcome.is_err());
+    }
+
+    #[test]
+    fn parse_gpu_auto_mixed_with_cdi_is_error() {
+        // "auto" alongside a CDI device name must be rejected.
+        let mixed = ["auto".to_string(), "nvidia.com/gpu=all".to_string()];
+        let outcome = parse_gpu_flag(&mixed);
+        assert!(outcome.is_err());
+    }
+
+    #[test]
+    fn parse_gpu_double_legacy_is_error() {
+        // "legacy" given twice must be rejected.
+        let repeated = ["legacy".to_string(), "legacy".to_string()];
+        let outcome = parse_gpu_flag(&repeated);
+        assert!(outcome.is_err());
+    }
 }
diff --git a/docs/sandboxes/manage-gateways.md b/docs/sandboxes/manage-gateways.md
@@ -168,7 +168,7 @@ $ openshell gateway info --name my-remote-cluster
 
 | Flag | Purpose |
 |---|---|
-| `--gpu` | Enable NVIDIA GPU passthrough. Requires NVIDIA drivers and the Container Toolkit on the host. Accepts an optional value: omit for auto-select (CDI on Docker >= 28.2.0, `--gpus all` otherwise), or `--gpu=legacy` to force `--gpus all`. |
+| `--gpu` | Enable NVIDIA GPU passthrough. Requires NVIDIA drivers and the Container Toolkit on the host. Accepts an optional value: omit for auto-select (CDI on Docker >= 28.2.0, `--gpus all` otherwise), `--gpu=legacy` to force `--gpus all`, or `--gpu=<cdi-device>` to inject a specific CDI device (e.g. `nvidia.com/gpu=all`). May be repeated for multiple CDI devices. |
 | `--plaintext` | Listen on HTTP instead of mTLS. Use behind a TLS-terminating reverse proxy. |
 | `--disable-gateway-auth` | Skip mTLS client certificate checks. Use when a reverse proxy cannot forward client certs. |
 | `--registry-username` | Username for registry authentication. Defaults to `__token__` when `--registry-token` is set. Only needed for private registries. Also configurable with `OPENSHELL_REGISTRY_USERNAME`. |