Skip to content

Commit 91cc902

Browse files
author
Michail Resvanis
committed
Enable Fabric Manager (FM) in sandbox device plugin when FM shared-nvswitch mode is configured
Signed-off-by: Michail Resvanis <mresvani@redhat.com>
1 parent d28606a commit 91cc902

3 files changed

Lines changed: 155 additions & 0 deletions

File tree

controllers/object_controls.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1647,6 +1647,31 @@ func TransformSandboxDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPo
16471647
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
16481648
}
16491649
}
1650+
1651+
// Set ENABLE_FABRIC_MANAGER environment variable if shared-nvswitch mode is configured
1652+
if config.FabricManager.IsSharedNVSwitchMode() {
1653+
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "ENABLE_FABRIC_MANAGER", "true")
1654+
1655+
// Add fabric manager volume mount to the container
1656+
fabricManagerVolMount := corev1.VolumeMount{
1657+
Name: "run-nvidia-fabricmanager",
1658+
MountPath: "/run/nvidia-fabricmanager",
1659+
}
1660+
obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, fabricManagerVolMount)
1661+
1662+
// Add fabric manager volume to the pod spec
1663+
fabricManagerVol := corev1.Volume{
1664+
Name: "run-nvidia-fabricmanager",
1665+
VolumeSource: corev1.VolumeSource{
1666+
HostPath: &corev1.HostPathVolumeSource{
1667+
Path: "/run/nvidia-fabricmanager",
1668+
Type: ptr.To(corev1.HostPathDirectoryOrCreate),
1669+
},
1670+
},
1671+
}
1672+
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, fabricManagerVol)
1673+
}
1674+
16501675
return nil
16511676
}
16521677

controllers/object_controls_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,8 @@ func getSandboxDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy {
882882
switch testCase {
883883
case "default":
884884
// Do nothing
885+
case "fabric-manager-shared-nvswitch":
886+
cp.Spec.FabricManager.Mode = gpuv1.FabricModeSharedNVSwitch
885887
default:
886888
return nil
887889
}
@@ -897,11 +899,16 @@ func getSandboxDevicePluginTestOutput(testCase string) map[string]interface{} {
897899
"numDaemonsets": 1,
898900
"image": "nvcr.io/nvidia/kubevirt-device-plugin:v1.1.0",
899901
"imagePullSecret": "ngc-secret",
902+
"env": map[string]string{},
900903
}
901904

902905
switch testCase {
903906
case "default":
904907
// Do nothing
908+
case "fabric-manager-shared-nvswitch":
909+
output["env"] = map[string]string{
910+
"ENABLE_FABRIC_MANAGER": "true",
911+
}
905912
default:
906913
return nil
907914
}
@@ -922,6 +929,11 @@ func TestSandboxDevicePlugin(t *testing.T) {
922929
getSandboxDevicePluginTestInput("default"),
923930
getSandboxDevicePluginTestOutput("default"),
924931
},
932+
{
933+
"FabricManagerSharedNVSwitch",
934+
getSandboxDevicePluginTestInput("fabric-manager-shared-nvswitch"),
935+
getSandboxDevicePluginTestOutput("fabric-manager-shared-nvswitch"),
936+
},
925937
}
926938

927939
for _, tc := range testCases {
@@ -935,14 +947,26 @@ func TestSandboxDevicePlugin(t *testing.T) {
935947
}
936948

937949
image := ""
950+
containerEnv := make(map[string]string)
938951
for _, container := range ds.Spec.Template.Spec.Containers {
939952
if strings.Contains(container.Name, "nvidia-sandbox-device-plugin-ctr") {
940953
image = container.Image
954+
for _, env := range container.Env {
955+
containerEnv[env.Name] = env.Value
956+
}
941957
continue
942958
}
943959
}
944960

945961
require.Equal(t, tc.output["image"], image, "Unexpected configuration for nvidia-sandbox-device-plugin-ctr image")
962+
963+
// Check environment variables
964+
expectedEnv := tc.output["env"].(map[string]string)
965+
for envName, expectedValue := range expectedEnv {
966+
actualValue, found := containerEnv[envName]
967+
require.True(t, found, "Expected environment variable %s not found", envName)
968+
require.Equal(t, expectedValue, actualValue, "Unexpected value for environment variable %s", envName)
969+
}
946970

947971
// cleanup by deleting all kubernetes objects
948972
err = removeState(&clusterPolicyController, clusterPolicyController.idx-1)

controllers/transforms_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3665,3 +3665,109 @@ func TestTransformDriverVGPUTopologyConfig(t *testing.T) {
36653665
require.NoError(t, err)
36663666
require.EqualValues(t, expectedDs, ds)
36673667
}
3668+
3669+
// TestTransformSandboxDevicePlugin is a table-driven test for TransformSandboxDevicePlugin.
// Each case passes a DaemonSet and a ClusterPolicySpec to the transform and then compares
// the mutated DaemonSet against expectedDs with require.EqualValues.
//
// Two cases are covered:
//   - FabricManager.Mode set to gpuv1.FabricModeSharedNVSwitch: the expected DaemonSet
//     carries the ENABLE_FABRIC_MANAGER=true env var, the "run-nvidia-fabricmanager"
//     volume mount, and the matching hostPath volume.
//   - FabricManager left unset: the expected DaemonSet carries none of the above.
func TestTransformSandboxDevicePlugin(t *testing.T) {
3670+
initMockK8sClients()
3671+
testCases := []struct {
3672+
description string
3673+
ds Daemonset
3674+
cpSpec *gpuv1.ClusterPolicySpec
3675+
expectedDs Daemonset
3676+
errorExpected bool
3677+
}{
3678+
{
3679+
// Case: shared-nvswitch mode is set, so fabric manager wiring is expected.
description: "sandbox device plugin with fabric manager shared-nvswitch mode",
3680+
ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-sandbox-device-plugin-ctr"}).
3681+
WithInitContainer(corev1.Container{Name: "toolkit-validation"}),
3682+
cpSpec: &gpuv1.ClusterPolicySpec{
3683+
SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{
3684+
Repository: "nvcr.io/nvidia",
3685+
Image: "kubevirt-device-plugin",
3686+
Version: "v1.2.0",
3687+
},
3688+
FabricManager: gpuv1.FabricManagerSpec{
3689+
Mode: gpuv1.FabricModeSharedNVSwitch,
3690+
},
3691+
Validator: gpuv1.ValidatorSpec{
3692+
Repository: "nvcr.io/nvidia/cloud-native",
3693+
Image: "gpu-operator-validator",
3694+
Version: "v1.0.0",
3695+
},
3696+
},
3697+
// Expected: main container has the env var and the volume mount; the pod has the hostPath volume.
expectedDs: NewDaemonset().WithContainer(corev1.Container{
3698+
Name: "nvidia-sandbox-device-plugin-ctr",
3699+
Image: "nvcr.io/nvidia/kubevirt-device-plugin:v1.2.0",
3700+
ImagePullPolicy: corev1.PullIfNotPresent,
3701+
Env: []corev1.EnvVar{
3702+
{
3703+
Name: "ENABLE_FABRIC_MANAGER",
3704+
Value: "true",
3705+
},
3706+
},
3707+
VolumeMounts: []corev1.VolumeMount{
3708+
{
3709+
Name: "run-nvidia-fabricmanager",
3710+
MountPath: "/run/nvidia-fabricmanager",
3711+
},
3712+
},
3713+
}).WithInitContainer(corev1.Container{
3714+
Name: "toolkit-validation",
3715+
Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
3716+
SecurityContext: &corev1.SecurityContext{
3717+
RunAsUser: rootUID,
3718+
},
3719+
}).WithVolume(corev1.Volume{
3720+
Name: "run-nvidia-fabricmanager",
3721+
VolumeSource: corev1.VolumeSource{
3722+
HostPath: &corev1.HostPathVolumeSource{
3723+
Path: "/run/nvidia-fabricmanager",
3724+
Type: ptr.To(corev1.HostPathDirectoryOrCreate),
3725+
},
3726+
},
3727+
}),
3728+
errorExpected: false,
3729+
},
3730+
{
3731+
// Case: FabricManager is left at its zero value, so no fabric manager env var, mount, or volume is expected.
description: "sandbox device plugin without fabric manager shared-nvswitch mode",
3732+
ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-sandbox-device-plugin-ctr"}).
3733+
WithInitContainer(corev1.Container{Name: "toolkit-validation"}),
3734+
cpSpec: &gpuv1.ClusterPolicySpec{
3735+
SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{
3736+
Repository: "nvcr.io/nvidia",
3737+
Image: "kubevirt-device-plugin",
3738+
Version: "v1.2.0",
3739+
},
3740+
Validator: gpuv1.ValidatorSpec{
3741+
Repository: "nvcr.io/nvidia/cloud-native",
3742+
Image: "gpu-operator-validator",
3743+
Version: "v1.0.0",
3744+
},
3745+
},
3746+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
3747+
Name: "nvidia-sandbox-device-plugin-ctr",
3748+
Image: "nvcr.io/nvidia/kubevirt-device-plugin:v1.2.0",
3749+
ImagePullPolicy: corev1.PullIfNotPresent,
3750+
}).WithInitContainer(corev1.Container{
3751+
Name: "toolkit-validation",
3752+
Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
3753+
SecurityContext: &corev1.SecurityContext{
3754+
RunAsUser: rootUID,
3755+
},
3756+
}),
3757+
errorExpected: false,
3758+
},
3759+
}
3760+
3761+
for _, tc := range testCases {
3762+
t.Run(tc.description, func(t *testing.T) {
3763+
// The transform mutates tc.ds.DaemonSet in place; tc.ds is compared against expectedDs below.
err := TransformSandboxDevicePlugin(tc.ds.DaemonSet, tc.cpSpec,
3764+
ClusterPolicyController{operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test")})
3765+
// Neither case in the table above sets errorExpected, so this branch is not exercised today.
if tc.errorExpected {
3766+
require.Error(t, err)
3767+
return
3768+
}
3769+
require.NoError(t, err)
3770+
require.EqualValues(t, tc.expectedDs, tc.ds)
3771+
})
3772+
}
3773+
}

0 commit comments

Comments
 (0)