Skip to content

Commit 2361ef5

Browse files
astefanutti, openshift-merge-robot
authored and committed
test: Add MNIST training with MCAD Job
1 parent ff609f9 commit 2361ef5

File tree

4 files changed

+269
-1
lines changed

4 files changed

+269
-1
lines changed
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
/*
2+
Copyright 2023.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package e2e
18+
19+
import (
20+
"testing"
21+
22+
. "github.com/onsi/gomega"
23+
24+
batchv1 "k8s.io/api/batch/v1"
25+
corev1 "k8s.io/api/core/v1"
26+
"k8s.io/apimachinery/pkg/api/resource"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
29+
. "github.com/project-codeflare/codeflare-operator/test/support"
30+
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
31+
)
32+
33+
func TestMNISTPyTorchMCAD(t *testing.T) {
34+
test := With(t)
35+
test.T().Parallel()
36+
37+
// Create a namespace
38+
namespace := test.NewTestNamespace()
39+
40+
// MNIST training script
41+
mnist, err := scripts.ReadFile("mnist.py")
42+
test.Expect(err).NotTo(HaveOccurred())
43+
44+
mnistScript := &corev1.ConfigMap{
45+
TypeMeta: metav1.TypeMeta{
46+
APIVersion: corev1.SchemeGroupVersion.String(),
47+
Kind: "ConfigMap",
48+
},
49+
ObjectMeta: metav1.ObjectMeta{
50+
Name: "mnist",
51+
Namespace: namespace.Name,
52+
},
53+
BinaryData: map[string][]byte{
54+
"mnist.py": mnist,
55+
},
56+
Immutable: Ptr(true),
57+
}
58+
mnistScript, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnistScript, metav1.CreateOptions{})
59+
test.Expect(err).NotTo(HaveOccurred())
60+
61+
// pip requirements
62+
requirements := &corev1.ConfigMap{
63+
TypeMeta: metav1.TypeMeta{
64+
APIVersion: corev1.SchemeGroupVersion.String(),
65+
Kind: "ConfigMap",
66+
},
67+
ObjectMeta: metav1.ObjectMeta{
68+
Name: "requirements",
69+
Namespace: namespace.Name,
70+
},
71+
BinaryData: map[string][]byte{
72+
"requirements.txt": []byte(`
73+
pytorch_lightning==1.5.10
74+
torchmetrics==0.9.1
75+
torchvision==0.12.0
76+
`),
77+
},
78+
Immutable: Ptr(true),
79+
}
80+
requirements, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), requirements, metav1.CreateOptions{})
81+
test.Expect(err).NotTo(HaveOccurred())
82+
83+
// Batch Job
84+
job := &batchv1.Job{
85+
TypeMeta: metav1.TypeMeta{
86+
APIVersion: batchv1.SchemeGroupVersion.String(),
87+
Kind: "Job",
88+
},
89+
ObjectMeta: metav1.ObjectMeta{
90+
Name: "mnist",
91+
Namespace: namespace.Name,
92+
},
93+
Spec: batchv1.JobSpec{
94+
Completions: Ptr(int32(1)),
95+
Parallelism: Ptr(int32(1)),
96+
Template: corev1.PodTemplateSpec{
97+
Spec: corev1.PodSpec{
98+
Containers: []corev1.Container{
99+
{
100+
Name: "job",
101+
Image: "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime",
102+
Command: []string{"/bin/sh", "-c", "pip install -r /test/runtime/requirements.txt && torchrun /test/job/mnist.py"},
103+
VolumeMounts: []corev1.VolumeMount{
104+
{
105+
Name: "mnist",
106+
MountPath: "/test/job",
107+
},
108+
{
109+
Name: "requirements",
110+
MountPath: "/test/runtime",
111+
},
112+
},
113+
},
114+
},
115+
Volumes: []corev1.Volume{
116+
{
117+
Name: "mnist",
118+
VolumeSource: corev1.VolumeSource{
119+
ConfigMap: &corev1.ConfigMapVolumeSource{
120+
LocalObjectReference: corev1.LocalObjectReference{
121+
Name: mnistScript.Name,
122+
},
123+
},
124+
},
125+
},
126+
{
127+
Name: "requirements",
128+
VolumeSource: corev1.VolumeSource{
129+
ConfigMap: &corev1.ConfigMapVolumeSource{
130+
LocalObjectReference: corev1.LocalObjectReference{
131+
Name: requirements.Name,
132+
},
133+
},
134+
},
135+
},
136+
},
137+
RestartPolicy: corev1.RestartPolicyNever,
138+
},
139+
},
140+
},
141+
}
142+
143+
// Create an AppWrapper resource
144+
aw := &mcadv1beta1.AppWrapper{
145+
ObjectMeta: metav1.ObjectMeta{
146+
Name: "mnist",
147+
Namespace: namespace.Name,
148+
},
149+
Spec: mcadv1beta1.AppWrapperSpec{
150+
AggrResources: mcadv1beta1.AppWrapperResourceList{
151+
GenericItems: []mcadv1beta1.AppWrapperGenericResource{
152+
{
153+
DesiredAvailable: 1,
154+
CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{
155+
{
156+
Replicas: 1,
157+
Requests: corev1.ResourceList{
158+
corev1.ResourceCPU: resource.MustParse("250m"),
159+
corev1.ResourceMemory: resource.MustParse("512Mi"),
160+
},
161+
Limits: corev1.ResourceList{
162+
corev1.ResourceCPU: resource.MustParse("500m"),
163+
corev1.ResourceMemory: resource.MustParse("1G"),
164+
},
165+
},
166+
},
167+
GenericTemplate: Raw(test, job),
168+
},
169+
},
170+
},
171+
},
172+
}
173+
174+
_, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw)
175+
test.Expect(err).NotTo(HaveOccurred())
176+
177+
test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium).
178+
Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive)))
179+
180+
test.Eventually(Job(test, namespace, job.Name), TestTimeoutLong).
181+
Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)))
182+
}

test/e2e/mnist_rayjob_mcad_raycluster_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
4444

4545
configMap := &corev1.ConfigMap{
4646
TypeMeta: metav1.TypeMeta{
47-
APIVersion: corev1.GroupName,
47+
APIVersion: corev1.SchemeGroupVersion.String(),
4848
Kind: "ConfigMap",
4949
},
5050
ObjectMeta: metav1.ObjectMeta{

test/support/batch.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
Copyright 2023.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package support
18+
19+
import (
20+
"github.com/onsi/gomega"
21+
22+
batchv1 "k8s.io/api/batch/v1"
23+
corev1 "k8s.io/api/core/v1"
24+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25+
)
26+
27+
func Job(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gomega) *batchv1.Job {
28+
return func(g gomega.Gomega) *batchv1.Job {
29+
job, err := t.Client().Core().BatchV1().Jobs(namespace.Name).Get(t.Ctx(), name, metav1.GetOptions{})
30+
g.Expect(err).NotTo(gomega.HaveOccurred())
31+
return job
32+
}
33+
}

test/support/conditions.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
Licensed to the Apache Software Foundation (ASF) under one or more
3+
contributor license agreements. See the NOTICE file distributed with
4+
this work for additional information regarding copyright ownership.
5+
The ASF licenses this file to You under the Apache License, Version 2.0
6+
(the "License"); you may not use this file except in compliance with
7+
the License. You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
*/
17+
18+
package support
19+
20+
import (
21+
batchv1 "k8s.io/api/batch/v1"
22+
corev1 "k8s.io/api/core/v1"
23+
)
24+
25+
// conditionType constrains a type parameter to any type whose underlying type
// is string.
type conditionType interface{ ~string }
28+
29+
func ConditionStatus[T conditionType](conditionType T) func(any) corev1.ConditionStatus {
30+
return func(object any) corev1.ConditionStatus {
31+
switch o := object.(type) {
32+
33+
case *batchv1.Job:
34+
if c := getJobCondition(o.Status.Conditions, batchv1.JobConditionType(conditionType)); c != nil {
35+
return c.Status
36+
}
37+
38+
}
39+
40+
return corev1.ConditionUnknown
41+
}
42+
}
43+
44+
// TODO: to be replaced with a generic version once common struct fields of a type set can be used.
45+
// See https://github.com/golang/go/issues/48522
46+
func getJobCondition(conditions []batchv1.JobCondition, conditionType batchv1.JobConditionType) *batchv1.JobCondition {
47+
for _, c := range conditions {
48+
if c.Type == conditionType {
49+
return &c
50+
}
51+
}
52+
return nil
53+
}

0 commit comments

Comments
 (0)