Skip to content

Commit 07afd38

Browse files
committed
Set up the new GPU class: create notebooks, configure LocalQueues that point to the ClusterQueues, and enable job observability through a rolebinding
1 parent 77125de commit 07afd38

7 files changed

Lines changed: 505 additions & 3 deletions

File tree

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
repos:
22
- repo: https://github.com/Lucas-C/pre-commit-hooks
3-
rev: v1.5.4
3+
rev: v1.5.5
44
hooks:
55
- id: remove-tabs
66

77
- repo: https://github.com/pre-commit/pre-commit-hooks
8-
rev: v4.5.0
8+
rev: v6.0.0
99
hooks:
1010
- id: trailing-whitespace
1111
- id: check-merge-conflict
@@ -18,7 +18,7 @@ repos:
1818
- id: detect-private-key
1919

2020
- repo: https://github.com/adrienverge/yamllint.git
21-
rev: v1.32.0
21+
rev: v1.37.1
2222
hooks:
2323
- id: yamllint
2424
files: \.(yaml|yml)$

gpu-class/cleanup.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
#
# Delete every Notebook and PersistentVolumeClaim in each project whose name
# matches the class prefix below. All deletes run as system:admin.

pattern="^bu-cs599-pmpp-cuda-"

# Read the project names line by line rather than word-splitting a command
# substitution, so each name is handled exactly as printed by oc.
while IFS= read -r proj; do
  [[ -n "$proj" ]] || continue
  echo "deleting notebook + pvc in $proj"
  # Best effort: a failed delete in one project must not stop the cleanup of
  # the remaining projects.
  oc -n "$proj" delete notebook --as system:admin --all --ignore-not-found --wait=true || true
  oc -n "$proj" delete pvc --as system:admin --all --ignore-not-found --wait=true || true
done < <(
  oc get projects -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
    | grep -- "$pattern"
)

gpu-class/cluster_queue_role.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ClusterRole granting read-only (get/list/watch) access to Kueue
# ClusterQueue objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kueue-clusterqueue-reader
rules:
  - apiGroups:
      - "kueue.x-k8s.io"
    resources:
      - "clusterqueues"
    verbs:
      - "get"
      - "list"
      - "watch"

gpu-class/gpu-class-setup.sh

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/bin/bash
#
# Settings shared by the functions of this script.

# Prefix of the class namespaces; the main loop selects "${CLASS_NAME}-*".
CLASS_NAME="bu-cs599-pmpp-cuda"

# Passed to the notebook template as RUN_NAME and IMAGE_NAME.
run_name="gpu_class_test"
image_name="csw-dev-f25"

# Dashboard projects URL. Everything before "/projects" is kept as the host
# and handed to the notebook template as HUB_HOST.
openshift_url="https://rhods-dashboard-redhat-ods-applications.apps.edu.nerc.mghpcc.org/projects"
host="${openshift_url%/projects*}"
hub_host="$host"

# Command that reads processed template output on stdin and creates it.
create_resource_command=(oc create -f -)
12+
13+
#######################################
# Create a workbench from notebook_resource.yaml for the user of one namespace.
#
# The user is read from the subjects of the namespace's "edit" rolebinding,
# ignoring three hard-coded accounts. Two forms of the name are passed to the
# template: USERNAME (the subject name as stored) and USER (the same name with
# "@" replaced by "-40" and "." by "-2" when it looks like an e-mail address).
#
# Globals (read): run_name, image_name, openshift_url, hub_host,
#                 create_resource_command
# Arguments: $1 - namespace whose rolebinding is inspected
# Outputs:   the notebook name on stdout; output of the create command is
#            redirected to stderr so callers can capture the name with $(...)
# Returns:   1 if the namespace is missing, the rolebinding cannot be read or
#            it does not resolve to exactly one user; otherwise the status of
#            the create pipeline
#######################################
create_wb() {
  local namespace=${1:-}
  local rolebinding_json username user notebook_name
  local status=0
  local -a params

  if [[ -z "$namespace" ]]; then
    echo "create_wb: namespace argument is required" >&2
    return 1
  fi

  # Always query the namespace given as $1 (never a caller-side variable),
  # and fetch the rolebinding once for both jq filters below.
  if ! rolebinding_json=$(oc -n "$namespace" get rolebinding edit -o json); then
    echo "create_wb: cannot read rolebinding edit in $namespace" >&2
    return 1
  fi

  # NOTE(review): the exclusions here compare against the raw subject names,
  # while the filter below compares after encoding; an excluded account stored
  # as an e-mail address would therefore still appear in USERNAME - confirm.
  username=$(jq -r '
    (.subjects // [])
    | map(.name)
    | map(select(. != "jappavoo-40bu-2edu"))
    | map(select(. != "sdanni-40redhat-2com"))
    | map(select(. != "istaplet"))
    | .[]
  ' <<<"$rolebinding_json")

  user=$(jq -r '
    (.subjects // [])
    | map(.name
          | if test("@.*\\..*$")
            then sub("@"; "-40") | gsub("\\.";"-2")
            else .
            end)
    | map(select(. != "jappavoo-40bu-2edu"))
    | map(select(. != "sdanni-40redhat-2com"))
    | map(select(. != "istaplet"))
    | .[]
  ' <<<"$rolebinding_json")

  # The notebook name embeds the user, so it must be exactly one line.
  if [[ -z "$user" || "$user" == *$'\n'* ]]; then
    echo "create_wb: expected exactly one user in rolebinding edit of $namespace, got: ${user:-<none>}" >&2
    return 1
  fi

  # give notebook within namespace a name
  notebook_name=cs599-${user}-wb

  params=(
    -p NOTEBOOK_NAME="$notebook_name"
    -p RUN_NAME="$run_name"
    -p USERNAME="$username"
    -p NAMESPACE="$namespace"
    -p USER="$user"
    -p IMAGE_NAME="$image_name"
    -p OPENSHIFT_URL="$openshift_url"
    -p HUB_HOST="$hub_host"
  )

  oc process -f notebook_resource.yaml --local "${params[@]}" \
    | "${create_resource_command[@]}" --as system:admin 1>&2 || status=$?

  # The name is printed even when creation fails, so a caller can still use it
  # for follow-up steps; the failure is reported through the return status.
  echo "$notebook_name"
  return "$status"
}
61+
62+
#######################################
# Process localqueue.yaml for a namespace and create the resulting objects.
# Globals (read): create_resource_command
# Arguments: $1 - namespace, passed to the template as NAMESPACE
# Outputs:   output of the create command is redirected to stderr
# Returns:   1 if no namespace is given, otherwise the status of the pipeline
#######################################
apply_localqueue() {
  # Keep everything local so the caller's variables are not overwritten.
  local namespace=${1:-}
  local -a local_params

  if [[ -z "$namespace" ]]; then
    echo "apply_localqueue: namespace argument is required" >&2
    return 1
  fi

  local_params=(
    -p NAMESPACE="$namespace"
  )

  oc process -f localqueue.yaml --local "${local_params[@]}" \
    | "${create_resource_command[@]}" --as system:admin 1>&2
}
71+
72+
#######################################
# Process rb.yaml for a namespace/notebook pair and create the result.
# Globals (read): create_resource_command
# Arguments: $1 - namespace, passed to the template as NAMESPACE
#            $2 - notebook name, passed to the template as SERVICE_ACCOUNT_NB
# Outputs:   whatever the create command prints (stdout is not redirected)
# Returns:   1 if either argument is empty, otherwise the pipeline status
#######################################
apply_rolebinding() {
  # Keep everything local so the caller's variables are not overwritten.
  local namespace=${1:-}
  local notebook_name=${2:-}
  local -a rb_params

  if [[ -z "$namespace" || -z "$notebook_name" ]]; then
    echo "apply_rolebinding: namespace and notebook name are required" >&2
    return 1
  fi

  rb_params=(
    -p NAMESPACE="$namespace"
    -p SERVICE_ACCOUNT_NB="$notebook_name"
  )

  oc process -f rb.yaml --local "${rb_params[@]}" \
    | "${create_resource_command[@]}" --as system:admin
}
84+
85+
#######################################
# Apply cluster_queue_role.yaml as system:admin.
# Arguments: none
# Returns:   status of the oc command
#######################################
apply_clusterq() {
  local -a apply_args=(apply -f cluster_queue_role.yaml --as system:admin)
  oc "${apply_args[@]}"
}
89+
90+
# Apply the ClusterQueue reader role once, before any namespace is touched.
apply_clusterq

# Provision every namespace whose name starts with "${CLASS_NAME}-".
oc get ns \
  | awk -v prefix="${CLASS_NAME}-" 'index($1, prefix) == 1 { print $1 }' \
  | while IFS= read -r ns; do
    # Switch the current project first; if that fails, skip the namespace so
    # nothing is created in whichever project happens to be current.
    if ! oc project "$ns"; then
      echo "skipping $ns: cannot switch to project" >&2
      continue
    fi

    # create a workbench and save the name of the notebook to apply rolebindings
    nb_name="$(create_wb "$ns")"
    if [[ -n "$nb_name" ]]; then
      apply_rolebinding "$ns" "$nb_name"
    else
      echo "skipping rolebinding for $ns: no notebook name returned" >&2
    fi
    apply_localqueue "$ns"
  done

gpu-class/localqueue.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# OpenShift Template that creates three Kueue LocalQueues (v100, a100, h100)
# in ${NAMESPACE}. Each LocalQueue points at the ClusterQueue whose name
# carries the same prefix.
apiVersion: template.openshift.io/v1
kind: Template
metadata:
  name: localqueue
parameters:
  # Namespace in which all three LocalQueues are created.
  - name: NAMESPACE
    required: true
objects:
  # v100
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: v100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: v100-clusterqueue
  # a100
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: a100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: a100-clusterqueue
  # h100
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: h100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: h100-clusterqueue

gpu-class/notebook_resource.yaml

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# OpenShift Template that creates one Kubeflow Notebook (a notebook container
# plus an oauth-proxy sidecar) and the PersistentVolumeClaim it mounts at
# /opt/app-root/src. Both objects are named ${NOTEBOOK_NAME}.
apiVersion: template.openshift.io/v1
kind: Template
parameters:
  - name: NOTEBOOK_NAME
    required: true
  - name: RUN_NAME
    required: true
  # NOTE(review): USERNAME is required but not referenced by any object
  # below; the opendatahub.io/username annotation uses ${USER} - confirm
  # which value is intended there.
  - name: USERNAME
    required: true
  - name: IMAGE_NAME
    required: true
  - name: NAMESPACE
    required: true
  # Used as "${OPENSHIFT_URL}/${NAMESPACE}", so pass it without a trailing "/".
  - name: OPENSHIFT_URL
    required: true
  - name: USER
    required: true
  - name: IMAGE_REPO
    required: true
    value: "image-registry.openshift-image-registry.svc:5000/redhat-ods-applications"
  - name: HUB_HOST
    required: true
  - name: PVC_SIZE
    required: true
    value: "20Gi"
  # Optional; substituted into --ServerApp.token (empty when not supplied).
  - name: TOKEN
    required: false
objects:
  - apiVersion: kubeflow.org/v1beta1
    kind: Notebook
    metadata:
      annotations:
        notebooks.opendatahub.io/inject-oauth: 'true'
        notebooks.opendatahub.io/last-image-selection: ${IMAGE_NAME}
        notebooks.opendatahub.io/last-size-selection: Small
        notebooks.opendatahub.io/oauth-logout-url: >-
          ${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
        opendatahub.io/username: ${USER}
        openshift.io/description: ''
        openshift.io/display-name: ${NOTEBOOK_NAME}
        opendatahub.io/image-display-name: ${IMAGE_NAME}
      name: ${NOTEBOOK_NAME}
      labels:
        ope-run: ${RUN_NAME}
        app: ${NOTEBOOK_NAME}
        opendatahub.io/dashboard: 'true'
        opendatahub.io/odh-managed: 'true'
        opendatahub.io/user: ${USER}
    spec:
      # NOTE(review): affinity is a direct child of the Notebook spec here
      # (it precedes "template"), not of template.spec - confirm the Notebook
      # CRD honours it at this level.
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - preference:
                matchExpressions:
                  - key: nvidia.com/gpu.present
                    # set the value to 'true' to use nodes with GPUs
                    operator: In
                    values:
                      - 'false'
              weight: 1
      template:
        spec:
          containers:
            # Notebook server container.
            - resources:
                limits:
                  cpu: '2'
                  memory: 8Gi
                requests:
                  cpu: '1'
                  memory: 8Gi
              readinessProbe:
                failureThreshold: 3
                httpGet:
                  path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
                  port: notebook-port
                  scheme: HTTP
                initialDelaySeconds: 10
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              name: ${NOTEBOOK_NAME}
              livenessProbe:
                failureThreshold: 3
                httpGet:
                  path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
                  port: notebook-port
                  scheme: HTTP
                initialDelaySeconds: 10
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              env:
                - name: NOTEBOOK_ARGS
                  value: |-
                    --ServerApp.port=8888
                    --ServerApp.token=${TOKEN}
                    --ServerApp.password=''
                    --ServerApp.base_url=/notebook/${NAMESPACE}/${NOTEBOOK_NAME}
                    --ServerApp.quit_button=False
                    --ServerApp.tornado_settings={"user":"${USER}","hub_host":"${HUB_HOST}","hub_prefix":"projects/${NAMESPACE}"}
                - name: JUPYTER_IMAGE
                  value: >-
                    ${IMAGE_REPO}/${IMAGE_NAME}
              ports:
                - containerPort: 8888
                  name: notebook-port
                  protocol: TCP
              imagePullPolicy: Always
              volumeMounts:
                - mountPath: /opt/app-root/src
                  name: ${NOTEBOOK_NAME}
                - mountPath: /dev/shm
                  name: shm
              image: >-
                ${IMAGE_REPO}/${IMAGE_NAME}
              workingDir: /opt/app-root/src
            # oauth-proxy sidecar: serves HTTPS on 8443 and forwards to the
            # notebook at http://localhost:8888.
            - resources:
                limits:
                  cpu: 100m
                  memory: 64Mi
                requests:
                  cpu: 100m
                  memory: 64Mi
              readinessProbe:
                failureThreshold: 3
                httpGet:
                  path: /oauth/healthz
                  port: oauth-proxy
                  scheme: HTTPS
                initialDelaySeconds: 5
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              name: oauth-proxy
              livenessProbe:
                failureThreshold: 3
                httpGet:
                  path: /oauth/healthz
                  port: oauth-proxy
                  scheme: HTTPS
                initialDelaySeconds: 30
                periodSeconds: 5
                successThreshold: 1
                timeoutSeconds: 1
              env:
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.namespace
              ports:
                - containerPort: 8443
                  name: oauth-proxy
                  protocol: TCP
              imagePullPolicy: Always
              volumeMounts:
                - mountPath: /etc/oauth/config
                  name: oauth-config
                - mountPath: /etc/tls/private
                  name: tls-certificates
              image: >-
                registry.redhat.io/openshift4/ose-oauth-proxy@sha256:4bef31eb993feb6f1096b51b4876c65a6fb1f4401fee97fa4f4542b6b7c9bc46
              args:
                - '--provider=openshift'
                - '--https-address=:8443'
                - '--http-address='
                - '--openshift-service-account=${NOTEBOOK_NAME}'
                - '--cookie-secret-file=/etc/oauth/config/cookie_secret'
                - '--cookie-expire=24h0m0s'
                - '--tls-cert=/etc/tls/private/tls.crt'
                - '--tls-key=/etc/tls/private/tls.key'
                - '--upstream=http://localhost:8888'
                - '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
                - '--email-domain=*'
                - '--skip-provider-button'
                # $(NAMESPACE) is not a template parameter: it uses round
                # brackets and refers to the NAMESPACE env var defined above.
                - >-
                  --openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"${NOTEBOOK_NAME}","namespace":"$(NAMESPACE)"}
                # Same URL as the oauth-logout-url annotation; the "/" between
                # the base URL and the namespace is required.
                - >-
                  --logout-url=${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
          enableServiceLinks: false
          serviceAccountName: ${NOTEBOOK_NAME}
          volumes:
            - name: ${NOTEBOOK_NAME}
              persistentVolumeClaim:
                claimName: ${NOTEBOOK_NAME}
            - emptyDir:
                medium: Memory
              name: shm
            - name: oauth-config
              secret:
                defaultMode: 420
                secretName: ${NOTEBOOK_NAME}-oauth-config
            - name: tls-certificates
              secret:
                defaultMode: 420
                secretName: ${NOTEBOOK_NAME}-tls
  # Storage claimed by the notebook container's /opt/app-root/src mount.
  - apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: ${NOTEBOOK_NAME}
      labels:
        app: ${NOTEBOOK_NAME}
        notebook-name: ${NOTEBOOK_NAME}
        ope-run: ${RUN_NAME}
        opendatahub.io/dashboard: 'true'
    spec:
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: "${PVC_SIZE}"

0 commit comments

Comments
 (0)