-- greeneJob.lua — Slurm job_submit helper for the NYU Greene cluster.
-- (Header reconstructed: the original lines were web-page chrome from a
-- GitHub file view, not part of the source.)
-- (line-number gutter from the web scrape removed)
#!/bin/env lua
local greeneJob = { }
local greeneUtils = require "greeneUtils"
local greeneCommon = require "greeneCommon"
local greeneCPU = require "greeneCPU"
local greeneGPU = require "greeneGPU"
local greeneQoS = require "greeneQoS"
local greeneReservation = require "greeneReservation"
local slurm_log = greeneUtils.slurm_log
local user_log = greeneUtils.user_log
-- constants
local uint16_NO_VAL = greeneUtils.uint16_NO_VAL
local uint32_NO_VAL = greeneUtils.uint32_NO_VAL
local uint64_NO_VAL = greeneUtils.uint64_NO_VAL
local bigIntNumber = greeneUtils.bigIntNumber
local job_desc = nil
local n_cpus_per_node = nil
-- A memory field counts as "specified" only when it holds a real value:
-- Slurm leaves unset memory fields at a sentinel larger than bigIntNumber.
local function memory_is_specified(mem)
  return mem ~= nil and mem <= bigIntNumber
end
-- Entry point for shaping an incoming job: stores the Slurm job descriptor
-- in the module-local job_desc, fills defaults, then delegates to the
-- GPU/CPU-specific modules and the QoS module.  args.job_desc is the
-- descriptor handed to the job_submit plugin by Slurm.
local function setup_parameters(args)
job_desc = args.job_desc
-- must run first: computes n_cpus_per_node and fills sentinel fields
-- (called through the global defined later in this file)
setup_default_compute_resources()
greeneCommon.setup_parameters{job_desc = job_desc}
-- GPU job
if greeneCommon.is_gpu_job() then
-- pn_min_memory is in MB here; modules expect GB, hence /1024.0
greeneGPU.setup_parameters{ gpus = greeneCommon.gpus,
cpus = n_cpus_per_node,
memory = job_desc.pn_min_memory/1024.0,
time_limit = job_desc.time_limit,
gpu_type = greeneCommon.gpu_type }
-- user gave no partition: let the GPU module pick suitable ones
if job_desc.partition == nil then
local partitions = greeneGPU.valid_partitions()
if partitions ~= nil then job_desc.partition = partitions end
end
else -- CPU only jobs
-- For CPU only jobs, we will set threads_per_core = 1
if job_desc.threads_per_core == uint16_NO_VAL then job_desc.threads_per_core = 1 end
greeneCPU.setup_parameters{ cpus = n_cpus_per_node,
memory = job_desc.pn_min_memory/1024.0,
nodes = job_desc.min_nodes,
time_limit = job_desc.time_limit }
-- user gave no partition: let the CPU module pick suitable ones
if job_desc.partition == nil then
local partitions = greeneCPU.valid_partitions()
if partitions ~= nil then job_desc.partition = partitions end
end
end
-- QoS selection depends only on the (now defaulted) time limit
greeneQoS.setup_parameters{ time_limit = job_desc.time_limit }
if job_desc.qos == nil then
local qos = greeneQoS.valid_qos()
if qos ~= nil then job_desc.qos = qos end
end
end
-- Returns true when the user requested --ntasks without --ntasks-per-node,
-- a combination this site disallows (jobs should be shaped with --nodes
-- plus --tasks-per-node).  Logs a user-facing explanation on rejection.
-- NOTE(review): currently disabled — its only call site below is commented out.
local function ntasks_is_specified()
  if job_desc.num_tasks ~= uint32_NO_VAL and job_desc.ntasks_per_node == uint16_NO_VAL then
    -- fixed: typo ("Plase") and wrong cluster name ("prince") in the message
    user_log("Please do not specify --ntasks on greene cluster, try to use --nodes and --tasks-per-node together")
    return true
  end
  return false
end
-- Validate the fully-populated job description.  Each delegate performs its
-- own user-facing logging; the first failing check short-circuits.
-- Returns true when the job may proceed.
local function setup_is_valid()
  if not greeneReservation.check_reservation_is_OK(job_desc) then
    return false
  end
  -- if ntasks_is_specified() then return false end
  local resources_ok
  if greeneCommon.is_gpu_job() then
    resources_ok = greeneGPU.setup_is_valid()
  else
    resources_ok = greeneCPU.setup_is_valid()
  end
  if not resources_ok then
    return false
  end
  if not greeneQoS.qos_is_valid() then
    return false
  end
  -- check reservations
  -- check accounts
  return true
end
-- Log a summary of the incoming job description: identity fields, scheduling
-- fields, and the submitted script (or a note for interactive jobs).
local function print_job_desc()
  local jd = job_desc
  slurm_log("*** ==== SLURM job desc ==== ***")
  if jd.threads_per_core ~= nil then slurm_log("threads_per_core: %d", jd.threads_per_core) end
  if jd.user_name ~= nil then slurm_log("user: %s", jd.user_name) end
  if jd.account ~= nil then slurm_log("account: %s", jd.account) end
  if jd.partition ~= nil then slurm_log("partitions: %s", jd.partition) end
  if jd.qos ~= nil then slurm_log("qos: %s", jd.qos) end
  if jd.features ~= nil then slurm_log("features: %s", jd.features) end
  if jd.default_account ~= nil then slurm_log("default_account: %s", jd.default_account) end
  if jd.work_dir ~= nil then slurm_log("work dir: %s", jd.work_dir) end
  if jd.comment ~= nil then slurm_log("comment: %s", jd.comment) end
  if jd.argc > 0 then
    -- argv is 0-indexed (C convention); collect into a 1-based array
    local parts = {}
    for i = 0, jd.argc - 1 do
      parts[#parts + 1] = jd.argv[i]
    end
    slurm_log("sbatch script with arguments: %s", table.concat(parts, " "))
  end
  if jd.script ~= nil then
    slurm_log("script:\n%s", jd.script)
  else
    slurm_log("no script, interactive job")
  end
end
-- Like print_job_desc, but additionally dumps every numeric scheduling
-- field (task/cpu/node/memory counts) plus mail and GRES settings.
local function print_job_desc_verbose()
  print_job_desc()
  local jd = job_desc
  slurm_log("time_limit = %d", jd.time_limit)
  slurm_log("ntasks_per_node: %d", jd.ntasks_per_node)
  slurm_log("ntasks_per_socket: %d", jd.ntasks_per_socket)
  slurm_log("num_tasks = %d", jd.num_tasks)
  slurm_log("pn_min_cpus: %d", jd.pn_min_cpus)
  slurm_log("pn_min_memory: %d", jd.pn_min_memory)
  slurm_log("cpus_per_task: %d", jd.cpus_per_task)
  slurm_log("min_nodes: %d", jd.min_nodes)
  slurm_log("max_nodes: %d", jd.max_nodes)
  -- only meaningful when the user actually set a per-cpu memory request
  if memory_is_specified(jd.min_mem_per_cpu) then
    slurm_log("min_mem_per_cpu: %d", jd.min_mem_per_cpu)
  end
  slurm_log("requeue: %d", jd.requeue)
  if jd.mail_user ~= nil then slurm_log("mail_user: %s", jd.mail_user) end
  if jd.gres ~= nil then slurm_log("gres: %s", jd.gres) end
  if jd.gres_bind ~= nil then slurm_log("gres_bind: %s", jd.gres_bind) end
end
-- Fill in defaults for every compute-resource field the user left unset
-- (Slurm marks unset numeric fields with the uintXX_NO_VAL sentinels), and
-- compute the module-local n_cpus_per_node.
-- NOTE(review): intentionally a GLOBAL function — setup_parameters() above
-- calls it through the global environment before this definition runs; do
-- not make it local without adding a forward declaration near the top.
function setup_default_compute_resources()
  -- Mail requested but no address given: derive one from the NetID
  -- (letters-then-digits user names only, e.g. "ab123" -> "ab123@nyu.edu").
  if job_desc.mail_type ~= 0 and job_desc.mail_user == nil then
    local netid = job_desc.user_name
    -- guard added: user_name may be nil, and string.find(nil, ...) raises
    if netid ~= nil and string.find(netid, "^%a+%d+$") then
      job_desc.mail_user = netid .. "@nyu.edu"
    end
  end
  if job_desc.time_limit == uint32_NO_VAL then job_desc.time_limit = 60 end
  if job_desc.cpus_per_task == uint16_NO_VAL then job_desc.cpus_per_task = 1 end
  if job_desc.pn_min_cpus == uint16_NO_VAL then job_desc.pn_min_cpus = 1 end
  if job_desc.ntasks_per_node == uint16_NO_VAL then job_desc.ntasks_per_node = 1 end
  -- if job_desc.threads_per_core == uint16_NO_VAL then job_desc.threads_per_core = 1 end
  n_cpus_per_node = job_desc.ntasks_per_node * job_desc.cpus_per_task
  if job_desc.min_nodes == uint32_NO_VAL then job_desc.min_nodes = 1 end
  -- normalize requeue to a strict 0/1 flag
  if job_desc.requeue ~= 1 then job_desc.requeue = 0 end
  -- Memory default: prefer the per-cpu request if given, else 2048
  -- (presumably MB, matching the /1024.0 GB conversion used above -- confirm)
  if not memory_is_specified(job_desc.pn_min_memory) then
    if memory_is_specified(job_desc.min_mem_per_cpu) then
      job_desc.pn_min_memory = job_desc.min_mem_per_cpu
    else
      job_desc.pn_min_memory = 2048
    end
  end
end
-- data
-- functions
-- Public API of the greeneJob module (consumed by the job_submit plugin).
greeneJob.setup_parameters = setup_parameters
greeneJob.print_job_desc_verbose = print_job_desc_verbose
greeneJob.print_job_desc = print_job_desc
greeneJob.setup_is_valid = setup_is_valid
-- load-time marker in the slurmctld log; require() caches, so it logs once
slurm_log("To load greeneJob.lua")
return greeneJob