-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubmit-train-0.sh
More file actions
executable file
·32 lines (30 loc) · 865 Bytes
/
submit-train-0.sh
File metadata and controls
executable file
·32 lines (30 loc) · 865 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --time=1000:00:00
#SBATCH --nodes=1
#SBATCH --mem=12g
#SBATCH --job-name="multlin"
##SBATCH --mail-user=gneubig@cs.cmu.edu
##SBATCH --mail-type=ALL
##SBATCH --requeue
#Specifies that the job will be requeued after a node failure.
#The default is that the job will not be requeued.
set -e
#export PYTHONPATH="$(pwd)"
#export CUDA_VISIBLE_DEVICES="0"

# Runs every not-yet-started job script found in job_scripts/<version>/.
# A job counts as started once checkpoints/<version>/<name>.started exists,
# where <name> is the script's file name without a trailing ".sh".
#
# Usage: submit-train-0.sh [ARG]
#   ARG  optional; forwarded as the single argument of each job script.
#
# Because of `set -e`, a failing job script (or a failing hostname/nvidia-smi
# call) aborts the whole run; the marker of that job is left in place.
version=single

#######################################
# Atomically claim a job by creating its marker file.
# Arguments: $1 - path of the marker file
# Returns:   0 if this call created the marker, non-zero if the marker
#            already exists or could not be created.
#######################################
claim_job() {
  # With noclobber set, the redirection refuses to overwrite an existing
  # file, so checking for the marker and creating it is one step rather than
  # a separate test followed by touch. The subshell keeps the option from
  # leaking. stderr is dropped because "file exists" is the expected way to
  # lose the claim; the caller reports any other failure.
  ( set -o noclobber; : > "$1" ) 2>/dev/null
}

#######################################
# Iterate over the job scripts and run the ones nobody has claimed yet.
# Globals:   version (read)
# Arguments: $1 - optional argument forwarded to every job script
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   non-zero if a marker cannot be created
#######################################
main() {
  local job_dir="job_scripts/${version}"
  local ckpt_dir="checkpoints/${version}"
  local path file name marker

  mkdir -p "$ckpt_dir"

  # Glob instead of parsing `ls` so names with whitespace stay intact.
  # (An older variant of this loop skipped entries ending in ".sh".)
  for path in "$job_dir"/*; do
    # An unmatched glob stays literal; skip it when the directory is empty
    # or missing.
    [[ -e "$path" ]] || continue

    file=${path##*/}
    name=${file%.sh}
    marker="${ckpt_dir}/${name}.started"

    printf '%s\n' "$name"
    if claim_job "$marker"; then
      printf '%s\n' "running $name"
      hostname
      nvidia-smi
      # Forward the argument only when one was given, as a single word.
      "./$path" ${1:+"$1"}
    elif [[ -e "$marker" ]]; then
      printf '%s\n' "already started $name"
    else
      # The marker is absent yet could not be created (permissions, missing
      # directory, ...). Fail loudly instead of reporting "already started".
      printf '%s\n' "cannot create marker $marker" >&2
      return 1
    fi
  done
}

main "$@"