# lm_k8s_local.yaml
# Kubernetes Job that runs a single local (non-distributed) GRU language-model
# training container from the PaddlePaddle Fluid benchmark image.
# (Scraped web-page chrome and the rendered line-number gutter that preceded
# this manifest have been removed — they made the file unparseable as YAML.)
apiVersion: batch/v1
kind: Job
metadata:
  name: lm-local-trainer
spec:
  # Single-pod, run-once job: one trainer, no pservers actually started
  # (LOCAL=TRUE below — the pserver-related env vars are inert defaults).
  parallelism: 1
  completions: 1
  template:
    metadata:
      labels:
        paddle-job: lm-local
    spec:
      imagePullSecrets:
        - name: job-registry-secret
      # hostNetwork so the container can use the node's NICs directly
      # (see PADDLE_INIT_NICS / the spread-50011 port below).
      hostNetwork: true
      volumes:
        # NVIDIA driver libraries and nvidia-smi are mounted from the host;
        # the image does not ship them.
        - name: nvidia-driver
          hostPath:
            path: /usr/local/nvidia/lib64
        - name: nvidia-smi
          hostPath:
            path: /usr/bin/nvidia-smi
      containers:
        - name: trainer
          # image: "registry.baidu.com/paddlepaddle/fluid_benchmark:gpu"
          image: "bootstrapper:5000/fluid_benchmark:gpu"
          imagePullPolicy: Always
          # FIX: the original manifest declared `command:` twice — a duplicate
          # key, invalid per the YAML spec. Parsers silently applied last-wins,
          # so only the bash entrypoint below ever ran. The superseded
          # entrypoint is preserved here as a comment for reference:
          # command: ["paddle_k8s", "start_fluid"]
          command: ["bash", "-c", "unset http_proxy && unset https_proxy && cd /workspace/lm && python gru_lm.py"]
          ports:
            - name: spread-50011
              containerPort: 50011
          volumeMounts:
            - mountPath: /usr/local/nvidia/lib64
              name: nvidia-driver
            - mountPath: /opt/bin/nvidia-smi
              name: nvidia-smi
          env:
            # gRPC debug knobs — disabled; uncomment when tracing RPC issues.
            # - name: GRPC_TRACE
            #   value: all,-tcp
            # - name: GRPC_VERBOSITY
            #   value: debug
            - name: PADDLE_JOB_NAME
              value: lm-local
            - name: TRAINING_ROLE
              value: "TRAINER"
            - name: TRAINERS
              value: "1"
            - name: PSERVERS
              value: "1"
            - name: TOPOLOGY
              value: ""
            # Glog verbosity — disabled; uncomment for verbose Paddle logs.
            # - name: GLOG_v
            #   value: "2"
            - name: FLAGS_fraction_of_gpu_memory_to_use
              value: "0.4"
            - name: FLAGS_benchmark
              value: "1"
            - name: GLOG_logtostderr
              value: "1"
            - name: MKL_NUM_THREADS
              value: "1"
            # LOCAL=TRUE: train on this single pod; no parameter servers used.
            - name: LOCAL
              value: "TRUE"
            - name: USE_GPU
              value: "TRUE"
            - name: BATCH_SIZE
              value: "20"
            - name: IS_SPARSE
              value: "FALSE"
            - name: READER
              value: "CLUSTER"
            # ENTRY mirrors the container command; presumably consumed by the
            # paddle_k8s launcher path — NOTE(review): confirm it is still
            # read anywhere now that bash runs the script directly.
            - name: ENTRY
              value: "sleep 10 && unset http_proxy && unset https_proxy && cd /workspace/lm && python gru_lm.py"
            - name: TRAINER_PACKAGE
              value: "/workspace"
            - name: PADDLE_INIT_PORT
              value: "30257"
            - name: PADDLE_INIT_NICS
              value: "eth2"
            - name: PADDLE_INIT_TRAINER_COUNT
              value: "1"
            - name: PADDLE_INIT_PORTS_NUM
              value: "1"
            - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
              value: "1"
            - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
              value: "4"
            # Prepend the host-mounted NVIDIA driver libs to the search path.
            - name: LD_LIBRARY_PATH
              value: "/usr/local/nvidia/lib64:/usr/local/lib:/usr/local/cuda/lib64"
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: "metadata.namespace"
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: "status.podIP"
          resources:
            requests:
              memory: 30Gi
              cpu: 1
              # Legacy alpha GPU resource name (pre-1.8 clusters); modern
              # clusters use nvidia.com/gpu via the device plugin.
              alpha.kubernetes.io/nvidia-gpu: 1
            limits:
              memory: 30Gi
              cpu: 1
              alpha.kubernetes.io/nvidia-gpu: 1
      # Never restart: a failed training run should surface as a failed Job.
      restartPolicy: Never