# File: environment-config-example.yaml
---
# Top-level environment settings.
auto_deploy: false
context: release-ry6clz
domain: ai-playground.releaseapp.io
repo_name: Draiken/AI-Examples
# Hostname pattern for the nemo-inference service. ${env_id} and ${domain}
# are placeholders — presumably substituted by the platform at deploy time
# (TODO confirm).
hostnames:
- nemo-inference: nemo-${env_id}.${domain}
# Environment-wide resource settings: a limit and a request for cpu and for
# memory, plus a replica count.
# NOTE(review): presumably these act as defaults for services that do not
# declare their own cpu/memory — confirm against the platform schema.
resources:
  cpu:
    limits: 24
    requests: 12
  memory:
    limits: 64Gi
    requests: 16Gi
  replicas: 1
# Volumes that can be attached by name. "tmp" is the volume the services
# reference with `claim: tmp` and mount at /tmp.
shared_volumes:
- name: tmp
  size: 20Gi
  type: persistent
# Jobs referenced from the workflows as `jobs.<name>`.
jobs:
- name: peft-tuning
  command:
  - "/bin/peft_tuning"
  # Names the nemo-training entry under `services`.
  # NOTE(review): presumably the job reuses that service's image/build —
  # confirm.
  from_services: nemo-training
  # NOTE(review): presumably seconds (5400 s = 90 min) — confirm.
  completed_timeout: 5400
  node_selector:
  - key: nvidia.com/gpu
    # Quoted so the value stays the string "true", not a YAML boolean.
    value: "true"
  volumes:
  - name: shmem
    type: shmem
    size: 16Gi
    mount_path: "/dev/shm"
  # Same bucket that is declared under `s3_volumes`.
  - type: s3
    bucket: release-ry6clz-static-builds
    mount_path: "/bucket"
# Services referenced from the workflows as `services.<name>`. Both entries
# select nodes labelled nvidia.com/gpu="true" and mount the same three
# volumes: shmem at /dev/shm, the shared "tmp" claim at /tmp, and the S3
# bucket at /bucket.
services:
# Built from this repository (build context "."); also named by the
# peft-tuning job's `from_services`.
- name: nemo-training
  image: draiken/ai-examples/nemo-training
  build:
    context: "."
  node_selector:
  - key: nvidia.com/gpu
    # Quoted so the value stays the string "true", not a YAML boolean.
    value: "true"
  volumes:
  - name: shmem
    type: shmem
    size: 16Gi
    mount_path: "/dev/shm"
  - claim: tmp
    mount_path: "/tmp"
  - type: s3
    bucket: release-ry6clz-static-builds
    mount_path: "/bucket"
  # Same values as the top-level `resources` cpu/memory settings.
  cpu:
    limits: 24
    requests: 12
  memory:
    limits: 64Gi
    requests: 16Gi
# Uses a prebuilt image (has_repo: false) and runs deploy_triton.py against
# a .nemo checkpoint read from the mounted bucket.
- name: nemo-inference
  image: nvcr.io/ea-bignlp/ga-participants/nemofw-inference:23.10
  has_repo: false
  command:
  - python
  - "/opt/NeMo/scripts/deploy/deploy_triton.py"
  - "--nemo_checkpoint"
  - "/bucket/ai-models-tmp/llama-2-7b-hf.nemo"
  - "--model_type"
  - llama
  - "--triton_model_name"
  - release-model
  node_selector:
  - key: nvidia.com/gpu
    value: "true"
  volumes:
  - name: shmem
    type: shmem
    size: 16Gi
    mount_path: "/dev/shm"
  - claim: tmp
    mount_path: "/tmp"
  - type: s3
    bucket: release-ry6clz-static-builds
    mount_path: "/bucket"
  # Requests 8 CPUs rather than the 12 used by nemo-training.
  cpu:
    limits: 24
    requests: 8
  memory:
    limits: 64Gi
    requests: 16Gi
# S3 bucket (and its region) that the `type: s3` volume entries refer to by
# bucket name.
s3_volumes:
- bucket: release-ry6clz-static-builds
  region: us-west-2
# Workflows: each has a name and a list of steps under `parallelize`; every
# step lists the tasks it runs (`jobs.<name>`, `services.<name>`, or
# `release.remove_environment`).
workflows:
- name: setup
  parallelize:
  # NOTE(review): presumably wait_for_finish/halt_on_error make the
  # inference step start only after peft-tuning completes successfully —
  # confirm against the platform docs.
  - step: fine-tuning
    tasks:
    - jobs.peft-tuning
    halt_on_error: true
    wait_for_finish: true
  - step: inference
    tasks:
    - services.nemo-inference
# Only the inference service is listed here; the peft-tuning job is not.
- name: patch
  parallelize:
  - step: inference
    tasks:
    - services.nemo-inference
- name: teardown
  parallelize:
  - step: remove-environment
    tasks:
    - release.remove_environment
# NOTE(review): presumably the Git branch of `repo_name` that this
# environment follows — confirm.
tracking_branch: peft