-
Notifications
You must be signed in to change notification settings - Fork 22
126 lines (111 loc) · 4.71 KB
/
gpu_test.yml
File metadata and controls
126 lines (111 loc) · 4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Copyright 2026 Tensor Auto Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Nightly GPU Tests

on:
  schedule:
    # GitHub evaluates cron expressions in UTC: 10:00 UTC is 2:00 AM PST
    # (3:00 AM Pacific while daylight saving time is in effect).
    - cron: '0 10 * * *'
  # Allow the suite to be triggered manually as well.
  workflow_dispatch:

# The jobs below scale one shared Auto Scaling group between 0 and 1. Serialise
# runs so that the stop-runner job of one run cannot scale the group to 0 while
# another run's GPU tests are still executing on that instance. A newer run
# waits for the in-flight one instead of cancelling it.
concurrency:
  group: nightly-gpu-tests
  cancel-in-progress: false

permissions:
  contents: read  # Required for actions/checkout

env:
  # Select the EGL backend for MuJoCo and PyOpenGL in every job
  # (presumably for headless rendering on the GPU runner - confirm).
  MUJOCO_GL: "egl"
  PYOPENGL_PLATFORM: "egl"
jobs:
  # Scale the runner Auto Scaling group up to one instance so that a runner
  # carrying the g6.2xlarge label becomes available for the gpu-test job.
  start-runner:
    name: Start GPU Runner
    runs-on: ubuntu-latest
    permissions:
      id-token: write  # Required for requesting the JWT
      contents: read
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-west-2
      - name: Start Instance
        run: |
          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-2xlarge --desired-capacity 1
          # This step only requests capacity and returns immediately; it does not
          # poll for instance readiness, so the log line says so explicitly.
          echo "Requested 1 GPU instance; gpu-test stays queued until the runner comes online."

  # Run the pytest suite marked "gpu" inside a CUDA container on the GPU runner.
  gpu-test:
    name: Run Pytest on GPU
    needs: start-runner
    runs-on: [g6.2xlarge]
    # 45 (not 30): the RoboCasa GPU test cold-downloads ~4.4G of kitchen asset packs onto
    # the ephemeral runner before building a real scene; the rest of the suite is ~15 min.
    timeout-minutes: 45
    container:
      image: nvidia/cuda:12.2.0-devel-ubuntu22.04
      options: --gpus all
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          persist-credentials: false
      - name: Install system dependencies
        env:
          # The container has no TTY: stop debconf from opening an interactive
          # prompt (e.g. tzdata) that would stall the job until the timeout.
          DEBIAN_FRONTEND: noninteractive
        run: |
          apt-get update && apt-get install -y \
            python3 python3-pip git ffmpeg \
            libegl1 libegl-mesa0 libegl-dev libgl1 libglx-mesa0 libgles2 mesa-utils \
            curl cmake build-essential
      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"
      - name: Install dependencies
        run: |
          # --extra robocasa installs the RoboCasa sim (shuheng-liu/robocasa fork) so the
          # @pytest.mark.gpu RoboCasa env test in tests/envs/test_robocasa.py can run instead
          # of erroring with "robocasa is not installed"; it co-resolves with libero on the
          # shared robosuite-master stack, so the resolved env for the other tests is unchanged.
          uv sync --extra dev --extra libero --extra robocasa
      - name: Check GPU
        run: nvidia-smi
      - name: Run Tests
        # bash is requested explicitly because the script uses `source`.
        shell: bash
        # Authenticate via env vars instead of `hf auth login`: the CLI login does
        # an eager whoami() call that can be rate-limited (429) and fail the job at
        # setup. huggingface_hub reads HF_TOKEN / HUGGINGFACE_HUB_TOKEN with no
        # network call, and the GPU tests need it (gated model weights, datasets).
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source .venv/bin/activate
          # Point LIBERO at the assets bundled inside the installed package (the
          # fork's MANIFEST grafts bddl_files/init_files/assets) so the real-sim
          # GPU test can build a genuine OffScreenRenderEnv. The .github/assets/libero
          # config used by the mocked CPU tests points every path at an empty /tmp
          # tree, which is fine for mocks but not for a real env build (it reads the
          # bddl file via get_libero_path("bddl_files")).
          export LIBERO_CONFIG_PATH=/tmp/libero-cfg
          mkdir -p "$LIBERO_CONFIG_PATH"
          touch "$LIBERO_CONFIG_PATH/config.yaml" # pre-create so the LIBERO import skips its first-run input() prompt
          python -c "from libero.libero import set_libero_default_path; set_libero_default_path()"
          pytest -m "gpu" -n 0 -v tests/

  # Scale the runner Auto Scaling group back to zero once the tests are done.
  stop-runner:
    name: Stop GPU Runner
    needs: [start-runner, gpu-test]
    if: always()  # Run even if tests fail
    runs-on: ubuntu-latest
    permissions:
      id-token: write  # Required for requesting the JWT
      contents: read
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-west-2
      - name: Stop Instance
        run: |
          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-2xlarge --desired-capacity 0