OpenTau/.github/workflows/gpu_test.yml at main · TensorAuto/OpenTau · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Copyright 2026 Tensor Auto Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Nightly GPU Tests
on:
  schedule:
    # Run daily at 10:00 UTC. GitHub cron schedules are always evaluated in UTC,
    # so this is 2:00 AM PST in winter and 3:00 AM PDT during daylight saving time.
    - cron: '0 10 * * *'
  workflow_dispatch:

# Every run drives the same auto-scaling group, and the stop-runner job scales it
# back to 0. Serialize runs so a manual dispatch that overlaps the nightly run
# cannot terminate the GPU instance while the other run's tests are still on it.
concurrency:
  group: nightly-gpu-tests
  cancel-in-progress: false

# Workflow-wide default GITHUB_TOKEN scope: read-only access to repository contents.
# The start-runner and stop-runner jobs declare their own `permissions:` block,
# which takes the place of this default for those jobs.
permissions:
  contents: read  # Required for actions/checkout

# Environment variables exported to every step of every job in this workflow.
# Both are set to "egl"; presumably this selects headless EGL rendering for MuJoCo
# and PyOpenGL on the GPU runner (the gpu-test job installs the EGL libraries).
env:
  MUJOCO_GL: "egl"
  PYOPENGL_PLATFORM: "egl"

jobs:
  # Scales the GPU runner auto-scaling group up to one instance.
  # Runs on a GitHub-hosted runner and authenticates to AWS by assuming the role
  # in secrets.AWS_ROLE_ARN, which is why this job needs `id-token: write`.
  start-runner:
    name: Start GPU Runner
    runs-on: ubuntu-latest
    permissions:
      id-token: write # Required for requesting the JWT
      contents: read
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-west-2

      - name: Start Instance
        run: |
          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-2xlarge --desired-capacity 1
          # set-desired-capacity only submits the scaling request; this step does
          # not wait for the instance to boot. The gpu-test job stays queued until
          # a runner labelled g6.2xlarge comes online and picks it up.
          echo "Requested 1 instance; gpu-test will stay queued until the g6.2xlarge runner is online."

  # Runs the pytest suite's `gpu`-marked tests on the self-hosted g6.2xlarge runner,
  # inside a CUDA container with all GPUs passed through. Starts only after
  # start-runner has requested the instance.
  gpu-test:
    name: Run Pytest on GPU
    needs: start-runner
    runs-on: [g6.2xlarge]
    # 45 (not 30): the RoboCasa GPU test cold-downloads ~4.4G of kitchen asset packs onto
    # the ephemeral runner before building a real scene; the rest of the suite is ~15 min.
    timeout-minutes: 45

    container:
      image: nvidia/cuda:12.2.0-devel-ubuntu22.04
      options: --gpus all

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          persist-credentials: false

      - name: Install system dependencies
        env:
          # The container is a bare Ubuntu image with nobody to answer debconf
          # prompts (tzdata is the usual culprit); force apt to take the defaults
          # instead of risking a stalled job.
          DEBIAN_FRONTEND: noninteractive
        run: |
          apt-get update && apt-get install -y \
            python3 python3-pip git curl cmake build-essential ffmpeg \
            libegl1 libegl-mesa0 libegl-dev libgl1 libglx-mesa0 libgles2 mesa-utils

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"

      - name: Install dependencies
        run: |
          # --extra robocasa installs the RoboCasa sim (shuheng-liu/robocasa fork) so the
          # @pytest.mark.gpu RoboCasa env test in tests/envs/test_robocasa.py can run instead
          # of erroring with "robocasa is not installed"; it co-resolves with libero on the
          # shared robosuite-master stack, so the resolved env for the other tests is unchanged.
          uv sync --extra dev --extra libero --extra robocasa

      # Fail fast with a clear error if the container cannot see the GPU.
      - name: Check GPU
        run: nvidia-smi

      - name: Run Tests
        # bash is required for `source`.
        shell: bash
        # Authenticate via env vars instead of `hf auth login`: the CLI login does
        # an eager whoami() call that can be rate-limited (429) and fail the job at
        # setup. huggingface_hub reads HF_TOKEN / HUGGINGFACE_HUB_TOKEN with no
        # network call, and the GPU tests need it (gated model weights, datasets).
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source .venv/bin/activate
          # Point LIBERO at the assets bundled inside the installed package (the
          # fork's MANIFEST grafts bddl_files/init_files/assets) so the real-sim
          # GPU test can build a genuine OffScreenRenderEnv. The .github/assets/libero
          # config used by the mocked CPU tests points every path at an empty /tmp
          # tree, which is fine for mocks but not for a real env build (it reads the
          # bddl file via get_libero_path("bddl_files")).
          export LIBERO_CONFIG_PATH=/tmp/libero-cfg
          mkdir -p "$LIBERO_CONFIG_PATH"
          touch "$LIBERO_CONFIG_PATH/config.yaml"  # pre-create so the LIBERO import skips its first-run input() prompt
          python -c "from libero.libero import set_libero_default_path; set_libero_default_path()"
          # Only tests marked `gpu`; -n 0 is passed to pytest (presumably disabling
          # pytest-xdist parallelism so tests run serially on the single GPU).
          pytest -m "gpu" -n 0 -v tests/

  # Scales the GPU runner auto-scaling group back down to zero instances.
  # Mirrors start-runner: same AWS role, region and auto-scaling group, with the
  # desired capacity set to 0 instead of 1.
  stop-runner:
    name: Stop GPU Runner
    # Depend on both jobs so this runs last, after the tests have finished.
    needs: [start-runner, gpu-test]
    # always() keeps this job from being skipped when an upstream job fails or the
    # run is cancelled, so the GPU instance is not left running.
    if: always() # Run even if tests fail
    runs-on: ubuntu-latest
    permissions:
      id-token: write # Required for requesting the JWT
      contents: read
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-west-2

      - name: Stop Instance
        run: |
          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-2xlarge --desired-capacity 0