# Weekly Batch Evaluation #35
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Weekly Batch Evaluation

# Triggers: a weekly schedule, plus a manual run that accepts optional inputs.
on:
  schedule:
    # Run every Sunday at 00:00 UTC
    - cron: '0 0 * * 0'
  workflow_dispatch:
    inputs:
      problems:
        # NOTE(review): no step visible in this workflow reads this input, so
        # a manual run appears to evaluate everything regardless — confirm.
        description: 'Comma-separated problem IDs (leave empty for all)'
        required: false
        default: ''
      max_concurrent:
        # NOTE(review): this default only applies to manual runs. Scheduled
        # runs carry no inputs, and the evaluation step falls back to '4'
        # when the input is absent — confirm the 20 vs 4 split is intended.
        description: 'Max concurrent evaluations (number of SkyPilot clusters)'
        required: false
        default: '20'
# Workflow-level environment, visible to every step of every job.
env:
  SKYPILOT_CLOUD: gcp
  # Disable Ray object spilling to prevent disk exhaustion on GitHub Actions runner
  # SkyPilot uses Ray autoscaler internally, which can spill objects to disk
  # when handling many concurrent cluster operations
  # Quoted on purpose: environment values must be strings, not a YAML boolean.
  RAY_automatic_object_spilling_enabled: "false"
jobs:
  # Single job: overlays the internal (hidden) problems and solutions onto the
  # public checkout, sets up SkyPilot on GCP, runs the batch evaluation,
  # publishes the results, and finally tears down any eval-* clusters.
  evaluate:
    runs-on: ubuntu-latest
    timeout-minutes: 360  # 6 hours max
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout internal repository (hidden problems)
        uses: actions/checkout@v4
        with:
          repository: FrontierCS/Frontier-CS-internal
          token: ${{ secrets.INTERNAL_REPO_TOKEN }}
          path: internal

      - name: Merge internal problems and solutions
        run: |
          # Copy internal problems to research/problems (overwrite if exists)
          if [ -d "internal/research/problems" ]; then
            cp -r internal/research/problems/* research/problems/
            echo "Merged $(ls -d internal/research/problems/*/ 2>/dev/null | wc -l) problem directories from internal repo"
          fi
          # Copy internal solutions to research/solutions (overwrite if exists)
          if [ -d "internal/research/solutions" ]; then
            cp -r internal/research/solutions/* research/solutions/
            echo "Merged $(ls -d internal/research/solutions/*/ 2>/dev/null | wc -l) solution directories from internal repo"
          fi

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install 3.11

      - name: Install dependencies
        run: |
          uv sync
          # Quoted so the shell never treats [gcp] as a glob character class.
          uv pip install 'skypilot[gcp]'

      - name: Set up GCP credentials
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
          # Secrets are handed over through the environment instead of being
          # pasted into the script text: a single quote inside the JSON would
          # otherwise terminate the shell string and corrupt the command.
          GCP_CREDENTIALS: ${{ secrets.GCP_CREDENTIALS }}
          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          # Create the key file (and its copies) readable by this user only.
          umask 077
          printf '%s\n' "$GCP_CREDENTIALS" > /tmp/gcp-key.json
          # Set as application default credentials for SkyPilot
          mkdir -p ~/.config/gcloud
          cp /tmp/gcp-key.json ~/.config/gcloud/application_default_credentials.json
          # For service account, also activate with gcloud.
          # The pattern tolerates any whitespace around the colon so compact
          # JSON is recognised too.
          if grep -Eq '"type"[[:space:]]*:[[:space:]]*"service_account"' /tmp/gcp-key.json; then
            gcloud auth activate-service-account --key-file=/tmp/gcp-key.json
          fi
          gcloud config set project "$GCP_PROJECT_ID"

      - name: Configure SkyPilot
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
        run: |
          mkdir -p ~/.sky
          # SkyPilot uses GCP project from gcloud config or ADC
          # Best effort: a failing check is reported in the log but does not
          # stop the job.
          uv run sky check gcp || true

      - name: Fetch previous state for incremental evaluation
        env:
          RESULTS_REPO_TOKEN: ${{ secrets.RESULTS_REPO_TOKEN }}
        run: |
          # Clone results repo to get previous state
          git clone --depth 1 "https://x-access-token:${RESULTS_REPO_TOKEN}@github.com/FrontierCS/Frontier-CS-Result.git" /tmp/prev-results
          # Copy previous state to results dir (for incremental evaluation)
          mkdir -p results/batch
          if [ -f /tmp/prev-results/batch/.state.json ]; then
            cp /tmp/prev-results/batch/.state.json results/batch/
            echo "Loaded previous state for incremental evaluation"
          fi

      - name: Run batch evaluation
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
          # User-controlled input goes through the environment, never into the
          # script text, so it cannot inject shell commands. Scheduled runs
          # have no inputs and fall back to 4.
          INPUT_MAX_CONCURRENT: ${{ github.event.inputs.max_concurrent || '4' }}
        run: |
          MAX_CONCURRENT="$INPUT_MAX_CONCURRENT"
          # Fail early with a clear message on anything that is not an integer.
          case "$MAX_CONCURRENT" in
            '' | *[!0-9]*)
              echo "::error::max_concurrent must be an integer, got '$MAX_CONCURRENT'"
              exit 1
              ;;
          esac
          # Auto-discover solutions from research/solutions directory
          uv run frontier-eval batch \
            --solutions-dir research/solutions \
            --skypilot \
            --max-concurrent "$MAX_CONCURRENT" \
            --results-dir results/batch

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results-${{ github.run_id }}
          path: results/
          retention-days: 90

      - name: Push results to results repository
        if: always()
        env:
          RESULTS_REPO_TOKEN: ${{ secrets.RESULTS_REPO_TOKEN }}
        run: |
          # This step also runs after earlier failures. When nothing was
          # produced, skip instead of failing on the unmatched results/* glob.
          if ! compgen -G 'results/*' > /dev/null; then
            echo "No results to publish"
            exit 0
          fi
          # Clone the results repository (history is not needed to add a commit)
          git clone --depth 1 "https://x-access-token:${RESULTS_REPO_TOKEN}@github.com/FrontierCS/Frontier-CS-Result.git" /tmp/results-repo
          # Copy results
          cp -r results/* /tmp/results-repo/
          cd /tmp/results-repo
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add .
          # Commit and push only when the copy actually changed something.
          if git diff --staged --quiet; then
            echo "No result changes to push"
          else
            git commit -m "chore: update evaluation results $(date +%Y-%m-%d)"
            git push
          fi

      - name: Cleanup SkyPilot clusters
        if: always()
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
        run: |
          echo "Cleaning up SkyPilot clusters..."
          # Get all eval-* clusters and delete them in parallel
          CLUSTERS=$(uv run sky status --refresh 2>/dev/null | grep -E '^eval-' | awk '{print $1}' || true)
          if [ -n "$CLUSTERS" ]; then
            # The loop is fed by a here-string rather than a pipe. A piped
            # `while` runs in a subshell, so the background jobs would not be
            # children of this shell and `wait` would return immediately,
            # letting the step end while `sky down` was still in flight.
            while read -r cluster; do
              echo "Terminating cluster: $cluster"
              uv run sky down "$cluster" -y &
            done <<< "$CLUSTERS"
            wait
          fi
          echo "Cleanup complete"