# Weekly Batch Evaluation #35
# Workflow file for this run

name: Weekly Batch Evaluation

# Triggers: a weekly schedule, plus manual dispatch with optional overrides.
on:
  schedule:
    # Run every Sunday at 00:00 UTC
    - cron: '0 0 * * 0'
  workflow_dispatch:
    inputs:
      problems:
        description: 'Comma-separated problem IDs (leave empty for all)'
        required: false
        default: ''
      max_concurrent:
        description: 'Max concurrent evaluations (number of SkyPilot clusters)'
        required: false
        # Applies to manual dispatches only: scheduled runs supply no inputs,
        # and the evaluation step falls back to its own value ('4') then.
        default: '20'
# Workflow-level environment variables.
env:
  SKYPILOT_CLOUD: gcp
  # Disable Ray object spilling to prevent disk exhaustion on GitHub Actions runner
  # SkyPilot uses Ray autoscaler internally, which can spill objects to disk
  # when handling many concurrent cluster operations
  # Quoted so the value stays the string "false" rather than a YAML boolean.
  RAY_automatic_object_spilling_enabled: "false"
jobs:
  # Single job: merge hidden problems into the checkout, set up GCP/SkyPilot
  # access, run the batch evaluation, publish the results, and finally tear
  # down any eval-* clusters. The publish and cleanup steps use `always()` so
  # they run even when an earlier step failed.
  evaluate:
    runs-on: ubuntu-latest
    timeout-minutes: 360  # 6 hours max
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout internal repository (hidden problems)
        uses: actions/checkout@v4
        with:
          repository: FrontierCS/Frontier-CS-internal
          token: ${{ secrets.INTERNAL_REPO_TOKEN }}
          path: internal

      - name: Merge internal problems and solutions
        run: |
          # Copy internal problems to research/problems (overwrite if exists)
          if [ -d "internal/research/problems" ]; then
            cp -r internal/research/problems/* research/problems/
            echo "Merged $(ls -d internal/research/problems/*/ 2>/dev/null | wc -l) problem directories from internal repo"
          fi
          # Copy internal solutions to research/solutions (overwrite if exists)
          if [ -d "internal/research/solutions" ]; then
            cp -r internal/research/solutions/* research/solutions/
            echo "Merged $(ls -d internal/research/solutions/*/ 2>/dev/null | wc -l) solution directories from internal repo"
          fi

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install 3.11

      - name: Install dependencies
        run: |
          uv sync
          # Quoted so the shell never treats [gcp] as a glob character class.
          uv pip install "skypilot[gcp]"

      - name: Set up GCP credentials
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
          # Secrets reach the script through the environment instead of being
          # interpolated into its text, so quotes or shell metacharacters in
          # their values cannot change the commands that run.
          GCP_CREDENTIALS: ${{ secrets.GCP_CREDENTIALS }}
          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          # Write the key file with owner-only permissions.
          (umask 077 && printf '%s\n' "$GCP_CREDENTIALS" > /tmp/gcp-key.json)
          # Set as application default credentials for SkyPilot
          mkdir -p ~/.config/gcloud
          cp /tmp/gcp-key.json ~/.config/gcloud/application_default_credentials.json
          # For service account, also activate with gcloud
          if grep -q '"type": "service_account"' /tmp/gcp-key.json; then
            gcloud auth activate-service-account --key-file=/tmp/gcp-key.json
          fi
          gcloud config set project "$GCP_PROJECT_ID"

      - name: Configure SkyPilot
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
        run: |
          mkdir -p ~/.sky
          # SkyPilot uses GCP project from gcloud config or ADC
          # `|| true` keeps a failed check from failing the job at this point.
          uv run sky check gcp || true

      - name: Fetch previous state for incremental evaluation
        env:
          RESULTS_REPO_TOKEN: ${{ secrets.RESULTS_REPO_TOKEN }}
        run: |
          # Clone results repo to get previous state
          git clone --depth 1 "https://x-access-token:${RESULTS_REPO_TOKEN}@github.com/FrontierCS/Frontier-CS-Result.git" /tmp/prev-results
          # Copy previous state to results dir (for incremental evaluation)
          mkdir -p results/batch
          if [ -f /tmp/prev-results/batch/.state.json ]; then
            cp /tmp/prev-results/batch/.state.json results/batch/
            echo "Loaded previous state for incremental evaluation"
          fi

      - name: Run batch evaluation
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
          # Passed via env (not inlined into the script) so a crafted dispatch
          # input cannot inject shell code. Falls back to 4 when no input is
          # supplied, e.g. on scheduled runs.
          MAX_CONCURRENT: ${{ github.event.inputs.max_concurrent || '4' }}
        run: |
          # Accept digits only before handing the value to the CLI.
          case "$MAX_CONCURRENT" in
            '' | *[!0-9]*)
              echo "::error::max_concurrent must be an integer, got '$MAX_CONCURRENT'"
              exit 1
              ;;
          esac
          # NOTE(review): the `problems` dispatch input is not forwarded to
          # frontier-eval here -- confirm the matching CLI option and wire it up.
          # Auto-discover solutions from research/solutions directory
          uv run frontier-eval batch \
            --solutions-dir research/solutions \
            --skypilot \
            --max-concurrent "$MAX_CONCURRENT" \
            --results-dir results/batch

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results-${{ github.run_id }}
          path: results/
          retention-days: 90

      - name: Push results to results repository
        if: always()
        env:
          RESULTS_REPO_TOKEN: ${{ secrets.RESULTS_REPO_TOKEN }}
        run: |
          # This step also runs after earlier failures, when results/ may be
          # missing or empty; skip instead of failing on the copy below.
          if ! compgen -G "results/*" > /dev/null; then
            echo "No results to publish; skipping push"
            exit 0
          fi
          # Clone the results repository
          git clone "https://x-access-token:${RESULTS_REPO_TOKEN}@github.com/FrontierCS/Frontier-CS-Result.git" /tmp/results-repo
          # Copy results
          cp -r results/* /tmp/results-repo/
          cd /tmp/results-repo
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add .
          # Commit only when the staged tree actually differs.
          git diff --staged --quiet || git commit -m "chore: update evaluation results $(date +%Y-%m-%d)"
          git push

      - name: Cleanup SkyPilot clusters
        if: always()
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcp-key.json
        run: |
          echo "Cleaning up SkyPilot clusters..."
          # Get all eval-* clusters and delete them in parallel
          CLUSTERS=$(uv run sky status --refresh 2>/dev/null | grep -E '^eval-' | awk '{print $1}' || true)
          if [ -n "$CLUSTERS" ]; then
            # Read from a here-string rather than a pipe: a piped `while` runs
            # in a subshell, so its background jobs would not be children of
            # this shell and the `wait` below would return immediately.
            while IFS= read -r cluster; do
              echo "Terminating cluster: $cluster"
              uv run sky down "$cluster" -y &
            done <<< "$CLUSTERS"
            # Block until every background `sky down` has finished.
            wait
          fi
          echo "Cleanup complete"