Skip to content

Nightly Evaluations

Nightly Evaluations #7

Workflow file for this run

name: 'Nightly Evaluations'

# Triggers: a daily scheduled run plus an on-demand manual run.
on:
  schedule:
    # Every day at 1 AM UTC.
    - cron: '0 1 * * *'
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads this input;
      # confirm whether it should be forwarded to the evaluation run.
      iterations:
        description: 'Number of iterations per test case'
        required: true
        default: '1'
jobs:
  # Runs the evaluation suite once per model in the matrix, then publishes
  # that model's JSON report as an artifact and as a job summary.
  evaluate:
    runs-on: 'ubuntu-latest'
    permissions:
      contents: 'read'
    strategy:
      # Keep evaluating the remaining models when one model's job fails.
      # The default (fail-fast: true) cancels in-progress and queued
      # matrix jobs as soon as any single one fails.
      fail-fast: false
      matrix:
        model:
          - 'gemini-3-pro-preview'
          - 'gemini-3-flash-preview'
          - 'gemini-2.5-pro'
          - 'gemini-2.5-flash'
          - 'gemini-2.5-flash-lite'
    name: 'Evaluate ${{ matrix.model }}'
    steps:
      - name: 'Checkout code'
        uses: 'actions/checkout@v4' # ratchet:exclude

      - name: 'Set up Node.js'
        uses: 'actions/setup-node@v4' # ratchet:exclude
        with:
          node-version: '20'
          cache: 'npm'

      - name: 'Install dependencies'
        run: |
          npm ci

      # Installs the latest published CLI globally on the runner.
      - name: 'Install Gemini CLI'
        run: 'npm install -g @google/gemini-cli@latest'

      # Writes a JSON report named after the model under evaluation.
      - name: 'Run Evaluations'
        env:
          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
          GEMINI_MODEL: '${{ matrix.model }}'
        run: |
          npm run test:evals -- --reporter=json --outputFile="eval-results-${{ matrix.model }}.json"

      # always(): still attempt the upload when an earlier step failed.
      - name: 'Upload Results'
        if: 'always()'
        uses: 'actions/upload-artifact@v4' # ratchet:exclude
        with:
          name: 'eval-results-${{ matrix.model }}'
          path: 'eval-results-${{ matrix.model }}.json'

      # Appends the aggregation script's stdout to this job's summary;
      # always(): runs even when an earlier step failed.
      - name: 'Job Summary'
        if: 'always()'
        run: |
          npx tsx scripts/aggregate_evals.ts "eval-results-${{ matrix.model }}.json" >> "$GITHUB_STEP_SUMMARY"