# Nightly Evaluations #4
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs the evaluation suite once per model in the matrix, on a nightly
# schedule and on manual dispatch, then uploads each model's JSON results
# and appends an aggregated report to the job summary.
name: 'Nightly Evaluations'

on:
  schedule:
    - cron: '0 1 * * *' # 1 AM UTC
  workflow_dispatch:
    inputs:
      # NOTE(review): no step in this workflow reads `inputs.iterations`;
      # confirm how `test:evals` expects to receive it before wiring it up.
      iterations:
        description: 'Number of iterations per test case'
        required: true
        default: '1'

jobs:
  evaluate:
    name: 'Evaluate ${{ matrix.model }}'
    runs-on: 'ubuntu-latest'
    permissions:
      contents: 'read'
    strategy:
      # Keep evaluating the remaining models when one matrix job fails; the
      # default (fail-fast: true) would cancel them and drop their results.
      fail-fast: false
      matrix:
        model:
          - 'gemini-3-pro-preview'
          - 'gemini-3-flash-preview'
          - 'gemini-2.5-pro'
          - 'gemini-2.5-flash'
          - 'gemini-2.5-flash-lite'
    steps:
      - name: 'Checkout code'
        uses: 'actions/checkout@v4' # ratchet:exclude

      - name: 'Set up Node.js'
        uses: 'actions/setup-node@v4' # ratchet:exclude
        with:
          node-version: '20'
          cache: 'npm'

      - name: 'Install dependencies'
        run: |
          npm ci

      - name: 'Install Gemini CLI'
        run: 'npm install -g @google/gemini-cli@latest'

      - name: 'Run Evaluations'
        env:
          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
          GEMINI_MODEL: '${{ matrix.model }}'
        # The model name reaches the shell through GEMINI_MODEL rather than
        # an inline expression, and the file name is quoted.
        run: |
          npm run test:evals -- --reporter=json --outputFile="eval-results-${GEMINI_MODEL}.json"

      # `always()` publishes whatever results exist even when the run failed.
      - name: 'Upload Results'
        if: 'always()'
        uses: 'actions/upload-artifact@v4' # ratchet:exclude
        with:
          name: 'eval-results-${{ matrix.model }}'
          path: 'eval-results-${{ matrix.model }}.json'

      - name: 'Job Summary'
        if: 'always()'
        env:
          # Same file the 'Run Evaluations' step writes for this model.
          RESULTS_FILE: 'eval-results-${{ matrix.model }}.json'
        run: |
          npx tsx scripts/aggregate_evals.ts "${RESULTS_FILE}" >> "$GITHUB_STEP_SUMMARY"