Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 106 additions & 15 deletions .github/workflows/llm-benchmark-periodic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,23 @@ name: Periodic LLM benchmarks

on:
schedule:
# Daily at midnight UTC. Change to '0 */6 * * *' for every 6h,
# or '0 */4 * * *' for every 4h.
- cron: '0 0 * * *'
# Weekly on Monday at midnight UTC.
- cron: '0 0 * * 1'
workflow_dispatch:
inputs:
model_set:
description: 'Model set to run'
required: false
type: choice
options:
- website_active
- local_defaults
- explicit
default: website_active
models:
description: 'Models to run (provider:model format, comma-separated, or "all")'
description: 'Space-separated provider:model groups. Required when model_set=explicit.'
required: false
default: 'all'
default: ''
languages:
description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)'
required: false
Expand All @@ -19,12 +27,24 @@ on:
description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)'
required: false
default: 'guidelines,no_context'
categories:
description: 'Optional benchmark categories to run (comma-separated)'
required: false
default: ''
tasks:
description: 'Optional benchmark task ids/selectors to run (comma-separated)'
required: false
default: ''
dry_run:
description: 'Run benchmarks without uploading results'
required: false
default: 'false'

permissions:
contents: read

concurrency:
group: llm-benchmark-periodic
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
cancel-in-progress: true

jobs:
Expand All @@ -33,10 +53,9 @@ jobs:
timeout-minutes: 180

steps:
- name: Checkout master
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: master
fetch-depth: 1

- uses: dtolnay/rust-toolchain@stable
Expand All @@ -45,7 +64,7 @@ jobs:
- name: Setup .NET SDK
uses: actions/setup-dotnet@v4
with:
dotnet-version: "8.0.x"
global-json-file: global.json

- name: Install WASI workload
env:
Expand All @@ -55,13 +74,28 @@ jobs:
run: |
dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

- name: Pack C# runtime packages
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'csharp') }}
run: |
dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
dotnet pack -c Release crates/bindings-csharp/Runtime

- name: Set up Node.js
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
uses: actions/setup-node@v4
with:
node-version: 22

- name: Install pnpm
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
uses: ./.github/actions/setup-pnpm
with:
run_install: true

- name: Build TypeScript SDK
if: ${{ github.event_name != 'workflow_dispatch' || contains(inputs.languages || 'rust,csharp,typescript', 'typescript') }}
run: pnpm build
working-directory: crates/bindings-typescript

- name: Build llm-benchmark tool
run: cargo install --path tools/xtask-llm-benchmark --locked
Expand All @@ -78,30 +112,87 @@ jobs:
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
LLM_VENDOR: openrouter
LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
DOTNET_MULTILEVEL_LOOKUP: "0"
DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
MSBUILDDISABLENODEREUSE: "1"
DOTNET_CLI_USE_MSBUILD_SERVER: "0"
LLM_BENCH_CSHARP_CONCURRENCY: "1"
INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
INPUT_MODELS: ${{ inputs.models || 'all' }}
INPUT_MODEL_SET: ${{ inputs.model_set || 'website_active' }}
INPUT_MODELS: ${{ inputs.models || '' }}
INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
INPUT_CATEGORIES: ${{ inputs.categories || '' }}
INPUT_TASKS: ${{ inputs.tasks || '' }}
INPUT_DRY_RUN: ${{ inputs.dry_run || 'false' }}
run: |
LANGS="$INPUT_LANGUAGES"
MODEL_SET="$INPUT_MODEL_SET"
MODELS="$INPUT_MODELS"
MODES="$INPUT_MODES"
CATEGORIES="$INPUT_CATEGORIES"
TASKS="$INPUT_TASKS"
DRY_RUN="$INPUT_DRY_RUN"

# Validate the model_set / models combination before any benchmark runs.
# Only model_set=explicit accepts a models list; every violation prints a
# GitHub Actions ::error:: line and exits with status 1.
case "$MODEL_SET" in
  website_active | local_defaults)
    # Neither of these sets takes a models list, so a non-empty one is rejected.
    if [ -n "$MODELS" ]; then
      echo "::error::models is only valid when model_set=explicit"
      exit 1
    fi
    ;;
  explicit)
    if [ -z "$MODELS" ]; then
      echo "::error::models is required when model_set=explicit"
      exit 1
    fi
    # Split the whitespace-separated value into the MODEL_ARGS array.
    read -r -a MODEL_ARGS <<< "$MODELS"
    ;;
  *)
    echo "::error::unknown model_set '$MODEL_SET' (expected website_active, local_defaults, or explicit)"
    exit 1
    ;;
esac

SUCCEEDED=0
FAILED=0
for LANG in $(echo "$LANGS" | tr ',' ' '); do
if [ "$MODELS" = "all" ]; then
if llm_benchmark run --lang "$LANG" --modes "$MODES"; then
EXTRA_ARGS=()
if [ -n "$CATEGORIES" ]; then
EXTRA_ARGS+=(--categories "$CATEGORIES")
fi
if [ -n "$TASKS" ]; then
EXTRA_ARGS+=(--tasks "$TASKS")
fi
if [ "$DRY_RUN" = "true" ]; then
EXTRA_ARGS+=(--dry-run)
fi

if [ "$MODEL_SET" = "website_active" ]; then
if llm_benchmark run --lang "$LANG" --modes "$MODES" --model-source remote "${EXTRA_ARGS[@]}"; then
SUCCEEDED=$((SUCCEEDED + 1))
else
echo "::warning::Benchmark run failed for lang=$LANG"
FAILED=$((FAILED + 1))
fi
elif [ "$MODEL_SET" = "local_defaults" ]; then
if llm_benchmark run --lang "$LANG" --modes "$MODES" "${EXTRA_ARGS[@]}"; then
SUCCEEDED=$((SUCCEEDED + 1))
else
echo "::warning::Benchmark run failed for lang=$LANG"
FAILED=$((FAILED + 1))
fi
else
if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "$MODELS"; then
if llm_benchmark run --lang "$LANG" --modes "$MODES" --models "${MODEL_ARGS[@]}" "${EXTRA_ARGS[@]}"; then
SUCCEEDED=$((SUCCEEDED + 1))
else
echo "::warning::Benchmark run failed for lang=$LANG models=$MODELS"
Expand All @@ -110,7 +201,7 @@ jobs:
fi
done
echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed"
if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then
echo "::error::All benchmark runs failed"
if [ "$FAILED" -gt 0 ]; then
echo "::error::$FAILED benchmark run(s) failed"
exit 1
fi
43 changes: 35 additions & 8 deletions .github/workflows/llm-benchmark-validate-goldens.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,26 @@ name: Validate LLM benchmark golden answers

on:
schedule:
# Nightly at 2 AM UTC
- cron: '0 2 * * *'
workflow_dispatch: {}
# Weekly on Monday at 2 AM UTC.
- cron: '0 2 * * 1'
workflow_dispatch:
inputs:
lang:
description: 'Language to validate for manual smoke runs'
required: false
type: choice
default: all
options:
- all
- rust
- csharp
- typescript

permissions:
contents: read

concurrency:
group: llm-benchmark-validate-goldens
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
cancel-in-progress: true

jobs:
Expand All @@ -21,13 +32,12 @@ jobs:
strategy:
fail-fast: false
matrix:
lang: [rust, csharp, typescript]
lang: ${{ fromJSON(github.event_name == 'workflow_dispatch' && inputs.lang != 'all' && format('["{0}"]', inputs.lang) || '["rust","csharp","typescript"]') }}

steps:
- name: Checkout master
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: master
fetch-depth: 1

- uses: dtolnay/rust-toolchain@stable
Expand All @@ -37,7 +47,7 @@ jobs:
if: matrix.lang == 'csharp'
uses: actions/setup-dotnet@v4
with:
dotnet-version: "8.0.x"
global-json-file: global.json

- name: Install WASI workload
if: matrix.lang == 'csharp'
Expand All @@ -48,6 +58,12 @@ jobs:
run: |
dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

- name: Pack C# runtime packages
if: matrix.lang == 'csharp'
run: |
dotnet pack -c Release crates/bindings-csharp/BSATN.Runtime
dotnet pack -c Release crates/bindings-csharp/Runtime

- name: Set up Node.js
if: matrix.lang == 'typescript'
uses: actions/setup-node@v4
Expand All @@ -57,6 +73,13 @@ jobs:
- name: Install pnpm
if: matrix.lang == 'typescript'
uses: ./.github/actions/setup-pnpm
with:
run_install: true

- name: Build TypeScript SDK
if: matrix.lang == 'typescript'
run: pnpm build
working-directory: crates/bindings-typescript

- name: Build llm-benchmark tool
run: cargo install --path tools/xtask-llm-benchmark --locked
Expand All @@ -70,7 +93,11 @@ jobs:

- name: Validate golden answers (${{ matrix.lang }})
env:
DOTNET_MULTILEVEL_LOOKUP: "0"
DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
MSBUILDDISABLENODEREUSE: "1"
DOTNET_CLI_USE_MSBUILD_SERVER: "0"
LLM_BENCH_CSHARP_CONCURRENCY: "1"
run: |
llm_benchmark run --goldens-only --lang ${{ matrix.lang }}
Loading
Loading