name: KernelBench Eval

on:
  pull_request:
    types: [labeled]
    paths:
      - 'submissions/*.json'
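
# The 'labeled' trigger plus the label gate inside the job means a maintainer
# must apply the 'evaluate' label before any GPU time is spent; removing and
# re-applying the label re-runs the evaluation.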
jobs:
  evaluate:
    # Only run when PR has 'evaluate' label
    if: github.event.label.name == 'evaluate'
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
          # Full history so origin/main exists for the git diff below
          # (actions/checkout defaults to a shallow, single-commit fetch).
          fetch-depth: 0

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install Dependencies
        run: |
          pip install modal pydra tqdm numpy tabulate datasets
          pip install -r KernelBench/requirements.txt

      - name: Setup Modal
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          modal token set --token-id "$MODAL_TOKEN_ID" --token-secret "$MODAL_TOKEN_SECRET"
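
      # The Modal token pair comes from repository secrets; `modal token set`
      # persists the credentials locally so the eval steps below can
      # authenticate against Modal.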

      - name: Checkout Submodule Branch
        run: |
          cd KernelBench
          git fetch origin leaderboard-analysis-json
          git checkout leaderboard-analysis-json
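
      # The KernelBench submodule is pinned to its leaderboard-analysis-json
      # branch, which presumably carries the eval/analysis scripts invoked below.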

      - name: Identify Submission File
        id: submission
        run: |
          # Diff against the merge-base with main so only files changed in
          # this PR are considered.
          FILE=$(git diff --name-only origin/main...HEAD | grep '^submissions/.*\.json$' | head -n 1)
          if [ -z "$FILE" ]; then
            echo "Error: No submission JSON found in PR"
            exit 1
          fi
          echo "file=$FILE" >> "$GITHUB_OUTPUT"
          NAME=$(basename "$FILE" .json)
          echo "name=$NAME" >> "$GITHUB_OUTPUT"

      - name: Validate Submission Format
        run: |
          python3 << 'EOF'
          import json
          import sys

          with open("${{ steps.submission.outputs.file }}", "r") as f:
              data = json.load(f)

          if "metadata" not in data or "kernels" not in data:
              print("Error: Missing 'metadata' or 'kernels' field")
              sys.exit(1)
          if "display_name" not in data["metadata"]:
              print("Error: Missing 'display_name' in metadata")
              sys.exit(1)

          # Count kernels per level; keys appear to follow the pattern
          # "level_<n>_<problem_id>_<problem_name>".
          level_counts = {1: 0, 2: 0, 3: 0}
          for key in data["kernels"]:
              parts = key.split("_")
              if len(parts) >= 4 and parts[0] == "level" and parts[1].isdigit():
                  level = int(parts[1])
                  if level in level_counts:
                      level_counts[level] += 1

          print(f"Submission: {data['metadata'].get('display_name', 'Unknown')}")
          print(f"Level 1: {level_counts[1]}/100, Level 2: {level_counts[2]}/100, Level 3: {level_counts[3]}/50")
          EOF

      - name: Create Data Directory
        run: mkdir -p data

      - name: Convert Submission to Run Format
        run: |
          python scripts/submission_to_run.py --submission "${{ steps.submission.outputs.file }}"
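
      # The three eval steps below run one KernelBench level each on
      # Modal-hosted H100s; num_gpu_devices=10 fans the kernels out across ten
      # GPUs in parallel and timeout=300 caps each kernel at five minutes
      # (flag semantics assumed from scripts/eval_from_generations.py).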

      - name: Evaluate Level 1
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level1 \
            level=1 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300

      - name: Evaluate Level 2
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level2 \
            level=2 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300

      - name: Evaluate Level 3
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level3 \
            level=3 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300
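
      # The analysis steps compare measured runtimes against the reference
      # timings in baselines/H100.json to compute speedup and fast_p metrics,
      # writing one JSON per level into data/ for the leaderboard update.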

      - name: Analyze Level 1
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level1 \
            level=1 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level1_results.json

      - name: Analyze Level 2
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level2 \
            level=2 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level2_results.json

      - name: Analyze Level 3
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level3 \
            level=3 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level3_results.json

      - name: Update Leaderboard
        run: |
          python scripts/update_leaderboard.py \
            --level1 data/level1_results.json \
            --level2 data/level2_results.json \
            --level3 data/level3_results.json \
            --submission "${{ steps.submission.outputs.file }}"
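
      # update_leaderboard.py is assumed to merge the per-level results into
      # data/metadata.json (read by the comment step below) and to populate
      # data/results/ for the artifact upload.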

      - name: Upload Results Artifact
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results
          path: |
            data/metadata.json
            data/results/*.json

      - name: Post Results Comment
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const metadata = JSON.parse(fs.readFileSync('data/metadata.json', 'utf8'));
            const runName = '${{ steps.submission.outputs.name }}';
            const entry = metadata.find(e => e.id === runName);
            if (!entry) {
              console.log('Entry not found in metadata.json');
              return;
            }
            const m = entry.metrics;
            const ls = entry.level_stats;
            // Table rows stay at the template literal's left margin so GitHub
            // does not render them as an indented code block.
            const body = `## KernelBench Evaluation Results

            | Metric | Value |
            |--------|-------|
            | **Display Name** | ${entry.display_name} |
            | **Hardware** | ${entry.hardware} |
            | **GeoMean Speedup** | ${m.geo_mean.toFixed(4)}x |
            | **Fast@1.0** | ${(m.fast_p_1_0 * 100).toFixed(1)}% |
            | **Fast@1.5** | ${(m.fast_p_1_5 * 100).toFixed(1)}% |
            | **Fast@2.0** | ${(m.fast_p_2_0 * 100).toFixed(1)}% |
            | **Compiled** | ${m.total_compiled}/${m.total_submitted} (${(m.compile_rate * 100).toFixed(1)}%) |
            | **Correct** | ${m.total_correct}/${m.total_submitted} (${(m.correct_rate * 100).toFixed(1)}%) |

            ### Per-Level Breakdown

            | Level | Evaluated | Compiled | Correct |
            |-------|-----------|----------|---------|
            | Level 1 | ${ls.level1.evaluated}/${ls.level1.expected} | ${ls.level1.compiled} | ${ls.level1.correct} |
            | Level 2 | ${ls.level2.evaluated}/${ls.level2.expected} | ${ls.level2.compiled} | ${ls.level2.correct} |
            | Level 3 | ${ls.level3.evaluated}/${ls.level3.expected} | ${ls.level3.compiled} | ${ls.level3.correct} |

            ---
            *Evaluated on ${entry.hardware} via Modal*
            `;
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body,
            });