bumping sumboudle again #10

Workflow file for this run

name: KernelBench Eval
on:
  pull_request:
    types: [labeled]
    paths:
      - 'submissions/*.json'
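# Fires each time a label is added to a PR that touches a submission JSON;
# the label-name check itself happens in the job-level 'if' below.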
jobs:
  evaluate:
    # Only run when the PR has the 'evaluate' label
    if: github.event.label.name == 'evaluate'
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
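    # pull-requests: write is needed for the results comment posted below;
    # contents: write is presumably for the leaderboard update.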
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
          # Full history, so 'git diff origin/main' in the identify step can
          # see the base branch (the default shallow fetch omits it)
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Install Dependencies
        run: |
          pip install modal pydra tqdm numpy tabulate datasets
          pip install -r KernelBench/requirements.txt
      - name: Setup Modal
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          modal token set --token-id $MODAL_TOKEN_ID --token-secret $MODAL_TOKEN_SECRET
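      # 'modal token set' persists the credentials on the runner; the eval
      # steps below also export MODAL_TOKEN_ID/MODAL_TOKEN_SECRET, which the
      # Modal client reads directly, so either mechanism would suffice.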
      - name: Checkout Submodule Branch
        run: |
          cd KernelBench
          git fetch origin leaderboard-analysis-json
          git checkout leaderboard-analysis-json
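      # Assumption: this branch of the KernelBench submodule carries the eval
      # and analysis scripts invoked below.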
      - name: Identify Submission File
        id: submission
        run: |
          FILE=$(git diff --name-only origin/main | grep 'submissions/.*\.json' | head -n 1)
          if [ -z "$FILE" ]; then
            echo "Error: No submission JSON found in PR"
            exit 1
          fi
          echo "file=$FILE" >> "$GITHUB_OUTPUT"
          NAME=$(basename "$FILE" .json)
          echo "name=$NAME" >> "$GITHUB_OUTPUT"
      - name: Validate Submission Format
        run: |
          python3 << 'EOF'
          import json
          import sys

          with open("${{ steps.submission.outputs.file }}", "r") as f:
              data = json.load(f)
          if "metadata" not in data or "kernels" not in data:
              print("Error: Missing 'metadata' or 'kernels' field")
              sys.exit(1)
          if "display_name" not in data["metadata"]:
              print("Error: Missing 'display_name' in metadata")
              sys.exit(1)
          # Count kernels per level from keys of the form 'level_<n>_...';
          # guard against non-numeric level parts before int()
          level_counts = {1: 0, 2: 0, 3: 0}
          for key in data["kernels"]:
              parts = key.split("_")
              if len(parts) >= 4 and parts[0] == "level" and parts[1].isdigit():
                  level = int(parts[1])
                  if level in level_counts:
                      level_counts[level] += 1
          print(f"Submission: {data['metadata'].get('display_name', 'Unknown')}")
          print(f"Level 1: {level_counts[1]}/100, Level 2: {level_counts[2]}/100, Level 3: {level_counts[3]}/50")
          EOF
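      # Illustrative minimal shape the validator accepts (key format inferred
      # from the parsing above; the values are not checked here):
      #   {
      #     "metadata": {"display_name": "My Submission"},
      #     "kernels": {"level_1_1_example": "<kernel source>"}
      #   }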
      - name: Create Data Directory
        run: mkdir -p data
      - name: Convert Submission to Run Format
        run: |
          python scripts/submission_to_run.py --submission ${{ steps.submission.outputs.file }}
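      # Presumably unpacks the submission JSON into the per-level run layout
      # that eval_from_generations.py expects under the run names used below.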
      - name: Evaluate Level 1
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level1 \
            level=1 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300
      - name: Evaluate Level 2
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level2 \
            level=2 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300
      - name: Evaluate Level 3
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level3 \
            level=3 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300
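      # The three eval steps differ only in the level number; a job-level
      # matrix (strategy.matrix.level: [1, 2, 3]) would be a possible
      # refactor, at the cost of repeating the setup steps per matrix job.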
      - name: Analyze Level 1
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level1 \
            level=1 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level1_results.json
      - name: Analyze Level 2
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level2 \
            level=2 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level2_results.json
      - name: Analyze Level 3
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level3 \
            level=3 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level3_results.json
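      # Each analysis compares the run against the committed H100 baseline
      # timings and writes data/level<n>_results.json for the leaderboard step.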
      - name: Update Leaderboard
        run: |
          python scripts/update_leaderboard.py \
            --level1 data/level1_results.json \
            --level2 data/level2_results.json \
            --level3 data/level3_results.json \
            --submission ${{ steps.submission.outputs.file }}
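      # update_leaderboard.py is expected to merge the per-level results into
      # data/metadata.json, which both the artifact upload and the PR comment
      # below consume.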
      - name: Upload Results Artifact
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results
          path: |
            data/metadata.json
            data/results/*.json
      - name: Post Results Comment
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const metadata = JSON.parse(fs.readFileSync('data/metadata.json', 'utf8'));
            const runName = '${{ steps.submission.outputs.name }}';
            const entry = metadata.find(e => e.id === runName);
            if (!entry) {
              console.log('Entry not found in metadata.json');
              return;
            }
            const m = entry.metrics;
            const ls = entry.level_stats;
            // Keep every line of this template literal indented to the block
            // scalar's level; a dedented line ends the YAML block and makes
            // the workflow file invalid.
            const body = `## KernelBench Evaluation Results
            | Metric | Value |
            |--------|-------|
            | **Display Name** | ${entry.display_name} |
            | **Hardware** | ${entry.hardware} |
            | **GeoMean Speedup** | ${m.geo_mean.toFixed(4)}x |
            | **Fast@1.0** | ${(m.fast_p_1_0 * 100).toFixed(1)}% |
            | **Fast@1.5** | ${(m.fast_p_1_5 * 100).toFixed(1)}% |
            | **Fast@2.0** | ${(m.fast_p_2_0 * 100).toFixed(1)}% |
            | **Compiled** | ${m.total_compiled}/${m.total_submitted} (${(m.compile_rate * 100).toFixed(1)}%) |
            | **Correct** | ${m.total_correct}/${m.total_submitted} (${(m.correct_rate * 100).toFixed(1)}%) |

            ### Per-Level Breakdown
            | Level | Evaluated | Compiled | Correct |
            |-------|-----------|----------|---------|
            | Level 1 | ${ls.level1.evaluated}/${ls.level1.expected} | ${ls.level1.compiled} | ${ls.level1.correct} |
            | Level 2 | ${ls.level2.evaluated}/${ls.level2.expected} | ${ls.level2.compiled} | ${ls.level2.correct} |
            | Level 3 | ${ls.level3.evaluated}/${ls.level3.expected} | ${ls.level3.compiled} | ${ls.level3.correct} |

            ---
            *Evaluated on ${entry.hardware} via Modal*
            `;
            // Await the API call so the action does not exit before it completes
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });