name: KernelBench Eval

on:
  pull_request:
    types: [labeled]
    paths:
      - 'submissions/*.json'
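
# The 'labeled' trigger plus the label gate inside the job means a maintainer
# must apply the 'evaluate' label before any GPU time is spent; removing and
# re-applying the label re-runs the evaluation.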
jobs:
  evaluate:
    # Only run when PR has 'evaluate' label
    if: github.event.label.name == 'evaluate'
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
          # Full history so origin/main exists for the git diff below
          # (actions/checkout defaults to a shallow, single-commit fetch).
          fetch-depth: 0

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install Dependencies
        run: |
          pip install modal pydra tqdm numpy tabulate datasets
          pip install -r KernelBench/requirements.txt

      - name: Setup Modal
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          modal token set --token-id "$MODAL_TOKEN_ID" --token-secret "$MODAL_TOKEN_SECRET"
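
      # The Modal token pair comes from repository secrets; `modal token set`
      # persists the credentials locally so the eval steps below can
      # authenticate against Modal.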

      - name: Checkout Submodule Branch
        run: |
          cd KernelBench
          git fetch origin leaderboard-analysis-json
          git checkout leaderboard-analysis-json
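
      # The KernelBench submodule is pinned to its leaderboard-analysis-json
      # branch, which presumably carries the eval/analysis scripts invoked below.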

      - name: Identify Submission File
        id: submission
        run: |
          # Diff against the merge-base with main so only files changed in
          # this PR are considered.
          FILE=$(git diff --name-only origin/main...HEAD | grep '^submissions/.*\.json$' | head -n 1)
          if [ -z "$FILE" ]; then
            echo "Error: No submission JSON found in PR"
            exit 1
          fi
          echo "file=$FILE" >> "$GITHUB_OUTPUT"
          NAME=$(basename "$FILE" .json)
          echo "name=$NAME" >> "$GITHUB_OUTPUT"

      - name: Validate Submission Format
        run: |
          python3 << 'EOF'
          import json
          import sys

          with open("${{ steps.submission.outputs.file }}", "r") as f:
              data = json.load(f)

          if "metadata" not in data or "kernels" not in data:
              print("Error: Missing 'metadata' or 'kernels' field")
              sys.exit(1)
          if "display_name" not in data["metadata"]:
              print("Error: Missing 'display_name' in metadata")
              sys.exit(1)

          # Count kernels per level; keys appear to follow the pattern
          # "level_<n>_<problem_id>_<problem_name>".
          level_counts = {1: 0, 2: 0, 3: 0}
          for key in data["kernels"]:
              parts = key.split("_")
              if len(parts) >= 4 and parts[0] == "level" and parts[1].isdigit():
                  level = int(parts[1])
                  if level in level_counts:
                      level_counts[level] += 1

          print(f"Submission: {data['metadata'].get('display_name', 'Unknown')}")
          print(f"Level 1: {level_counts[1]}/100, Level 2: {level_counts[2]}/100, Level 3: {level_counts[3]}/50")
          EOF

      - name: Create Data Directory
        run: mkdir -p data

      - name: Convert Submission to Run Format
        run: |
          python scripts/submission_to_run.py --submission "${{ steps.submission.outputs.file }}"
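
      # The three eval steps below run one KernelBench level each on
      # Modal-hosted H100s; num_gpu_devices=10 fans the kernels out across ten
      # GPUs in parallel and timeout=300 caps each kernel at five minutes
      # (flag semantics assumed from scripts/eval_from_generations.py).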

      - name: Evaluate Level 1
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level1 \
            level=1 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300

      - name: Evaluate Level 2
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level2 \
            level=2 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300

      - name: Evaluate Level 3
        working-directory: KernelBench
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          python scripts/eval_from_generations.py \
            run_name=${{ steps.submission.outputs.name }}_level3 \
            level=3 \
            eval_mode=modal \
            gpu=H100 \
            dataset_src=local \
            num_gpu_devices=10 \
            timeout=300
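
      # The analysis steps compare measured runtimes against the reference
      # timings in baselines/H100.json to compute speedup and fast_p metrics,
      # writing one JSON per level into data/ for the leaderboard update.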

      - name: Analyze Level 1
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level1 \
            level=1 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level1_results.json

      - name: Analyze Level 2
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level2 \
            level=2 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level2_results.json

      - name: Analyze Level 3
        working-directory: KernelBench
        run: |
          python scripts/benchmark_eval_analysis.py \
            run_name=${{ steps.submission.outputs.name }}_level3 \
            level=3 \
            hardware=H100 \
            baseline=baseline \
            baseline_file=${{ github.workspace }}/baselines/H100.json \
            output_file=${{ github.workspace }}/data/level3_results.json

      - name: Update Leaderboard
        run: |
          python scripts/update_leaderboard.py \
            --level1 data/level1_results.json \
            --level2 data/level2_results.json \
            --level3 data/level3_results.json \
            --submission "${{ steps.submission.outputs.file }}"
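
      # update_leaderboard.py is assumed to merge the per-level results into
      # data/metadata.json (read by the comment step below) and to populate
      # data/results/ for the artifact upload.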

      - name: Upload Results Artifact
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results
          path: |
            data/metadata.json
            data/results/*.json

      - name: Post Results Comment
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const metadata = JSON.parse(fs.readFileSync('data/metadata.json', 'utf8'));
            const runName = '${{ steps.submission.outputs.name }}';
            const entry = metadata.find(e => e.id === runName);
            if (!entry) {
              console.log('Entry not found in metadata.json');
              return;
            }
            const m = entry.metrics;
            const ls = entry.level_stats;
            // Table rows stay at the template literal's left margin so GitHub
            // does not render them as an indented code block.
            const body = `## KernelBench Evaluation Results

            | Metric | Value |
            |--------|-------|
            | **Display Name** | ${entry.display_name} |
            | **Hardware** | ${entry.hardware} |
            | **GeoMean Speedup** | ${m.geo_mean.toFixed(4)}x |
            | **Fast@1.0** | ${(m.fast_p_1_0 * 100).toFixed(1)}% |
            | **Fast@1.5** | ${(m.fast_p_1_5 * 100).toFixed(1)}% |
            | **Fast@2.0** | ${(m.fast_p_2_0 * 100).toFixed(1)}% |
            | **Compiled** | ${m.total_compiled}/${m.total_submitted} (${(m.compile_rate * 100).toFixed(1)}%) |
            | **Correct** | ${m.total_correct}/${m.total_submitted} (${(m.correct_rate * 100).toFixed(1)}%) |

            ### Per-Level Breakdown

            | Level | Evaluated | Compiled | Correct |
            |-------|-----------|----------|---------|
            | Level 1 | ${ls.level1.evaluated}/${ls.level1.expected} | ${ls.level1.compiled} | ${ls.level1.correct} |
            | Level 2 | ${ls.level2.evaluated}/${ls.level2.expected} | ${ls.level2.compiled} | ${ls.level2.correct} |
            | Level 3 | ${ls.level3.evaluated}/${ls.level3.expected} | ${ls.level3.compiled} | ${ls.level3.correct} |

            ---
            *Evaluated on ${entry.hardware} via Modal*
            `;
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body,
            });