Skip to content

Commit 5b83e83

Browse files
committed
Automated Claude Code usage for eval test generation
1 parent 612c12a commit 5b83e83

File tree

17 files changed

+2986
-22
lines changed

17 files changed

+2986
-22
lines changed

evals/CLAUDE.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,13 @@ cp .env.example .env
5858
# Navigate to native runner directory
5959
cd native
6060

61-
# Run a specific evaluation by path (relative to data/)
61+
# Run a specific evaluation by file path (relative to data/)
6262
python3 run.py --path test-simple/math-001.yaml
6363

64+
# Run a specific evaluation by directory path (NEW: auto-detects task.yaml)
65+
python3 run.py --path js-verifier/action/dropdown
66+
python3 run.py --path js-verifier/action/daterange --verbose
67+
6468
# Run with verbose output (shows input, response, reasoning, screenshots)
6569
python3 run.py --path action-agent/accordion-001.yaml --verbose
6670

evals/config.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Evaluation Framework Configuration
22
# This configuration is shared across all evaluation runner scripts
3-
# Configuration for OpenAI models
3+
# Supports OpenAI, Groq, OpenRouter, and LiteLLM providers
44

55
# API endpoint for the evaluation server
66
api_endpoint: "http://localhost:8080"
@@ -30,7 +30,7 @@ judge_model:
3030
provider: "openai"
3131
model_name: "gpt-5"
3232
api_key: "${OPENAI_API_KEY}"
33-
# temperature: 0.1 # GPT-5 doesn't support custom temperature
33+
3434

3535
# Execution settings
3636

evals/eval_builder_snapshots.py

Lines changed: 78 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import yaml
2323
import requests
2424
import time
25+
import subprocess
2526
from pathlib import Path
2627
from typing import Dict, Any, Optional
2728
from difflib import unified_diff
@@ -508,15 +509,87 @@ async def step_7_generate_validation(self):
508509
validation_file = f"{snapshot_dir}/verify.js"
509510

510511
print("Options:")
511-
print("1. Wait for Claude Code to create verify.js (recommended)")
512-
print("2. Enter validation JavaScript manually")
512+
print("1. Auto-run Claude Code subprocess (recommended)")
513+
print("2. Wait for Claude Code manually (you run it)")
514+
print("3. Enter validation JavaScript manually")
513515
print()
514516

515-
choice = input("Choice (1/2): ").strip()
517+
choice = input("Choice (1/2/3): ").strip()
516518

517519
lines = []
518520

519521
if choice == '1':
522+
# Automatically spawn Claude Code subprocess
523+
print(f"\n🤖 Launching Claude Code subprocess...")
524+
print()
525+
526+
# Construct the prompt for Claude Code
527+
claude_prompt = f"Read @{marker_file} and complete the task described there. Generate the validation JavaScript and save it to {validation_file}. Test it on both tabs as instructed."
528+
529+
try:
530+
# Call Claude Code CLI with --dangerously-skip-permissions for auto-accept
531+
result = subprocess.run(
532+
['claude', '--dangerously-skip-permissions', claude_prompt],
533+
cwd=os.getcwd(),
534+
capture_output=True,
535+
text=True,
536+
timeout=300 # 5 minute timeout
537+
)
538+
539+
print("Claude Code output:")
540+
print("─" * 60)
541+
print(result.stdout)
542+
if result.stderr:
543+
print("Errors:")
544+
print(result.stderr)
545+
print("─" * 60)
546+
print()
547+
548+
# Check if verify.js was created
549+
if os.path.exists(validation_file):
550+
print("✅ Validation file detected!")
551+
with open(validation_file, 'r') as f:
552+
js_code = f.read().strip()
553+
554+
# Clean up if it has markdown code blocks
555+
if js_code.startswith('```'):
556+
lines_raw = js_code.split('\n')
557+
if lines_raw[0].startswith('```'):
558+
lines_raw = lines_raw[1:]
559+
if lines_raw[-1].startswith('```'):
560+
lines_raw = lines_raw[:-1]
561+
js_code = '\n'.join(lines_raw).strip()
562+
563+
print()
564+
print("📝 Loaded validation code:")
565+
print("─" * 60)
566+
print(js_code[:300] + "..." if len(js_code) > 300 else js_code)
567+
print("─" * 60)
568+
569+
lines = js_code.split('\n')
570+
else:
571+
print(f"⚠️ Claude Code ran but {validation_file} was not created")
572+
print("Falling back to manual entry...")
573+
choice = '3'
574+
lines = []
575+
576+
except subprocess.TimeoutExpired:
577+
print("⏱️ Claude Code subprocess timed out (5 minutes)")
578+
print("Falling back to manual entry...")
579+
choice = '3'
580+
lines = []
581+
except FileNotFoundError:
582+
print("❌ 'claude' command not found. Is Claude Code installed?")
583+
print("Falling back to manual entry...")
584+
choice = '3'
585+
lines = []
586+
except Exception as e:
587+
print(f"❌ Error running Claude Code: {e}")
588+
print("Falling back to manual entry...")
589+
choice = '3'
590+
lines = []
591+
592+
elif choice == '2':
520593
print(f"\n⏳ Waiting for {validation_file} to be created...")
521594
print(" (Claude Code will create this file)")
522595
print()
@@ -557,9 +630,9 @@ async def step_7_generate_validation(self):
557630
if not lines:
558631
print("⏱️ Timeout waiting for validation file")
559632
print(" Falling back to manual entry...")
560-
choice = '2'
633+
choice = '3'
561634

562-
if choice == '2':
635+
if choice == '3':
563636
print("\nEnter validation JavaScript (type 'END' on new line when done):\n")
564637
while True:
565638
line = input()

evals/native/data/js-verifier/action/agent-datepicker/task.yaml renamed to evals/native/data/js-verifier/action/datepicker/task.yaml

File renamed without changes.

evals/native/data/js-verifier/action/date-range/CLAUDE_REQUEST.md renamed to evals/native/data/js-verifier/action/daterange/CLAUDE_REQUEST.md

File renamed without changes.

evals/native/data/js-verifier/action/date-range/after.html renamed to evals/native/data/js-verifier/action/daterange/after.html

File renamed without changes.

evals/native/data/js-verifier/action/date-range/before.html renamed to evals/native/data/js-verifier/action/daterange/before.html

File renamed without changes.

evals/native/data/js-verifier/action/date-range/diff.txt renamed to evals/native/data/js-verifier/action/daterange/diff.txt

File renamed without changes.

evals/native/data/js-verifier/action/date-range/task.yaml renamed to evals/native/data/js-verifier/action/daterange/task.yaml

File renamed without changes.

evals/native/data/js-verifier/action/date-range/verify.js renamed to evals/native/data/js-verifier/action/daterange/verify.js

File renamed without changes.

0 commit comments

Comments (0)