Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 13 additions & 14 deletions evals/config.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Evaluation Framework Configuration
# This configuration is shared across all evaluation runner scripts
# Supports OpenAI, Groq, OpenRouter, and LiteLLM providers
# Example configuration for Cerebras models

# API endpoint for the evaluation server
api_endpoint: "http://localhost:8080"
Expand All @@ -9,28 +9,27 @@ api_endpoint: "http://localhost:8080"
# These models are sent to the agent for processing requests

main_model:
provider: "openai"
model_name: "gpt-5-mini"
api_key: "${OPENAI_API_KEY}"
provider: "cerebras"
model_name: "zai-glm-4.6"
api_key: "${CEREBRAS_API_KEY}"

mini_model:
provider: "openai"
model_name: "gpt-5-nano"
api_key: "${OPENAI_API_KEY}"
provider: "cerebras"
model_name: "zai-glm-4.6"
api_key: "${CEREBRAS_API_KEY}"

nano_model:
provider: "openai"
model_name: "gpt-5-nano"
api_key: "${OPENAI_API_KEY}"
provider: "cerebras"
model_name: "zai-glm-4.6"
api_key: "${CEREBRAS_API_KEY}"

# Model configuration for judging evaluation responses
# This model is used locally to assess the quality of agent responses

judge_model:
provider: "openai"
model_name: "gpt-5"
api_key: "${OPENAI_API_KEY}"

provider: "cerebras"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use OpenAI as the judge so that we can compare with other open-source benchmarks

model_name: "zai-glm-4.6"
api_key: "${CEREBRAS_API_KEY}"

# Execution settings

Expand Down
72 changes: 61 additions & 11 deletions evals/eval_builder_snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,30 @@ async def step_3_open_browser(self):
print("⏳ Waiting for page to load...")
await asyncio.sleep(3)
print("✅ Page loaded")
print(" 💡 This tab will be used for capturing BEFORE/AFTER snapshots")

except requests.exceptions.RequestException as e:
print(f"❌ Browser error: {e}")
print(" Make sure BrowserOperator is running at http://localhost:8080")
sys.exit(1)

async def _get_browser_client(self):
"""Get browser client without opening a tab. Used in extend mode."""
print("\n🔍 Getting browser client...\n")

try:
resp = requests.get(f"{self.api_base}/clients", timeout=5)
resp.raise_for_status()
clients = resp.json()

if not clients:
print("❌ No browser clients. Is BrowserOperator running?")
print(" Start it: cd deployments/local && make compose-up")
sys.exit(1)

self.client_id = clients[0]['id']
print(f"✅ Client: {self.client_id}")
print(" 💡 Tabs will be opened on-demand for each example")

except requests.exceptions.RequestException as e:
print(f"❌ Browser error: {e}")
Expand Down Expand Up @@ -1257,7 +1281,7 @@ async def run_extend(self):
"""Extend workflow for refining existing verify.js with additional examples."""
print("\n🔄 Extend Mode: Refine verify.js with additional examples\n")
print("This workflow:")
print("1. Opens browser to target URL")
print("1. Connects to browser (tabs opened per-example)")
print("2. You add positive/negative examples")
print("3. Each example tests current verify.js")
print("4. If wrong, Claude Code adjusts the script")
Expand All @@ -1269,25 +1293,51 @@ async def run_extend(self):
# Load existing task.yaml
await self.step_1_load_file()

# Open browser to target URL
await self.step_3_open_browser()
# Get browser client (without opening a tab yet)
await self._get_browser_client()

# Capture or load baseline snapshot
if self.example_manager.index['baseline']:
print("📂 Loading existing baseline snapshot...")
print("\n📂 Loading existing baseline snapshot...")
baseline_snapshot = self.example_manager.get_baseline_snapshot()
print(f"✅ Loaded baseline from previous session")
else:
print("📸 Capturing baseline snapshot (initial page state)...")
baseline_snapshot = self._capture_dom_snapshot("BASELINE")
if baseline_snapshot:
self.example_manager.save_baseline(self.client_id, self.tab_id, baseline_snapshot)
print(f"✅ Baseline saved")
else:
print("❌ Failed to capture baseline snapshot")
# Need to open a temporary tab to capture baseline
print("\n📸 Capturing baseline snapshot (initial page state)...")
print(" Opening temporary tab for baseline capture...")
url = self.eval_data['target']['url']
try:
resp = requests.post(
f"{self.api_base}/tabs/open",
json={"clientId": self.client_id, "url": url, "background": False},
timeout=10
)
resp.raise_for_status()
result = resp.json()
baseline_tab_id = result['tabId']
print(f" ✅ Baseline tab opened: {baseline_tab_id}")
await asyncio.sleep(3) # Wait for page load

# Capture baseline using the temporary tab
self.tab_id = baseline_tab_id # Temporarily set for _capture_dom_snapshot
baseline_snapshot = self._capture_dom_snapshot("BASELINE")
if baseline_snapshot:
self.example_manager.save_baseline(self.client_id, baseline_tab_id, baseline_snapshot)
print(f"✅ Baseline saved")
else:
print("❌ Failed to capture baseline snapshot")
return
self.tab_id = None # Clear since we don't need it

except requests.exceptions.RequestException as e:
print(f"❌ Failed to open baseline tab: {e}")
return

# Main loop for adding examples
print("\n" + "=" * 60)
print("💡 Ready to add examples. Each example opens a fresh tab.")
print("=" * 60)

while True:
result = await self._add_example_interactive(baseline_snapshot)
if not result:
Expand Down
32 changes: 32 additions & 0 deletions evals/native/data/js-verifier/public/ana-airlines/task.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
id: ana-airlines-flights-001
name: Test ANA Airlines Flights Search Functionality
description: Find a roundtrip flight from Singapore to Tokyo from April 10 to April
20 for 1 adult
enabled: true
target:
url: https://www.ana.co.jp/en/us/
wait_for: networkidle
wait_timeout: 5000
timeout: 45000
input:
objective: Find a roundtrip flight from Singapore to Tokyo from April 10 to April
20 for 1 adult
validation:
type: js-eval
js-eval:
script: verify.js
expected_result: true
timeout: 5000
metadata:
tags:
- flights
- form
- kayak
- action
- google-flights
priority: high
timeout: 45000
retries: 2
flaky: false
owner: devtools-team
tool: action_agent
26 changes: 26 additions & 0 deletions evals/native/data/js-verifier/public/ana-airlines/verify.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Validation for: Find a roundtrip flight from Singapore to Tokyo from April 10 to April 20 for 1 adult
// This checks for the presence of flight search results with the correct parameters

(() => {
  // Verifies that the current page is an ANA flight-search results page for:
  // Singapore -> Tokyo, round trip, April 10 to April 20, 1 adult, economy.
  // Evaluates to a strict boolean (true only when every check passes).

  // True only when the element matched by `selector` exists and its value
  // equals `expected`. Always yields a boolean, so a missing element gives
  // `false` rather than leaking `null` into the final result.
  const inputEquals = (selector, expected) => {
    const el = document.querySelector(selector);
    return el !== null && el.value === expected;
  };

  // The search parameters are read from hidden inputs on the results page.
  const hasCorrectRoute = inputEquals('#criteo_segment_code', 'SIN_TYO');
  // NOTE(review): the year is pinned to 2026 although the task text names no
  // year — confirm this matches what the site selects by default.
  const hasCorrectDeparture = inputEquals('#criteo_departure_date', '20260410');
  const hasCorrectArrival = inputEquals('#criteo_arrival_date', '20260420');
  const hasRoundTrip = inputEquals('#criteo_search_mode', 'ROUND_TRIP');
  const hasOneAdult = inputEquals('#criteo_adult_count', '1');
  const hasEconomyClass = inputEquals('#criteo_boarding_class', 'eco');

  // The page title must also indicate a search-results page.
  const isSearchResultsPage = document.title.includes('Search Results');

  // All conditions must hold; Boolean() keeps the result type explicit.
  return Boolean(
    hasCorrectRoute &&
    hasCorrectDeparture &&
    hasCorrectArrival &&
    hasRoundTrip &&
    hasOneAdult &&
    hasEconomyClass &&
    isSearchResultsPage
  );
})()
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id: booking-001
name: Test Booking Search Functionality
description: Test Booking search functionality
description: Find hotels in Porto (Portugal) for the dates August 10 to August 17 for 2 adults
enabled: true
target:
url: https://www.booking.com
Expand Down
3 changes: 2 additions & 1 deletion evals/native/data/js-verifier/public/booking-002/task.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
id: booking-002
name: Test Booking Search Functionality
description: Test Booking search functionality
description: Find hotels in Madeira for the dates from June 1 to June 29 for 2 adults
and a 10-year-old child
enabled: true
target:
url: https://www.booking.com
Expand Down
32 changes: 32 additions & 0 deletions evals/native/data/js-verifier/public/delta-airlines/task.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
id: delta-airlines-flights-001
name: Test Delta Airlines Flights Search Functionality
description: Find a roundtrip flight from Chicago to Anchorage from April 10 to April
20 for 3 adults
enabled: true
target:
url: https://www.delta.com
wait_for: networkidle
wait_timeout: 5000
timeout: 45000
input:
objective: Find a roundtrip flight from Chicago to Anchorage from April 10 to April
20 for 3 adults
validation:
type: js-eval
js-eval:
script: verify.js
expected_result: true
timeout: 5000
metadata:
tags:
- flights
- form
- kayak
- action
- google-flights
priority: high
timeout: 45000
retries: 2
flaky: false
owner: devtools-team
tool: action_agent
33 changes: 33 additions & 0 deletions evals/native/data/js-verifier/public/delta-airlines/verify.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
(() => {
  // Validation for: Find a roundtrip flight from Chicago to Anchorage from
  // April 10 to April 20 for 3 adults.
  // Evaluates to a strict boolean (true only when every check passes).

  // Route: hidden inputs must hold Chicago (CHI) as origin and Anchorage (ANC)
  // as destination.
  const fromAirport = document.querySelector('input[name="fromAirportCode"]');
  const toAirport = document.querySelector('input[name="arrivalCity"]');
  const fromOk = fromAirport !== null && fromAirport.value === 'CHI';
  const toOk = toAirport !== null && toAirport.value === 'ANC';

  // Trip type: the header text must mention "round trip" (case-insensitive).
  const tripTypeHeader = document.querySelector('.triptype-header');
  const tripTypeText = tripTypeHeader ? tripTypeHeader.textContent.trim() : '';
  const roundTripOk = tripTypeText.toLowerCase().includes('round trip');

  // Passengers: the header text must mention 3 passengers.
  const paxHeader = document.querySelector('.paxcount-header');
  const paxText = paxHeader ? paxHeader.textContent : '';
  const passengersOk = paxText.includes('3 Passenger');

  // Results page: a search-results container must be present.
  const searchResults = document.querySelector('.search-results, .mach-search-results');
  const resultsPageOk = searchResults !== null;

  // Dates: both the departure (April 10) and the return (April 20) must appear
  // in the visible page text. Checking only the departure would accept a
  // search with the wrong return date.
  const pageText = document.body.innerText;
  const mentionsDay = (day) =>
    pageText.includes(`Apr ${day}`) || pageText.includes(`April ${day}`);
  const datesOk = mentionsDay(10) && mentionsDay(20);

  // All checks must pass
  return Boolean(fromOk && toOk && roundTripOk && passengersOk && resultsPageOk && datesOk);
})()
2 changes: 1 addition & 1 deletion evals/native/data/js-verifier/public/kayak/task.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
id: kayak-001
name: Test Kayak Flights Search Functionality
description: Test Kayak Flights search functionality
description: Find one-way flight options from Austin to Paris on May 20
enabled: true
target:
url: https://www.kayak.com/flights
Expand Down