Skip to content

Commit 5fc6106

Browse files
authored
Merge pull request #12 from BrowserOperator/feat/new-evals
Extra new eval tests created
2 parents 06b613b + 3ec2486 commit 5fc6106

File tree

10 files changed

+201
-28
lines changed

10 files changed

+201
-28
lines changed

evals/config.yml

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Evaluation Framework Configuration
22
# This configuration is shared across all evaluation runner scripts
3-
# Supports OpenAI, Groq, OpenRouter, and LiteLLM providers
3+
# Example configuration for Cerebras models
44

55
# API endpoint for the evaluation server
66
api_endpoint: "http://localhost:8080"
@@ -9,28 +9,27 @@ api_endpoint: "http://localhost:8080"
99
# These models are sent to the agent for processing requests
1010

1111
main_model:
12-
provider: "openai"
13-
model_name: "gpt-5-mini"
14-
api_key: "${OPENAI_API_KEY}"
12+
provider: "cerebras"
13+
model_name: "zai-glm-4.6"
14+
api_key: "${CEREBRAS_API_KEY}"
1515

1616
mini_model:
17-
provider: "openai"
18-
model_name: "gpt-5-nano"
19-
api_key: "${OPENAI_API_KEY}"
17+
provider: "cerebras"
18+
model_name: "zai-glm-4.6"
19+
api_key: "${CEREBRAS_API_KEY}"
2020

2121
nano_model:
22-
provider: "openai"
23-
model_name: "gpt-5-nano"
24-
api_key: "${OPENAI_API_KEY}"
22+
provider: "cerebras"
23+
model_name: "zai-glm-4.6"
24+
api_key: "${CEREBRAS_API_KEY}"
2525

2626
# Model configuration for judging evaluation responses
2727
# This model is used locally to assess the quality of agent responses
2828

2929
judge_model:
30-
provider: "openai"
31-
model_name: "gpt-5"
32-
api_key: "${OPENAI_API_KEY}"
33-
30+
provider: "cerebras"
31+
model_name: "zai-glm-4.6"
32+
api_key: "${CEREBRAS_API_KEY}"
3433

3534
# Execution settings
3635

evals/eval_builder_snapshots.py

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,30 @@ async def step_3_open_browser(self):
386386
print("⏳ Waiting for page to load...")
387387
await asyncio.sleep(3)
388388
print("✅ Page loaded")
389+
print(" 💡 This tab will be used for capturing BEFORE/AFTER snapshots")
390+
391+
except requests.exceptions.RequestException as e:
392+
print(f"❌ Browser error: {e}")
393+
print(" Make sure BrowserOperator is running at http://localhost:8080")
394+
sys.exit(1)
395+
396+
async def _get_browser_client(self):
397+
"""Get browser client without opening a tab. Used in extend mode."""
398+
print("\n🔍 Getting browser client...\n")
399+
400+
try:
401+
resp = requests.get(f"{self.api_base}/clients", timeout=5)
402+
resp.raise_for_status()
403+
clients = resp.json()
404+
405+
if not clients:
406+
print("❌ No browser clients. Is BrowserOperator running?")
407+
print(" Start it: cd deployments/local && make compose-up")
408+
sys.exit(1)
409+
410+
self.client_id = clients[0]['id']
411+
print(f"✅ Client: {self.client_id}")
412+
print(" 💡 Tabs will be opened on-demand for each example")
389413

390414
except requests.exceptions.RequestException as e:
391415
print(f"❌ Browser error: {e}")
@@ -1257,7 +1281,7 @@ async def run_extend(self):
12571281
"""Extend workflow for refining existing verify.js with additional examples."""
12581282
print("\n🔄 Extend Mode: Refine verify.js with additional examples\n")
12591283
print("This workflow:")
1260-
print("1. Opens browser to target URL")
1284+
print("1. Connects to browser (tabs opened per-example)")
12611285
print("2. You add positive/negative examples")
12621286
print("3. Each example tests current verify.js")
12631287
print("4. If wrong, Claude Code adjusts the script")
@@ -1269,25 +1293,51 @@ async def run_extend(self):
12691293
# Load existing task.yaml
12701294
await self.step_1_load_file()
12711295

1272-
# Open browser to target URL
1273-
await self.step_3_open_browser()
1296+
# Get browser client (without opening a tab yet)
1297+
await self._get_browser_client()
12741298

12751299
# Capture or load baseline snapshot
12761300
if self.example_manager.index['baseline']:
1277-
print("📂 Loading existing baseline snapshot...")
1301+
print("\n📂 Loading existing baseline snapshot...")
12781302
baseline_snapshot = self.example_manager.get_baseline_snapshot()
12791303
print(f"✅ Loaded baseline from previous session")
12801304
else:
1281-
print("📸 Capturing baseline snapshot (initial page state)...")
1282-
baseline_snapshot = self._capture_dom_snapshot("BASELINE")
1283-
if baseline_snapshot:
1284-
self.example_manager.save_baseline(self.client_id, self.tab_id, baseline_snapshot)
1285-
print(f"✅ Baseline saved")
1286-
else:
1287-
print("❌ Failed to capture baseline snapshot")
1305+
# Need to open a temporary tab to capture baseline
1306+
print("\n📸 Capturing baseline snapshot (initial page state)...")
1307+
print(" Opening temporary tab for baseline capture...")
1308+
url = self.eval_data['target']['url']
1309+
try:
1310+
resp = requests.post(
1311+
f"{self.api_base}/tabs/open",
1312+
json={"clientId": self.client_id, "url": url, "background": False},
1313+
timeout=10
1314+
)
1315+
resp.raise_for_status()
1316+
result = resp.json()
1317+
baseline_tab_id = result['tabId']
1318+
print(f" ✅ Baseline tab opened: {baseline_tab_id}")
1319+
await asyncio.sleep(3) # Wait for page load
1320+
1321+
# Capture baseline using the temporary tab
1322+
self.tab_id = baseline_tab_id # Temporarily set for _capture_dom_snapshot
1323+
baseline_snapshot = self._capture_dom_snapshot("BASELINE")
1324+
if baseline_snapshot:
1325+
self.example_manager.save_baseline(self.client_id, baseline_tab_id, baseline_snapshot)
1326+
print(f"✅ Baseline saved")
1327+
else:
1328+
print("❌ Failed to capture baseline snapshot")
1329+
return
1330+
self.tab_id = None # Clear since we don't need it
1331+
1332+
except requests.exceptions.RequestException as e:
1333+
print(f"❌ Failed to open baseline tab: {e}")
12881334
return
12891335

12901336
# Main loop for adding examples
1337+
print("\n" + "=" * 60)
1338+
print("💡 Ready to add examples. Each example opens a fresh tab.")
1339+
print("=" * 60)
1340+
12911341
while True:
12921342
result = await self._add_example_interactive(baseline_snapshot)
12931343
if not result:
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
id: ana-airlines-flights-001
2+
name: Test ANA Airlines Flights Search Functionality
3+
description: Find a roundtrip flight from Singapore to Tokyo from April 10 to April
4+
20 for 1 adult
5+
enabled: true
6+
target:
7+
url: https://www.ana.co.jp/en/us/
8+
wait_for: networkidle
9+
wait_timeout: 5000
10+
timeout: 45000
11+
input:
12+
objective: Find a roundtrip flight from Singapore to Tokyo from April 10 to April
13+
20 for 1 adult
14+
validation:
15+
type: js-eval
16+
js-eval:
17+
script: verify.js
18+
expected_result: true
19+
timeout: 5000
20+
metadata:
21+
tags:
22+
- flights
23+
- form
24+
- kayak
25+
- action
26+
- google-flights
27+
priority: high
28+
timeout: 45000
29+
retries: 2
30+
flaky: false
31+
owner: devtools-team
32+
tool: action_agent
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Validation for: Find a roundtrip flight from Singapore to Tokyo from April 10 to April 20 for 1 adult
2+
// This checks for the presence of flight search results with the correct parameters
3+
4+
(() => {
5+
// Check for the search results page by verifying hidden inputs with flight search details
6+
const segmentCode = document.querySelector('#criteo_segment_code');
7+
const departureDate = document.querySelector('#criteo_departure_date');
8+
const arrivalDate = document.querySelector('#criteo_arrival_date');
9+
const searchMode = document.querySelector('#criteo_search_mode');
10+
const adultCount = document.querySelector('#criteo_adult_count');
11+
const boardingClass = document.querySelector('#criteo_boarding_class');
12+
13+
// Verify all required elements exist and have correct values
14+
const hasCorrectRoute = segmentCode && segmentCode.value === 'SIN_TYO';
15+
const hasCorrectDeparture = departureDate && departureDate.value === '20260410';
16+
const hasCorrectArrival = arrivalDate && arrivalDate.value === '20260420';
17+
const hasRoundTrip = searchMode && searchMode.value === 'ROUND_TRIP';
18+
const hasOneAdult = adultCount && adultCount.value === '1';
19+
const hasEconomyClass = boardingClass && boardingClass.value === 'eco';
20+
21+
// Also verify the page title indicates search results
22+
const isSearchResultsPage = document.title.includes('Search Results');
23+
24+
// All conditions must be true - using return since this is an IIFE
25+
return hasCorrectRoute && hasCorrectDeparture && hasCorrectArrival && hasRoundTrip && hasOneAdult && hasEconomyClass && isSearchResultsPage;
26+
})()

evals/native/data/js-verifier/public/booking/task.yaml renamed to evals/native/data/js-verifier/public/booking-001/task.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
id: booking-001
22
name: Test Booking Search Functionality
3-
description: Test Booking search functionality
3+
description: Find hotels in Porto (Portugal) for the dates August 10 to August 17 for 2 adults
44
enabled: true
55
target:
66
url: https://www.booking.com

evals/native/data/js-verifier/public/booking/verify.js renamed to evals/native/data/js-verifier/public/booking-001/verify.js

File renamed without changes.

evals/native/data/js-verifier/public/booking-002/task.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
id: booking-002
22
name: Test Booking Search Functionality
3-
description: Test Booking search functionality
3+
description: Find hotels in Madeira for the dates from June 1 to June 29 for 2 adults
4+
and a 10-year-old child
45
enabled: true
56
target:
67
url: https://www.booking.com
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
id: delta-airlines-flights-001
2+
name: Test Delta Airlines Flights Search Functionality
3+
description: Find a roundtrip flight from Chicago to Anchorage from April 10 to April
4+
20 for 3 adults
5+
enabled: true
6+
target:
7+
url: https://www.delta.com
8+
wait_for: networkidle
9+
wait_timeout: 5000
10+
timeout: 45000
11+
input:
12+
objective: Find a roundtrip flight from Chicago to Anchorage from April 10 to April
13+
20 for 3 adults
14+
validation:
15+
type: js-eval
16+
js-eval:
17+
script: verify.js
18+
expected_result: true
19+
timeout: 5000
20+
metadata:
21+
tags:
22+
- flights
23+
- form
24+
- kayak
25+
- action
26+
- google-flights
27+
priority: high
28+
timeout: 45000
29+
retries: 2
30+
flaky: false
31+
owner: devtools-team
32+
tool: action_agent
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
(() => {
2+
// Validation for: Find a roundtrip flight from Chicago to Anchorage from April 10 to April 20 for 3 adults
3+
4+
// Check hidden inputs for route information
5+
const fromAirport = document.querySelector('input[name="fromAirportCode"]');
6+
const toAirport = document.querySelector('input[name="arrivalCity"]');
7+
8+
// Route must be Chicago (CHI) to Anchorage (ANC)
9+
const fromOk = fromAirport && fromAirport.value === 'CHI';
10+
const toOk = toAirport && toAirport.value === 'ANC';
11+
12+
// Check for round trip indicator in header
13+
const tripTypeHeader = document.querySelector('.triptype-header');
14+
const tripTypeText = tripTypeHeader ? tripTypeHeader.textContent.trim() : '';
15+
const roundTripOk = tripTypeText.toLowerCase().includes('round trip');
16+
17+
// Check for 3 passengers
18+
const paxHeader = document.querySelector('.paxcount-header');
19+
const paxText = paxHeader ? paxHeader.textContent : '';
20+
const passengersOk = paxText.includes('3 Passenger');
21+
22+
// Check that we're on the search results page
23+
const searchResults = document.querySelector('.search-results, .mach-search-results');
24+
const resultsPageOk = searchResults !== null;
25+
26+
// Check for the departure date (April 10); the April 20 return date is not verified here
27+
const pageText = document.body.innerText;
28+
const hasApril10 = pageText.includes('Apr 10') || pageText.includes('April 10');
29+
const datesOk = hasApril10;
30+
31+
// All checks must pass
32+
return Boolean(fromOk && toOk && roundTripOk && passengersOk && resultsPageOk && datesOk);
33+
})()

evals/native/data/js-verifier/public/kayak/task.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
id: kayak-001
22
name: Test Kayak Flights Search Functionality
3-
description: Test Kayak Flights search functionality
3+
description: Find one-way flight options from Austin to Paris on May 20
44
enabled: true
55
target:
66
url: https://www.kayak.com/flights

0 commit comments

Comments
 (0)