From 823c9243d89dfbd8cab456ae62524fbad37a0708 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Wed, 10 Dec 2025 15:52:41 -0600 Subject: [PATCH 1/2] Extra new eval tests created --- evals/eval_builder_snapshots.py | 72 ++++++++++++++++--- .../js-verifier/public/ana-airlines/task.yaml | 32 +++++++++ .../js-verifier/public/ana-airlines/verify.js | 26 +++++++ .../public/{booking => booking-001}/task.yaml | 2 +- .../public/{booking => booking-001}/verify.js | 0 .../js-verifier/public/booking-002/task.yaml | 3 +- .../public/delta-airlines/task.yaml | 32 +++++++++ .../public/delta-airlines/verify.js | 33 +++++++++ .../data/js-verifier/public/kayak/task.yaml | 2 +- 9 files changed, 188 insertions(+), 14 deletions(-) create mode 100644 evals/native/data/js-verifier/public/ana-airlines/task.yaml create mode 100644 evals/native/data/js-verifier/public/ana-airlines/verify.js rename evals/native/data/js-verifier/public/{booking => booking-001}/task.yaml (85%) rename evals/native/data/js-verifier/public/{booking => booking-001}/verify.js (100%) create mode 100644 evals/native/data/js-verifier/public/delta-airlines/task.yaml create mode 100644 evals/native/data/js-verifier/public/delta-airlines/verify.js diff --git a/evals/eval_builder_snapshots.py b/evals/eval_builder_snapshots.py index 48dedee..3e37a84 100755 --- a/evals/eval_builder_snapshots.py +++ b/evals/eval_builder_snapshots.py @@ -386,6 +386,30 @@ async def step_3_open_browser(self): print("ā³ Waiting for page to load...") await asyncio.sleep(3) print("āœ… Page loaded") + print(" šŸ’” This tab will be used for capturing BEFORE/AFTER snapshots") + + except requests.exceptions.RequestException as e: + print(f"āŒ Browser error: {e}") + print(" Make sure BrowserOperator is running at http://localhost:8080") + sys.exit(1) + + async def _get_browser_client(self): + """Get browser client without opening a tab. Used in extend mode.""" + print("\nšŸ” Getting browser client...\n") + + try: + resp = requests.get(f"{self.api_base}/clients", timeout=5) + resp.raise_for_status() + clients = resp.json() + + if not clients: + print("āŒ No browser clients. Is BrowserOperator running?") + print(" Start it: cd deployments/local && make compose-up") + sys.exit(1) + + self.client_id = clients[0]['id'] + print(f"āœ… Client: {self.client_id}") + print(" šŸ’” Tabs will be opened on-demand for each example") except requests.exceptions.RequestException as e: print(f"āŒ Browser error: {e}") @@ -1257,7 +1281,7 @@ async def run_extend(self): """Extend workflow for refining existing verify.js with additional examples.""" print("\nšŸ”„ Extend Mode: Refine verify.js with additional examples\n") print("This workflow:") - print("1. Opens browser to target URL") + print("1. Connects to browser (tabs opened per-example)") print("2. You add positive/negative examples") print("3. Each example tests current verify.js") print("4. If wrong, Claude Code adjusts the script") @@ -1269,25 +1293,51 @@ async def run_extend(self): # Load existing task.yaml await self.step_1_load_file() - # Open browser to target URL - await self.step_3_open_browser() + # Get browser client (without opening a tab yet) + await self._get_browser_client() # Capture or load baseline snapshot if self.example_manager.index['baseline']: - print("šŸ“‚ Loading existing baseline snapshot...") + print("\nšŸ“‚ Loading existing baseline snapshot...") baseline_snapshot = self.example_manager.get_baseline_snapshot() print(f"āœ… Loaded baseline from previous session") else: - print("šŸ“ø Capturing baseline snapshot (initial page state)...") - baseline_snapshot = self._capture_dom_snapshot("BASELINE") - if baseline_snapshot: - self.example_manager.save_baseline(self.client_id, self.tab_id, baseline_snapshot) - print(f"āœ… Baseline saved") - else: - print("āŒ Failed to capture baseline snapshot") + # Need to open a temporary tab to capture baseline + print("\nšŸ“ø Capturing baseline snapshot (initial page state)...") + print(" Opening temporary tab for baseline capture...") + url = self.eval_data['target']['url'] + try: + resp = requests.post( + f"{self.api_base}/tabs/open", + json={"clientId": self.client_id, "url": url, "background": False}, + timeout=10 + ) + resp.raise_for_status() + result = resp.json() + baseline_tab_id = result['tabId'] + print(f" āœ… Baseline tab opened: {baseline_tab_id}") + await asyncio.sleep(3) # Wait for page load + + # Capture baseline using the temporary tab + self.tab_id = baseline_tab_id # Temporarily set for _capture_dom_snapshot + baseline_snapshot = self._capture_dom_snapshot("BASELINE") + if baseline_snapshot: + self.example_manager.save_baseline(self.client_id, baseline_tab_id, baseline_snapshot) + print(f"āœ… Baseline saved") + else: + print("āŒ Failed to capture baseline snapshot") + return + self.tab_id = None # Clear since we don't need it + + except requests.exceptions.RequestException as e: + print(f"āŒ Failed to open baseline tab: {e}") return # Main loop for adding examples + print("\n" + "=" * 60) + print("šŸ’” Ready to add examples. Each example opens a fresh tab.") + print("=" * 60) + while True: result = await self._add_example_interactive(baseline_snapshot) if not result: diff --git a/evals/native/data/js-verifier/public/ana-airlines/task.yaml b/evals/native/data/js-verifier/public/ana-airlines/task.yaml new file mode 100644 index 0000000..b43c1f0 --- /dev/null +++ b/evals/native/data/js-verifier/public/ana-airlines/task.yaml @@ -0,0 +1,32 @@ +id: ana-airlines-flights-001 +name: Test ANA Airlines Flights Search Functionality +description: Find a roundtrip flight from Singapore to Tokio from April 10 to April + 20 for 1 adult +enabled: true +target: + url: https://www.ana.co.jp/en/us/ + wait_for: networkidle + wait_timeout: 5000 +timeout: 45000 +input: + objective: Find a roundtrip flight from Singapore to Tokio from April 10 to April + 20 for 1 adult +validation: + type: js-eval + js-eval: + script: verify.js + expected_result: true + timeout: 5000 +metadata: + tags: + - flights + - form + - kayak + - action + - google-flights + priority: high + timeout: 45000 + retries: 2 + flaky: false + owner: devtools-team +tool: action_agent diff --git a/evals/native/data/js-verifier/public/ana-airlines/verify.js b/evals/native/data/js-verifier/public/ana-airlines/verify.js new file mode 100644 index 0000000..182c5f4 --- /dev/null +++ b/evals/native/data/js-verifier/public/ana-airlines/verify.js @@ -0,0 +1,26 @@ +// Validation for: Find a roundtrip flight from Singapore to Tokyo from April 10 to April 20 for 1 adult +// This checks for the presence of flight search results with the correct parameters + +(() => { + // Check for the search results page by verifying hidden inputs with flight search details + const segmentCode = document.querySelector('#criteo_segment_code'); + const departureDate = document.querySelector('#criteo_departure_date'); + const arrivalDate = document.querySelector('#criteo_arrival_date'); + const searchMode = document.querySelector('#criteo_search_mode'); + const adultCount = document.querySelector('#criteo_adult_count'); + const boardingClass = document.querySelector('#criteo_boarding_class'); + + // Verify all required elements exist and have correct values + const hasCorrectRoute = segmentCode && segmentCode.value === 'SIN_TYO'; + const hasCorrectDeparture = departureDate && departureDate.value === '20260410'; + const hasCorrectArrival = arrivalDate && arrivalDate.value === '20260420'; + const hasRoundTrip = searchMode && searchMode.value === 'ROUND_TRIP'; + const hasOneAdult = adultCount && adultCount.value === '1'; + const hasEconomyClass = boardingClass && boardingClass.value === 'eco'; + + // Also verify the page title indicates search results + const isSearchResultsPage = document.title.includes('Search Results'); + + // All conditions must be true - using return since this is an IIFE + return hasCorrectRoute && hasCorrectDeparture && hasCorrectArrival && hasRoundTrip && hasOneAdult && hasEconomyClass && isSearchResultsPage; +})() \ No newline at end of file diff --git a/evals/native/data/js-verifier/public/booking/task.yaml b/evals/native/data/js-verifier/public/booking-001/task.yaml similarity index 85% rename from evals/native/data/js-verifier/public/booking/task.yaml rename to evals/native/data/js-verifier/public/booking-001/task.yaml index 3780a5d..0278102 100644 --- a/evals/native/data/js-verifier/public/booking/task.yaml +++ b/evals/native/data/js-verifier/public/booking-001/task.yaml @@ -1,6 +1,6 @@ id: booking-001 name: Test Booking Search Functionality -description: Test Booking search functionality +description: Find hotels in Portu (Portugal) for the dates August 10 to August 17 for 2 adults enabled: true target: url: https://www.booking.com diff --git a/evals/native/data/js-verifier/public/booking/verify.js b/evals/native/data/js-verifier/public/booking-001/verify.js similarity index 100% rename from evals/native/data/js-verifier/public/booking/verify.js rename to evals/native/data/js-verifier/public/booking-001/verify.js diff --git a/evals/native/data/js-verifier/public/booking-002/task.yaml b/evals/native/data/js-verifier/public/booking-002/task.yaml index f5ace7a..da6e85d 100644 --- a/evals/native/data/js-verifier/public/booking-002/task.yaml +++ b/evals/native/data/js-verifier/public/booking-002/task.yaml @@ -1,6 +1,7 @@ id: booking-002 name: Test Booking Search Functionality -description: Test Booking search functionality +description: Find hotels in Madeira for the dates from June 1 to June 29 for 2 adults + and a 10 years old child enabled: true target: url: https://www.booking.com diff --git a/evals/native/data/js-verifier/public/delta-airlines/task.yaml b/evals/native/data/js-verifier/public/delta-airlines/task.yaml new file mode 100644 index 0000000..8dab5db --- /dev/null +++ b/evals/native/data/js-verifier/public/delta-airlines/task.yaml @@ -0,0 +1,32 @@ +id: delta-airlines-flights-001 +name: Test Delta Airlines Flights Search Functionality +description: Find a roundtrip flight from Chicago to Anchorage from April 10 to April + 20 for 3 adults +enabled: true +target: + url: https://www.delta.com + wait_for: networkidle + wait_timeout: 5000 +timeout: 45000 +input: + objective: Find a roundtrip flight from Chicago to Anchorage from April 10 to April + 20 for 3 adults +validation: + type: js-eval + js-eval: + script: verify.js + expected_result: true + timeout: 5000 +metadata: + tags: + - flights + - form + - kayak + - action + - google-flights + priority: high + timeout: 45000 + retries: 2 + flaky: false + owner: devtools-team +tool: action_agent diff --git a/evals/native/data/js-verifier/public/delta-airlines/verify.js b/evals/native/data/js-verifier/public/delta-airlines/verify.js new file mode 100644 index 0000000..b501cd3 --- /dev/null +++ b/evals/native/data/js-verifier/public/delta-airlines/verify.js @@ -0,0 +1,33 @@ +(() => { + // Validation for: Find a roundtrip flight from Chicago to Anchorage from April 10 to April 20 for 3 adults + + // Check hidden inputs for route information + const fromAirport = document.querySelector('input[name="fromAirportCode"]'); + const toAirport = document.querySelector('input[name="arrivalCity"]'); + + // Route must be Chicago (CHI) to Anchorage (ANC) + const fromOk = fromAirport && fromAirport.value === 'CHI'; + const toOk = toAirport && toAirport.value === 'ANC'; + + // Check for round trip indicator in header + const tripTypeHeader = document.querySelector('.triptype-header'); + const tripTypeText = tripTypeHeader ? tripTypeHeader.textContent.trim() : ''; + const roundTripOk = tripTypeText.toLowerCase().includes('round trip'); + + // Check for 3 passengers + const paxHeader = document.querySelector('.paxcount-header'); + const paxText = paxHeader ? paxHeader.textContent : ''; + const passengersOk = paxText.includes('3 Passenger'); + + // Check that we're on the search results page + const searchResults = document.querySelector('.search-results, .mach-search-results'); + const resultsPageOk = searchResults !== null; + + // Check for correct dates (April 10-20) + const pageText = document.body.innerText; + const hasApril10 = pageText.includes('Apr 10') || pageText.includes('April 10'); + const datesOk = hasApril10; + + // All checks must pass + return Boolean(fromOk && toOk && roundTripOk && passengersOk && resultsPageOk && datesOk); +})() diff --git a/evals/native/data/js-verifier/public/kayak/task.yaml b/evals/native/data/js-verifier/public/kayak/task.yaml index 36a6bc8..9601cb2 100644 --- a/evals/native/data/js-verifier/public/kayak/task.yaml +++ b/evals/native/data/js-verifier/public/kayak/task.yaml @@ -1,6 +1,6 @@ id: kayak-001 name: Test Kayak Flights Search Functionality -description: Test Kayak Flights search functionality +description: Find one way flight options from Austin to Paris at May 20 enabled: true target: url: https://www.kayak.com/flights From 3ec24865efd0dea12a24ee3f30eb6d191df910b2 Mon Sep 17 00:00:00 2001 From: Oleh Luchkiv Date: Wed, 10 Dec 2025 15:53:47 -0600 Subject: [PATCH 2/2] Added Cerebras config --- evals/config.yml | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/evals/config.yml b/evals/config.yml index aa9ac8b..48b5211 100644 --- a/evals/config.yml +++ b/evals/config.yml @@ -1,6 +1,6 @@ # Evaluation Framework Configuration # This configuration is shared across all evaluation runner scripts -# Supports OpenAI, Groq, OpenRouter, and LiteLLM providers +# Example configuration for Cerebras models # API endpoint for the evaluation server api_endpoint: "http://localhost:8080" @@ -9,28 +9,27 @@ api_endpoint: "http://localhost:8080" # These models are sent to the agent for processing requests main_model: - provider: "openai" - model_name: "gpt-5-mini" - api_key: "${OPENAI_API_KEY}" + provider: "cerebras" + model_name: "zai-glm-4.6" + api_key: "${CEREBRAS_API_KEY}" mini_model: - provider: "openai" - model_name: "gpt-5-nano" - api_key: "${OPENAI_API_KEY}" + provider: "cerebras" + model_name: "zai-glm-4.6" + api_key: "${CEREBRAS_API_KEY}" nano_model: - provider: "openai" - model_name: "gpt-5-nano" - api_key: "${OPENAI_API_KEY}" + provider: "cerebras" + model_name: "zai-glm-4.6" + api_key: "${CEREBRAS_API_KEY}" # Model configuration for judging evaluation responses # This model is used locally to assess the quality of agent responses judge_model: - provider: "openai" - model_name: "gpt-5" - api_key: "${OPENAI_API_KEY}" - + provider: "cerebras" + model_name: "zai-glm-4.6" + api_key: "${CEREBRAS_API_KEY}" # Execution settings