Commit 5fb2f0f

More automation for eval building
1 parent a92fe91 commit 5fb2f0f

File tree

20 files changed: +9330 -77 lines changed


evals/eval_builder_snapshots.py

Lines changed: 76 additions & 77 deletions
@@ -23,17 +23,47 @@
 import requests
 import time
 import subprocess
+from lxml.html.clean import Cleaner
 from pathlib import Path
 from typing import Dict, Any, Optional
 from difflib import unified_diff
 
 
+def filter_html_tags(html: str) -> str:
+    """
+    Clean HTML using lxml.html.Cleaner.
+    Removes scripts, styles, and unsafe attributes while preserving DOM structure.
+
+    Args:
+        html: HTML string to clean
+
+    Returns:
+        Cleaned HTML string
+    """
+    cleaner = Cleaner(
+        scripts=True,          # drop <script> elements
+        javascript=True,       # remove on* event attributes (like onclick)
+        style=True,            # drop <style> blocks
+        inline_style=True,     # drop style="" attributes on tags
+        safe_attrs_only=True,  # remove any tag attributes not in a safe allowlist
+        frames=False,          # keep <iframe> elements (content already captured by API)
+        forms=False            # keep <form> elements
+    )
+    try:
+        cleaned_html = cleaner.clean_html(html)
+        return cleaned_html
+    except Exception as e:
+        print(f"⚠️ Warning: HTML cleaning failed ({e}), using original HTML")
+        return html
+
+
 class SnapshotBasedEvalBuilder:
     """Build eval files using before/after snapshots."""
 
-    def __init__(self, file_path: Optional[str] = None, workdir: Optional[str] = None):
+    def __init__(self, file_path: Optional[str] = None, workdir: Optional[str] = None, disable_filtering: bool = False):
         self.file_path = file_path
         self.workdir = workdir  # Working directory for snapshots and validation scripts
+        self.disable_filtering = disable_filtering  # If False, filter <style> and <script> tags
         self.eval_data: Dict[str, Any] = {}
         self.client_id: Optional[str] = None
         self.tab_id: Optional[str] = None
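
For readers who haven't used lxml's Cleaner, here is a minimal sketch of what the new `filter_html_tags` helper does to a snapshot. It is illustrative only: the sample fragment is made up, and on newer lxml releases the `lxml.html.clean` import may require the separate `lxml_html_clean` add-on package.

```python
# Illustrative only: the same Cleaner settings as filter_html_tags above, run on a made-up fragment.
from lxml.html.clean import Cleaner  # on recent lxml this may need the lxml_html_clean package

sample = '<div onclick="boom()"><script>alert(1)</script><p style="color:red">Hello</p></div>'
cleaner = Cleaner(scripts=True, javascript=True, style=True, inline_style=True,
                  safe_attrs_only=True, frames=False, forms=False)
print(cleaner.clean_html(sample))
# The <script> element, the onclick handler, and the style="" attribute are stripped,
# while the <div>/<p> structure and the text "Hello" survive, so snapshot diffs stay structural.
```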
@@ -294,9 +324,17 @@ async def step_7_generate_validation(self):
 
         print("🔍 Analyzing differences...")
 
-        # Find differences
-        before_lines = self.snapshot_before.split('\n')
-        after_lines = self.snapshot_after.split('\n')
+        # Apply filtering FIRST if enabled (default behavior)
+        before_content = self.snapshot_before
+        after_content = self.snapshot_after
+        if not self.disable_filtering:
+            print("🧹 Cleaning HTML with lxml.html.Cleaner...")
+            before_content = filter_html_tags(before_content)
+            after_content = filter_html_tags(after_content)
+
+        # Find differences from FILTERED content
+        before_lines = before_content.split('\n')
+        after_lines = after_content.split('\n')
 
         diff = list(unified_diff(
             before_lines,
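
A compressed view of the new flow in `step_7_generate_validation`: clean first, then diff the cleaned snapshots. Purely illustrative; `snapshot_before` and `snapshot_after` below are placeholders for the instance attributes used above.

```python
# Sketch: the unified diff is now computed over cleaned HTML, so churn inside
# <script>/<style> blocks and inline styles no longer shows up as spurious changes.
from difflib import unified_diff

before_content = filter_html_tags(snapshot_before)  # placeholder for self.snapshot_before
after_content = filter_html_tags(snapshot_after)    # placeholder for self.snapshot_after
diff = list(unified_diff(before_content.split('\n'), after_content.split('\n'), lineterm=''))
```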
@@ -320,10 +358,10 @@ async def step_7_generate_validation(self):
         diff_file = f"{snapshot_dir}/diff.txt"
 
         with open(before_file, 'w') as f:
-            f.write(self.snapshot_before)
+            f.write(before_content)
 
         with open(after_file, 'w') as f:
-            f.write(self.snapshot_after)
+            f.write(after_content)
 
         with open(diff_file, 'w') as f:
             f.write('\n'.join(diff))
@@ -332,6 +370,8 @@ async def step_7_generate_validation(self):
         print(f" BEFORE: {before_file}")
         print(f" AFTER: {after_file}")
         print(f" DIFF: {diff_file}")
+        if not self.disable_filtering:
+            print(" (Cleaned: removed scripts, styles, and unsafe attributes)")
 
         # Show sample of changes
         if added_lines:
@@ -508,14 +548,11 @@ async def step_7_generate_validation(self):
         # Wait for Claude to create the validation file
         validation_file = f"{snapshot_dir}/verify.js"
 
-        print("Options:")
-        print("1. Auto-run Claude Code subprocess (recommended)")
-        print("2. Wait for Claude Code manually (you run it)")
-        print("3. Enter validation JavaScript manually")
+        # Automatically run Claude Code subprocess (no user prompt)
+        print("🤖 Auto-running Claude Code subprocess to generate validation...")
         print()
 
-        choice = input("Choice (1/2/3): ").strip()
-
+        choice = '1'
         lines = []
 
         if choice == '1':
@@ -643,12 +680,13 @@ async def step_7_generate_validation(self):
         if lines:
             js_code = '\n'.join(lines)
 
-            # Test it with retry loop (unlimited retries until user cancels)
+            # Test it with retry loop (max 3 attempts)
             validation_saved = False
             retry_count = 0
+            max_retries = 3
 
-            while not validation_saved:
-                print(f"\n🧪 Testing validation... (attempt {retry_count + 1})")
+            while not validation_saved and retry_count < max_retries:
+                print(f"\n🧪 Testing validation... (attempt {retry_count + 1}/{max_retries})")
 
                 if await self._test_validation(js_code):
                     # Save validation JavaScript to external file
@@ -678,17 +716,17 @@ async def step_7_generate_validation(self):
                     print("✅ Validation saved")
                     validation_saved = True
                 else:
-                    # Test failed - allow unlimited retries
-                    print(f"\n⚠️ Validation test failed.")
-                    print("\nOptions:")
-                    print("1. Auto-run Claude Code to fix it (recommended)")
-                    print("2. Enter new validation manually")
-                    print("3. Save anyway (not recommended)")
-                    print("4. Skip validation for now")
+                    # Test failed - auto-retry with Claude Code
+                    retry_count += 1
 
-                    retry_choice = input("\nChoice (1/2/3/4): ").strip()
+                    if retry_count < max_retries:
+                        print(f"\n⚠️ Validation test failed. Auto-retrying with Claude Code... ({retry_count}/{max_retries})")
+                    else:
+                        print(f"\n❌ Validation failed after {max_retries} attempts. Skipping validation.")
+                        break
 
-                    if retry_choice == '1':
+                    # Auto-run Claude Code to fix (no user prompt)
+                    if retry_count < max_retries:
                         # Auto-run Claude Code subprocess to fix the validation
                         print(f"\n🤖 Launching Claude Code subprocess to fix validation...")
                         print()
@@ -738,71 +776,25 @@ async def step_7_generate_validation(self):
                                 print("─" * 60)
                                 print()
                                 print("🔄 Re-testing with updated code...")
-
-                                retry_count += 1
+                                # Continue to next iteration (retry_count already incremented)
                                 continue
                             else:
                                 print(f"⚠️ Claude Code ran but {validation_file} was not found")
-                                retry_count += 1
+                                # Skip to next retry
                                 continue
 
                         except subprocess.TimeoutExpired:
                             print("⏱️ Claude Code subprocess timed out (5 minutes)")
-                            retry_count += 1
+                            # Skip to next retry
                             continue
                         except FileNotFoundError:
                             print("❌ 'claude' command not found. Is Claude Code installed?")
-                            retry_count += 1
+                            # Skip to next retry
                             continue
                         except Exception as e:
                             print(f"❌ Error running Claude Code: {e}")
-                            retry_count += 1
+                            # Skip to next retry
                             continue
-
-                    elif retry_choice == '2':
-                        # Manual entry
-                        print("\nEnter validation JavaScript (type 'END' on new line when done):\n")
-                        new_lines = []
-                        while True:
-                            line = input()
-                            if line.strip() == 'END':
-                                break
-                            new_lines.append(line)
-                        js_code = '\n'.join(new_lines)
-                        retry_count += 1
-                        continue
-
-                    elif retry_choice == '3':
-                        # Save anyway
-                        # Save validation JavaScript to external file
-                        eval_dir = os.path.dirname(self.file_path)
-                        verify_js_path = os.path.join(eval_dir, 'verify.js')
-
-                        # Ensure eval directory exists
-                        os.makedirs(eval_dir, exist_ok=True)
-
-                        # Write JavaScript to external file
-                        with open(verify_js_path, 'w') as f:
-                            f.write(js_code)
-
-                        print(f"💾 Saved validation script to: {verify_js_path}")
-
-                        # Reference external file in YAML
-                        if 'validation' not in self.eval_data:
-                            self.eval_data['validation'] = {}
-                        if 'type' not in self.eval_data['validation']:
-                            self.eval_data['validation']['type'] = 'js-eval'
-                        if 'js-eval' not in self.eval_data['validation']:
-                            self.eval_data['validation']['js-eval'] = {}
-                        self.eval_data['validation']['js-eval']['script'] = 'verify.js'
-                        self.eval_data['validation']['js-eval']['expected_result'] = True
-                        self.eval_data['validation']['js-eval']['timeout'] = 5000
-                        print("⚠️ Validation saved (with errors - use caution!)")
-                        validation_saved = True
-
-                    else: # Choice 4 or anything else
-                        print("⏭️ Skipping validation")
-                        break
         else:
             print("⚠️ No validation code entered")
 
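
Taken together, the hunks above replace the interactive retry menu with a bounded, fully automatic loop; the hard-coded `choice = '1'` earlier simply keeps the existing `if choice == '1':` branch intact. Below is a condensed sketch of the new control flow, with hypothetical helper names standing in for the real method calls and subprocess handling.

```python
# Condensed illustration of the reworked loop: test, let Claude Code attempt a fix,
# and give up after max_retries failed attempts instead of prompting the user.
validation_saved = False
retry_count = 0
max_retries = 3

while not validation_saved and retry_count < max_retries:
    if test_validation(js_code):              # stands in for: await self._test_validation(js_code)
        save_validation(js_code)              # stands in for: write verify.js and update the YAML
        validation_saved = True
    else:
        retry_count += 1
        if retry_count >= max_retries:
            print(f"❌ Validation failed after {max_retries} attempts. Skipping validation.")
            break
        js_code = fix_with_claude_code(js_code)   # stands in for: the Claude Code subprocess call
```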
@@ -928,6 +920,7 @@ async def main():
     parser = argparse.ArgumentParser(description="Snapshot-based eval builder")
     parser.add_argument('--file', '-f', help='Eval file path (default: <workdir>/task.yaml)')
     parser.add_argument('--workdir', '-w', required=True, help='Working directory for snapshots and validation scripts')
+    parser.add_argument('--disable-filtering', action='store_true', help='Disable HTML cleaning (keep raw HTML with scripts/styles)')
     args = parser.parse_args()
 
     # Normalize workdir path (strip 'evals/' prefix if present and we're already in evals/)
@@ -939,18 +932,24 @@
     # Strip trailing slashes to avoid double slashes in paths
     workdir = workdir.rstrip('/')
 
-    # Auto-detect task.yaml in workdir if no file specified
+    # Auto-detect task.yaml or task.yml in workdir if no file specified
     file_path = args.file
     if not file_path:
+        # Check for both .yaml and .yml extensions
         task_yaml_path = os.path.join(workdir, 'task.yaml')
+        task_yml_path = os.path.join(workdir, 'task.yml')
+
         if os.path.exists(task_yaml_path):
             file_path = task_yaml_path
             print(f"📋 Found existing task.yaml: {file_path}")
+        elif os.path.exists(task_yml_path):
+            file_path = task_yml_path
+            print(f"📋 Found existing task.yml: {file_path}")
         else:
             file_path = task_yaml_path  # Will be created as new file
             print(f"📝 Will create new task.yaml: {file_path}")
 
-    builder = SnapshotBasedEvalBuilder(file_path=file_path, workdir=workdir)
+    builder = SnapshotBasedEvalBuilder(file_path=file_path, workdir=workdir, disable_filtering=args.disable_filtering)
     await builder.run()
 
 
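Since main() now threads --disable-filtering straight into the constructor, the builder can also be driven programmatically. A minimal sketch, assuming the script is importable from the evals/ directory and that the workdir (path borrowed from the prompt file below) already holds a task.yaml:

```python
# Hypothetical driver; the module import and paths are assumptions,
# the constructor signature comes from the diff above.
import asyncio
from eval_builder_snapshots import SnapshotBasedEvalBuilder

builder = SnapshotBasedEvalBuilder(
    file_path="native/data/js-verifier/action/dynamic/task.yaml",   # assumed location
    workdir="native/data/js-verifier/action/dynamic",
    disable_filtering=False,  # default: clean HTML before diffing; True keeps raw snapshots
)
asyncio.run(builder.run())
```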
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
+# Claude Code: Generate Validation JavaScript
+
+## Objective
+Click the "Start" button to trigger dynamic content loading
+
+## Task
+Analyze the BEFORE and AFTER snapshots and generate JavaScript validation code.
+
+## Files to Analyze
+- BEFORE: native/data/js-verifier/action/dynamic/before.html
+- AFTER: native/data/js-verifier/action/dynamic/after.html
+- DIFF: native/data/js-verifier/action/dynamic/diff.txt
+
+## Instructions
+1. Read the BEFORE and AFTER HTML files
+2. Read the DIFF file to see what actually changed
+3. Identify the specific DOM changes that indicate the objective was completed
+4. Generate JavaScript code that:
+   - Checks if the objective was completed successfully
+   - **CRITICAL: DO NOT use `return` statements - end with a boolean expression**
+   - Is based on ACTUAL observed changes (not assumptions)
+   - Works in the browser context
+
+## CRITICAL: Output Format
+
+**DO NOT USE RETURN STATEMENTS!** The code is evaluated as an expression, not a function.
+
+❌ WRONG:
+```javascript
+return document.querySelector('#success') !== null;
+```
+
+✅ CORRECT:
+```javascript
+// Check for the specific change
+const element = document.querySelector('...');
+element && element.value === 'expected'
+```
+
+The last line should be a boolean expression (no return keyword).
+
+## Testing Your Code
+
+**YOU MUST TEST YOUR CODE ON BOTH TABS** before declaring it complete.
+
+**Endpoint:** POST http://localhost:8080/page/execute
+
+**Browser State Information:**
+- **Client ID:** 4af9762d-c210-4c82-b5ff-a261c1167c04
+- **Tab ID (AFTER - task completed):** DA823E525F671B595F4C7D13BCBB461B
+- **Tab ID (BEFORE - initial state):** 72717EC9480BA1610DA260E917C14E63
+
+**Test 1: AFTER Tab (Should Return TRUE):**
+```bash
+curl -X POST http://localhost:8080/page/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "clientId": "4af9762d-c210-4c82-b5ff-a261c1167c04",
+    "tabId": "DA823E525F671B595F4C7D13BCBB461B",
+    "expression": "YOUR_JAVASCRIPT_CODE_HERE",
+    "returnByValue": true,
+    "awaitPromise": false
+  }'
+```
+
+**Expected Response:** `{"result": {"value": true}}`
+
+**Test 2: BEFORE Tab (Should Return FALSE):**
+```bash
+curl -X POST http://localhost:8080/page/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "clientId": "4af9762d-c210-4c82-b5ff-a261c1167c04",
+    "tabId": "72717EC9480BA1610DA260E917C14E63",
+    "expression": "YOUR_JAVASCRIPT_CODE_HERE",
+    "returnByValue": true,
+    "awaitPromise": false
+  }'
+```
+
+**Expected Response:** `{"result": {"value": false}}`
+
+**CRITICAL:** Your validation MUST:
+- Return TRUE on the AFTER tab (task completed)
+- Return FALSE on the BEFORE tab (task not done)
+- This proves your validation correctly detects the change
+
+**Error Response:**
+```json
+{
+  "exceptionDetails": { "text": "Error message here" }
+}
+```
+
+## Workflow
+
+1. Write validation code to: native/data/js-verifier/action/dynamic/verify.js
+2. Test it on the AFTER tab (should return TRUE)
+3. Test it on the BEFORE tab (should return FALSE)
+4. If you get errors or wrong results:
+   - Read the existing native/data/js-verifier/action/dynamic/verify.js
+   - Identify the issue from the API error response
+   - Edit and fix the file
+   - Save the improved version
+   - Test BOTH tabs again
+5. Iterate until:
+   - AFTER tab returns {"result": {"value": true}}
+   - BEFORE tab returns {"result": {"value": false}}
+6. Only then is your code complete
+
+## Save Your Response
+When you generate WORKING validation JavaScript (tested via API), save it to:
+native/data/js-verifier/action/dynamic/verify.js
+
+The orchestrator will automatically pick it up and test it again for confirmation.
+
+**IMPORTANT:** The file will NOT be deleted between iterations. You can read it,
+learn from previous attempts, and improve it iteratively.
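
The two-tab check that this prompt asks Claude Code to run with curl can also be scripted. A minimal sketch using the requests library; the endpoint, IDs, paths, and request fields are copied from the prompt above, and the response is assumed to match the documented {"result": {"value": ...}} shape:

```python
# Replays the prompt's curl tests against both tabs via Python requests.
import requests

EXECUTE_URL = "http://localhost:8080/page/execute"
CLIENT_ID = "4af9762d-c210-4c82-b5ff-a261c1167c04"
TABS = {
    "AFTER (expect True)": "DA823E525F671B595F4C7D13BCBB461B",
    "BEFORE (expect False)": "72717EC9480BA1610DA260E917C14E63",
}

with open("native/data/js-verifier/action/dynamic/verify.js") as f:
    expression = f.read()

for label, tab_id in TABS.items():
    resp = requests.post(EXECUTE_URL, json={
        "clientId": CLIENT_ID,
        "tabId": tab_id,
        "expression": expression,
        "returnByValue": True,
        "awaitPromise": False,
    })
    value = resp.json().get("result", {}).get("value")
    print(f"{label}: {value}")
```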
