Commit 5fb2f0f

More automation for eval building
1 parent a92fe91 commit 5fb2f0f

File tree

20 files changed: +9330 -77 lines changed


evals/eval_builder_snapshots.py

Lines changed: 76 additions & 77 deletions
@@ -23,17 +23,47 @@
 import requests
 import time
 import subprocess
+from lxml.html.clean import Cleaner
 from pathlib import Path
 from typing import Dict, Any, Optional
 from difflib import unified_diff
 
 
+def filter_html_tags(html: str) -> str:
+    """
+    Clean HTML using lxml.html.Cleaner.
+    Removes scripts, styles, and unsafe attributes while preserving DOM structure.
+
+    Args:
+        html: HTML string to clean
+
+    Returns:
+        Cleaned HTML string
+    """
+    cleaner = Cleaner(
+        scripts=True,          # drop <script> elements
+        javascript=True,       # remove on* event attributes (like onclick)
+        style=True,            # drop <style> blocks
+        inline_style=True,     # drop style="" attributes on tags
+        safe_attrs_only=True,  # remove any tag attributes not in a safe allowlist
+        frames=False,          # keep <iframe> elements (content already captured by API)
+        forms=False            # keep <form> elements
+    )
+    try:
+        cleaned_html = cleaner.clean_html(html)
+        return cleaned_html
+    except Exception as e:
+        print(f"⚠️ Warning: HTML cleaning failed ({e}), using original HTML")
+        return html
+
+
 class SnapshotBasedEvalBuilder:
     """Build eval files using before/after snapshots."""
 
-    def __init__(self, file_path: Optional[str] = None, workdir: Optional[str] = None):
+    def __init__(self, file_path: Optional[str] = None, workdir: Optional[str] = None, disable_filtering: bool = False):
         self.file_path = file_path
         self.workdir = workdir  # Working directory for snapshots and validation scripts
+        self.disable_filtering = disable_filtering  # If False, filter <style> and <script> tags
         self.eval_data: Dict[str, Any] = {}
         self.client_id: Optional[str] = None
         self.tab_id: Optional[str] = None
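
For readers who haven't used lxml's Cleaner, here is a minimal sketch of what the new `filter_html_tags` helper does to a snapshot. It is illustrative only: the sample fragment is made up, and on newer lxml releases the `lxml.html.clean` import may require the separate `lxml_html_clean` add-on package.

```python
# Illustrative only: the same Cleaner settings as filter_html_tags above, run on a made-up fragment.
from lxml.html.clean import Cleaner  # on recent lxml this may need the lxml_html_clean package

sample = '<div onclick="boom()"><script>alert(1)</script><p style="color:red">Hello</p></div>'
cleaner = Cleaner(scripts=True, javascript=True, style=True, inline_style=True,
                  safe_attrs_only=True, frames=False, forms=False)
print(cleaner.clean_html(sample))
# The <script> element, the onclick handler, and the style="" attribute are stripped,
# while the <div>/<p> structure and the text "Hello" survive, so snapshot diffs stay structural.
```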
@@ -294,9 +324,17 @@ async def step_7_generate_validation(self):
 
         print("🔍 Analyzing differences...")
 
-        # Find differences
-        before_lines = self.snapshot_before.split('\n')
-        after_lines = self.snapshot_after.split('\n')
+        # Apply filtering FIRST if enabled (default behavior)
+        before_content = self.snapshot_before
+        after_content = self.snapshot_after
+        if not self.disable_filtering:
+            print("🧹 Cleaning HTML with lxml.html.Cleaner...")
+            before_content = filter_html_tags(before_content)
+            after_content = filter_html_tags(after_content)
+
+        # Find differences from FILTERED content
+        before_lines = before_content.split('\n')
+        after_lines = after_content.split('\n')
 
         diff = list(unified_diff(
             before_lines,
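
A compressed view of the new flow in `step_7_generate_validation`: clean first, then diff the cleaned snapshots. Purely illustrative; `snapshot_before` and `snapshot_after` below are placeholders for the instance attributes used above.

```python
# Sketch: the unified diff is now computed over cleaned HTML, so churn inside
# <script>/<style> blocks and inline styles no longer shows up as spurious changes.
from difflib import unified_diff

before_content = filter_html_tags(snapshot_before)  # placeholder for self.snapshot_before
after_content = filter_html_tags(snapshot_after)    # placeholder for self.snapshot_after
diff = list(unified_diff(before_content.split('\n'), after_content.split('\n'), lineterm=''))
```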
@@ -320,10 +358,10 @@ async def step_7_generate_validation(self):
         diff_file = f"{snapshot_dir}/diff.txt"
 
         with open(before_file, 'w') as f:
-            f.write(self.snapshot_before)
+            f.write(before_content)
 
         with open(after_file, 'w') as f:
-            f.write(self.snapshot_after)
+            f.write(after_content)
 
         with open(diff_file, 'w') as f:
             f.write('\n'.join(diff))
@@ -332,6 +370,8 @@ async def step_7_generate_validation(self):
         print(f" BEFORE: {before_file}")
         print(f" AFTER: {after_file}")
         print(f" DIFF: {diff_file}")
+        if not self.disable_filtering:
+            print(" (Cleaned: removed scripts, styles, and unsafe attributes)")
 
         # Show sample of changes
         if added_lines:
@@ -508,14 +548,11 @@ async def step_7_generate_validation(self):
         # Wait for Claude to create the validation file
         validation_file = f"{snapshot_dir}/verify.js"
 
-        print("Options:")
-        print("1. Auto-run Claude Code subprocess (recommended)")
-        print("2. Wait for Claude Code manually (you run it)")
-        print("3. Enter validation JavaScript manually")
+        # Automatically run Claude Code subprocess (no user prompt)
+        print("🤖 Auto-running Claude Code subprocess to generate validation...")
         print()
 
-        choice = input("Choice (1/2/3): ").strip()
-
+        choice = '1'
         lines = []
 
         if choice == '1':
@@ -643,12 +680,13 @@ async def step_7_generate_validation(self):
         if lines:
             js_code = '\n'.join(lines)
 
-            # Test it with retry loop (unlimited retries until user cancels)
+            # Test it with retry loop (max 3 attempts)
             validation_saved = False
             retry_count = 0
+            max_retries = 3
 
-            while not validation_saved:
-                print(f"\n🧪 Testing validation... (attempt {retry_count + 1})")
+            while not validation_saved and retry_count < max_retries:
+                print(f"\n🧪 Testing validation... (attempt {retry_count + 1}/{max_retries})")
 
                 if await self._test_validation(js_code):
                     # Save validation JavaScript to external file
@@ -678,17 +716,17 @@ async def step_7_generate_validation(self):
                     print("✅ Validation saved")
                     validation_saved = True
                 else:
-                    # Test failed - allow unlimited retries
-                    print(f"\n⚠️ Validation test failed.")
-                    print("\nOptions:")
-                    print("1. Auto-run Claude Code to fix it (recommended)")
-                    print("2. Enter new validation manually")
-                    print("3. Save anyway (not recommended)")
-                    print("4. Skip validation for now")
+                    # Test failed - auto-retry with Claude Code
+                    retry_count += 1
 
-                    retry_choice = input("\nChoice (1/2/3/4): ").strip()
+                    if retry_count < max_retries:
+                        print(f"\n⚠️ Validation test failed. Auto-retrying with Claude Code... ({retry_count}/{max_retries})")
+                    else:
+                        print(f"\n❌ Validation failed after {max_retries} attempts. Skipping validation.")
+                        break
 
-                    if retry_choice == '1':
+                    # Auto-run Claude Code to fix (no user prompt)
+                    if retry_count < max_retries:
                         # Auto-run Claude Code subprocess to fix the validation
                         print(f"\n🤖 Launching Claude Code subprocess to fix validation...")
                         print()
@@ -738,71 +776,25 @@ async def step_7_generate_validation(self):
                                 print("─" * 60)
                                 print()
                                 print("🔄 Re-testing with updated code...")
-
-                                retry_count += 1
+                                # Continue to next iteration (retry_count already incremented)
                                 continue
                             else:
                                 print(f"⚠️ Claude Code ran but {validation_file} was not found")
-                                retry_count += 1
+                                # Skip to next retry
                                 continue
 
                         except subprocess.TimeoutExpired:
                             print("⏱️ Claude Code subprocess timed out (5 minutes)")
-                            retry_count += 1
+                            # Skip to next retry
                             continue
                         except FileNotFoundError:
                             print("❌ 'claude' command not found. Is Claude Code installed?")
-                            retry_count += 1
+                            # Skip to next retry
                             continue
                         except Exception as e:
                             print(f"❌ Error running Claude Code: {e}")
-                            retry_count += 1
+                            # Skip to next retry
                             continue
-
-                    elif retry_choice == '2':
-                        # Manual entry
-                        print("\nEnter validation JavaScript (type 'END' on new line when done):\n")
-                        new_lines = []
-                        while True:
-                            line = input()
-                            if line.strip() == 'END':
-                                break
-                            new_lines.append(line)
-                        js_code = '\n'.join(new_lines)
-                        retry_count += 1
-                        continue
-
-                    elif retry_choice == '3':
-                        # Save anyway
-                        # Save validation JavaScript to external file
-                        eval_dir = os.path.dirname(self.file_path)
-                        verify_js_path = os.path.join(eval_dir, 'verify.js')
-
-                        # Ensure eval directory exists
-                        os.makedirs(eval_dir, exist_ok=True)
-
-                        # Write JavaScript to external file
-                        with open(verify_js_path, 'w') as f:
-                            f.write(js_code)
-
-                        print(f"💾 Saved validation script to: {verify_js_path}")
-
-                        # Reference external file in YAML
-                        if 'validation' not in self.eval_data:
-                            self.eval_data['validation'] = {}
-                        if 'type' not in self.eval_data['validation']:
-                            self.eval_data['validation']['type'] = 'js-eval'
-                        if 'js-eval' not in self.eval_data['validation']:
-                            self.eval_data['validation']['js-eval'] = {}
-                        self.eval_data['validation']['js-eval']['script'] = 'verify.js'
-                        self.eval_data['validation']['js-eval']['expected_result'] = True
-                        self.eval_data['validation']['js-eval']['timeout'] = 5000
-                        print("⚠️ Validation saved (with errors - use caution!)")
-                        validation_saved = True
-
-                    else: # Choice 4 or anything else
-                        print("⏭️ Skipping validation")
-                        break
         else:
             print("⚠️ No validation code entered")
 
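
Taken together, the hunks above replace the interactive retry menu with a bounded, fully automatic loop; the hard-coded `choice = '1'` earlier simply keeps the existing `if choice == '1':` branch intact. Below is a condensed sketch of the new control flow, with hypothetical helper names standing in for the real method calls and subprocess handling.

```python
# Condensed illustration of the reworked loop: test, let Claude Code attempt a fix,
# and give up after max_retries failed attempts instead of prompting the user.
validation_saved = False
retry_count = 0
max_retries = 3

while not validation_saved and retry_count < max_retries:
    if test_validation(js_code):              # stands in for: await self._test_validation(js_code)
        save_validation(js_code)              # stands in for: write verify.js and update the YAML
        validation_saved = True
    else:
        retry_count += 1
        if retry_count >= max_retries:
            print(f"❌ Validation failed after {max_retries} attempts. Skipping validation.")
            break
        js_code = fix_with_claude_code(js_code)   # stands in for: the Claude Code subprocess call
```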
@@ -928,6 +920,7 @@ async def main():
     parser = argparse.ArgumentParser(description="Snapshot-based eval builder")
     parser.add_argument('--file', '-f', help='Eval file path (default: <workdir>/task.yaml)')
     parser.add_argument('--workdir', '-w', required=True, help='Working directory for snapshots and validation scripts')
+    parser.add_argument('--disable-filtering', action='store_true', help='Disable HTML cleaning (keep raw HTML with scripts/styles)')
     args = parser.parse_args()
 
     # Normalize workdir path (strip 'evals/' prefix if present and we're already in evals/)
@@ -939,18 +932,24 @@
     # Strip trailing slashes to avoid double slashes in paths
     workdir = workdir.rstrip('/')
 
-    # Auto-detect task.yaml in workdir if no file specified
+    # Auto-detect task.yaml or task.yml in workdir if no file specified
     file_path = args.file
     if not file_path:
+        # Check for both .yaml and .yml extensions
         task_yaml_path = os.path.join(workdir, 'task.yaml')
+        task_yml_path = os.path.join(workdir, 'task.yml')
+
         if os.path.exists(task_yaml_path):
             file_path = task_yaml_path
             print(f"📋 Found existing task.yaml: {file_path}")
+        elif os.path.exists(task_yml_path):
+            file_path = task_yml_path
+            print(f"📋 Found existing task.yml: {file_path}")
         else:
             file_path = task_yaml_path  # Will be created as new file
             print(f"📝 Will create new task.yaml: {file_path}")
 
-    builder = SnapshotBasedEvalBuilder(file_path=file_path, workdir=workdir)
+    builder = SnapshotBasedEvalBuilder(file_path=file_path, workdir=workdir, disable_filtering=args.disable_filtering)
     await builder.run()
 
 
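Since main() now threads --disable-filtering straight into the constructor, the builder can also be driven programmatically. A minimal sketch, assuming the script is importable from the evals/ directory and that the workdir (path borrowed from the prompt file below) already holds a task.yaml:

```python
# Hypothetical driver; the module import and paths are assumptions,
# the constructor signature comes from the diff above.
import asyncio
from eval_builder_snapshots import SnapshotBasedEvalBuilder

builder = SnapshotBasedEvalBuilder(
    file_path="native/data/js-verifier/action/dynamic/task.yaml",   # assumed location
    workdir="native/data/js-verifier/action/dynamic",
    disable_filtering=False,  # default: clean HTML before diffing; True keeps raw snapshots
)
asyncio.run(builder.run())
```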
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
+# Claude Code: Generate Validation JavaScript
+
+## Objective
+Click the "Start" button to trigger dynamic content loading
+
+## Task
+Analyze the BEFORE and AFTER snapshots and generate JavaScript validation code.
+
+## Files to Analyze
+- BEFORE: native/data/js-verifier/action/dynamic/before.html
+- AFTER: native/data/js-verifier/action/dynamic/after.html
+- DIFF: native/data/js-verifier/action/dynamic/diff.txt
+
+## Instructions
+1. Read the BEFORE and AFTER HTML files
+2. Read the DIFF file to see what actually changed
+3. Identify the specific DOM changes that indicate the objective was completed
+4. Generate JavaScript code that:
+   - Checks if the objective was completed successfully
+   - **CRITICAL: DO NOT use `return` statements - end with a boolean expression**
+   - Is based on ACTUAL observed changes (not assumptions)
+   - Works in the browser context
+
+## CRITICAL: Output Format
+
+**DO NOT USE RETURN STATEMENTS!** The code is evaluated as an expression, not a function.
+
+❌ WRONG:
+```javascript
+return document.querySelector('#success') !== null;
+```
+
+✅ CORRECT:
+```javascript
+// Check for the specific change
+const element = document.querySelector('...');
+element && element.value === 'expected'
+```
+
+The last line should be a boolean expression (no return keyword).
+
+## Testing Your Code
+
+**YOU MUST TEST YOUR CODE ON BOTH TABS** before declaring it complete.
+
+**Endpoint:** POST http://localhost:8080/page/execute
+
+**Browser State Information:**
+- **Client ID:** 4af9762d-c210-4c82-b5ff-a261c1167c04
+- **Tab ID (AFTER - task completed):** DA823E525F671B595F4C7D13BCBB461B
+- **Tab ID (BEFORE - initial state):** 72717EC9480BA1610DA260E917C14E63
+
+**Test 1: AFTER Tab (Should Return TRUE):**
+```bash
+curl -X POST http://localhost:8080/page/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "clientId": "4af9762d-c210-4c82-b5ff-a261c1167c04",
+    "tabId": "DA823E525F671B595F4C7D13BCBB461B",
+    "expression": "YOUR_JAVASCRIPT_CODE_HERE",
+    "returnByValue": true,
+    "awaitPromise": false
+  }'
+```
+
+**Expected Response:** `{"result": {"value": true}}`
+
+**Test 2: BEFORE Tab (Should Return FALSE):**
+```bash
+curl -X POST http://localhost:8080/page/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "clientId": "4af9762d-c210-4c82-b5ff-a261c1167c04",
+    "tabId": "72717EC9480BA1610DA260E917C14E63",
+    "expression": "YOUR_JAVASCRIPT_CODE_HERE",
+    "returnByValue": true,
+    "awaitPromise": false
+  }'
+```
+
+**Expected Response:** `{"result": {"value": false}}`
+
+**CRITICAL:** Your validation MUST:
+- Return TRUE on the AFTER tab (task completed)
+- Return FALSE on the BEFORE tab (task not done)
+- This proves your validation correctly detects the change
+
+**Error Response:**
+```json
+{
+  "exceptionDetails": { "text": "Error message here" }
+}
+```
+
+## Workflow
+
+1. Write validation code to: native/data/js-verifier/action/dynamic/verify.js
+2. Test it on the AFTER tab (should return TRUE)
+3. Test it on the BEFORE tab (should return FALSE)
+4. If you get errors or wrong results:
+   - Read the existing native/data/js-verifier/action/dynamic/verify.js
+   - Identify the issue from the API error response
+   - Edit and fix the file
+   - Save the improved version
+   - Test BOTH tabs again
+5. Iterate until:
+   - AFTER tab returns {"result": {"value": true}}
+   - BEFORE tab returns {"result": {"value": false}}
+6. Only then is your code complete
+
+## Save Your Response
+When you generate WORKING validation JavaScript (tested via API), save it to:
+native/data/js-verifier/action/dynamic/verify.js
+
+The orchestrator will automatically pick it up and test it again for confirmation.
+
+**IMPORTANT:** The file will NOT be deleted between iterations. You can read it,
+learn from previous attempts, and improve it iteratively.
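
The two-tab check that this prompt asks Claude Code to run with curl can also be scripted. A minimal sketch using the requests library; the endpoint, IDs, paths, and request fields are copied from the prompt above, and the response is assumed to match the documented {"result": {"value": ...}} shape:

```python
# Replays the prompt's curl tests against both tabs via Python requests.
import requests

EXECUTE_URL = "http://localhost:8080/page/execute"
CLIENT_ID = "4af9762d-c210-4c82-b5ff-a261c1167c04"
TABS = {
    "AFTER (expect True)": "DA823E525F671B595F4C7D13BCBB461B",
    "BEFORE (expect False)": "72717EC9480BA1610DA260E917C14E63",
}

with open("native/data/js-verifier/action/dynamic/verify.js") as f:
    expression = f.read()

for label, tab_id in TABS.items():
    resp = requests.post(EXECUTE_URL, json={
        "clientId": CLIENT_ID,
        "tabId": tab_id,
        "expression": expression,
        "returnByValue": True,
        "awaitPromise": False,
    })
    value = resp.json().get("result", {}).get("value")
    print(f"{label}: {value}")
```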
