Skip to content

Commit a92fe91

Browse files
committed
Better retries for eval builder
1 parent 5b83e83 commit a92fe91

File tree

1 file changed

+100
-116
lines changed

1 file changed

+100
-116
lines changed

evals/eval_builder_snapshots.py

Lines changed: 100 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -643,13 +643,12 @@ async def step_7_generate_validation(self):
643643
if lines:
644644
js_code = '\n'.join(lines)
645645

646-
# Test it with retry loop
646+
# Test it with retry loop (unlimited retries until user cancels)
647647
validation_saved = False
648-
max_retries = 3
649648
retry_count = 0
650649

651-
while retry_count <= max_retries and not validation_saved:
652-
print(f"\n🧪 Testing validation... (attempt {retry_count + 1}/{max_retries + 1})")
650+
while not validation_saved:
651+
print(f"\n🧪 Testing validation... (attempt {retry_count + 1})")
653652

654653
if await self._test_validation(js_code):
655654
# Save validation JavaScript to external file
@@ -667,67 +666,63 @@ async def step_7_generate_validation(self):
667666

668667
# Reference external file in YAML
669668
if 'validation' not in self.eval_data:
670-
self.eval_data['validation'] = {'type': 'js-eval', 'js-eval': {}}
669+
self.eval_data['validation'] = {}
670+
if 'type' not in self.eval_data['validation']:
671+
self.eval_data['validation']['type'] = 'js-eval'
672+
if 'js-eval' not in self.eval_data['validation']:
673+
self.eval_data['validation']['js-eval'] = {}
671674

672675
self.eval_data['validation']['js-eval']['script'] = 'verify.js'
673676
self.eval_data['validation']['js-eval']['expected_result'] = True
674677
self.eval_data['validation']['js-eval']['timeout'] = 5000
675678
print("✅ Validation saved")
676679
validation_saved = True
677680
else:
678-
# Test failed
679-
if retry_count < max_retries:
680-
print(f"\n⚠️ Validation test failed. You have {max_retries - retry_count} retries remaining.")
681-
print("\nOptions:")
682-
print("1. Let Claude Code fix it (updates verify.js)")
683-
print("2. Enter new validation manually")
684-
print("3. Save anyway (not recommended)")
685-
print("4. Skip validation for now")
686-
687-
retry_choice = input("\nChoice (1/2/3/4): ").strip()
688-
689-
if retry_choice == '1':
690-
# Wait for Claude Code to update the file
691-
print()
692-
print("=" * 80)
693-
print("🔧 CLAUDE CODE: Fix the Validation JavaScript")
694-
print("=" * 80)
695-
print()
696-
print("📋 API Testing Information:")
697-
print(f" Endpoint: POST http://localhost:8080/page/execute")
698-
print(f" Client ID: {self.client_id}")
699-
print(f" Tab ID (AFTER - task completed): {self.tab_id}")
700-
if self.tab_id_before:
701-
print(f" Tab ID (BEFORE - initial state): {self.tab_id_before}")
702-
print()
703-
print("📝 What to do in Claude Code:")
704-
print(f" 1. Edit and fix the JavaScript in: {validation_file}")
705-
print(" 2. Remove any 'return' statements")
706-
print(" 3. Test on BOTH tabs using /page/execute endpoint:")
707-
if self.tab_id_before:
708-
print(f" - AFTER tab ({self.tab_id}) → should return TRUE")
709-
print(f" - BEFORE tab ({self.tab_id_before}) → should return FALSE")
710-
else:
711-
print(f" - AFTER tab ({self.tab_id}) → should return TRUE")
712-
print(f" 4. Iterate until both tests pass correctly")
713-
print(f" 5. Save your fixes back to: {validation_file}")
714-
print()
715-
print(" 💡 Claude Code can read the existing file and improve it iteratively")
716-
if self.tab_id_before:
717-
print(" ⚠️ Validation MUST return TRUE on AFTER, FALSE on BEFORE")
718-
print()
719-
print("=" * 80)
720-
print()
721-
722-
input("Press Enter when Claude Code has updated the file...")
681+
# Test failed - allow unlimited retries
682+
print(f"\n⚠️ Validation test failed.")
683+
print("\nOptions:")
684+
print("1. Auto-run Claude Code to fix it (recommended)")
685+
print("2. Enter new validation manually")
686+
print("3. Save anyway (not recommended)")
687+
print("4. Skip validation for now")
688+
689+
retry_choice = input("\nChoice (1/2/3/4): ").strip()
690+
691+
if retry_choice == '1':
692+
# Auto-run Claude Code subprocess to fix the validation
693+
print(f"\n🤖 Launching Claude Code subprocess to fix validation...")
694+
print()
695+
696+
# Construct the prompt for Claude Code
697+
marker_file = f"{snapshot_dir}/CLAUDE_REQUEST.md"
698+
claude_prompt = f"Read @{marker_file} and fix the validation JavaScript in {validation_file}. The previous attempt failed - analyze the error and fix it. Test it on both tabs as instructed."
699+
700+
try:
701+
# Call Claude Code CLI with --dangerously-skip-permissions for auto-accept
702+
result = subprocess.run(
703+
['claude', '--dangerously-skip-permissions', claude_prompt],
704+
cwd=os.getcwd(),
705+
capture_output=True,
706+
text=True,
707+
timeout=300 # 5 minute timeout
708+
)
709+
710+
print("Claude Code output:")
711+
print("─" * 60)
712+
print(result.stdout)
713+
if result.stderr:
714+
print("Errors:")
715+
print(result.stderr)
716+
print("─" * 60)
723717
print()
724718

719+
# Check if verify.js was updated
725720
if os.path.exists(validation_file):
726721
print("✅ Updated file detected!")
727722
with open(validation_file, 'r') as f:
728723
js_code = f.read().strip()
729724

730-
# Clean up markdown if present
725+
# Clean up if it has markdown code blocks
731726
if js_code.startswith('```'):
732727
lines_raw = js_code.split('\n')
733728
if lines_raw[0].startswith('```'):
@@ -745,79 +740,68 @@ async def step_7_generate_validation(self):
745740
print("🔄 Re-testing with updated code...")
746741

747742
retry_count += 1
748-
# Continue to top of loop to re-test
749743
continue
750744
else:
751-
print("❌ File not found. Please try again.")
745+
print(f"⚠️ Claude Code ran but {validation_file} was not found")
752746
retry_count += 1
753747
continue
754748

755-
elif retry_choice == '2':
756-
# Manual entry
757-
print("\nEnter validation JavaScript (type 'END' on new line when done):\n")
758-
new_lines = []
759-
while True:
760-
line = input()
761-
if line.strip() == 'END':
762-
break
763-
new_lines.append(line)
764-
js_code = '\n'.join(new_lines)
749+
except subprocess.TimeoutExpired:
750+
print("⏱️ Claude Code subprocess timed out (5 minutes)")
751+
retry_count += 1
752+
continue
753+
except FileNotFoundError:
754+
print("❌ 'claude' command not found. Is Claude Code installed?")
755+
retry_count += 1
756+
continue
757+
except Exception as e:
758+
print(f"❌ Error running Claude Code: {e}")
765759
retry_count += 1
766760
continue
767761

768-
elif retry_choice == '3':
769-
# Save anyway
770-
# Save validation JavaScript to external file
771-
eval_dir = os.path.dirname(self.file_path)
772-
verify_js_path = os.path.join(eval_dir, 'verify.js')
773-
774-
# Ensure eval directory exists
775-
os.makedirs(eval_dir, exist_ok=True)
776-
777-
# Write JavaScript to external file
778-
with open(verify_js_path, 'w') as f:
779-
f.write(js_code)
780-
781-
print(f"💾 Saved validation script to: {verify_js_path}")
782-
783-
# Reference external file in YAML
784-
if 'validation' not in self.eval_data:
785-
self.eval_data['validation'] = {'type': 'js-eval', 'js-eval': {}}
786-
self.eval_data['validation']['js-eval']['script'] = 'verify.js'
787-
self.eval_data['validation']['js-eval']['expected_result'] = True
788-
self.eval_data['validation']['js-eval']['timeout'] = 5000
789-
print("⚠️ Validation saved (with errors - use caution!)")
790-
validation_saved = True
791-
792-
else: # Choice 4 or anything else
793-
print("⏭️ Skipping validation")
794-
break
795-
else:
796-
# Out of retries
797-
print(f"\n❌ Maximum retries ({max_retries}) reached.")
798-
save_anyway = input("Save validation anyway? (y/n): ").strip().lower()
799-
if save_anyway == 'y':
800-
# Save validation JavaScript to external file
801-
eval_dir = os.path.dirname(self.file_path)
802-
verify_js_path = os.path.join(eval_dir, 'verify.js')
803-
804-
# Ensure eval directory exists
805-
os.makedirs(eval_dir, exist_ok=True)
806-
807-
# Write JavaScript to external file
808-
with open(verify_js_path, 'w') as f:
809-
f.write(js_code)
810-
811-
print(f"💾 Saved validation script to: {verify_js_path}")
812-
813-
# Reference external file in YAML
814-
if 'validation' not in self.eval_data:
815-
self.eval_data['validation'] = {'type': 'js-eval', 'js-eval': {}}
816-
self.eval_data['validation']['js-eval']['script'] = 'verify.js'
817-
self.eval_data['validation']['js-eval']['expected_result'] = True
818-
self.eval_data['validation']['js-eval']['timeout'] = 5000
819-
print("⚠️ Validation saved (with errors)")
820-
validation_saved = True
762+
elif retry_choice == '2':
763+
# Manual entry
764+
print("\nEnter validation JavaScript (type 'END' on new line when done):\n")
765+
new_lines = []
766+
while True:
767+
line = input()
768+
if line.strip() == 'END':
769+
break
770+
new_lines.append(line)
771+
js_code = '\n'.join(new_lines)
772+
retry_count += 1
773+
continue
774+
775+
elif retry_choice == '3':
776+
# Save anyway
777+
# Save validation JavaScript to external file
778+
eval_dir = os.path.dirname(self.file_path)
779+
verify_js_path = os.path.join(eval_dir, 'verify.js')
780+
781+
# Ensure eval directory exists
782+
os.makedirs(eval_dir, exist_ok=True)
783+
784+
# Write JavaScript to external file
785+
with open(verify_js_path, 'w') as f:
786+
f.write(js_code)
787+
788+
print(f"💾 Saved validation script to: {verify_js_path}")
789+
790+
# Reference external file in YAML
791+
if 'validation' not in self.eval_data:
792+
self.eval_data['validation'] = {}
793+
if 'type' not in self.eval_data['validation']:
794+
self.eval_data['validation']['type'] = 'js-eval'
795+
if 'js-eval' not in self.eval_data['validation']:
796+
self.eval_data['validation']['js-eval'] = {}
797+
self.eval_data['validation']['js-eval']['script'] = 'verify.js'
798+
self.eval_data['validation']['js-eval']['expected_result'] = True
799+
self.eval_data['validation']['js-eval']['timeout'] = 5000
800+
print("⚠️ Validation saved (with errors - use caution!)")
801+
validation_saved = True
802+
803+
else: # Choice 4 or anything else
804+
print("⏭️ Skipping validation")
821805
break
822806
else:
823807
print("⚠️ No validation code entered")

0 commit comments

Comments
 (0)