diff --git a/app/services/ai/simple_workflows/evaluation_openrouter_v2.py b/app/services/ai/simple_workflows/evaluation_openrouter_v2.py index 1a127f40..a70f4be9 100644 --- a/app/services/ai/simple_workflows/evaluation_openrouter_v2.py +++ b/app/services/ai/simple_workflows/evaluation_openrouter_v2.py @@ -527,7 +527,7 @@ async def evaluate_proposal_openrouter( model: Optional[str] = None, temperature: Optional[float] = None, reasoning: Optional[bool] = None, -) -> Optional[EvaluationOutput]: +) -> Optional[Dict[str, Any]]: """ Evaluate a proposal using OpenRouter and Grok prompts. @@ -537,7 +537,7 @@ async def evaluate_proposal_openrouter( temperature: Generation temperature. Returns: - Parsed EvaluationOutput or None if evaluation fails. + Dict with evaluation output and captured prompts, or None if evaluation fails. """ try: # parse the uuid @@ -706,7 +706,12 @@ async def evaluate_proposal_openrouter( logger.info(f"Successfully evaluated proposal {proposal_id}") - return evaluation_output + return { + "evaluation_output": evaluation_output.model_dump(), + "full_system_prompt": system_prompt, + "full_user_prompt": formatted_user_content, + "full_messages": messages, + } except json.JSONDecodeError as e: logger.error(f"JSON decode error: {e}") diff --git a/eval_viewer_v2.html b/eval_viewer_v2.html index 51ed6f95..01196c25 100644 --- a/eval_viewer_v2.html +++ b/eval_viewer_v2.html @@ -258,6 +258,9 @@

           expected_decision: item.expected_decision || null,
           mismatch: false,
           usage: { input_tokens: "0", output_tokens: "0", est_cost: "$0.000000" },
+          full_system_prompt: item.full_system_prompt || "N/A",
+          full_user_prompt: item.full_user_prompt || "N/A",
+          full_messages: item.full_messages || [],
         };
       }

@@ -285,6 +288,9 @@

             output_tokens: evalOut.usage_output_tokens || "0",
             est_cost: evalOut.usage_est_cost || "$0.000000",
           },
+          full_system_prompt: item.full_system_prompt || "N/A",
+          full_user_prompt: item.full_user_prompt || "N/A",
+          full_messages: item.full_messages || [],
         };
       });

@@ -692,6 +698,25 @@

       `;
       details.appendChild(usageInfo);

+      // Prompt info (mirroring original viewer)
+      const promptInfo = document.createElement("div");
+      promptInfo.className = "space-y-2 mb-4";
+      promptInfo.innerHTML = `
+        <div>
+          Full System Prompt
+          ${escapeHtml(prop.full_system_prompt)}
+        </div>
+        <div>
+          Full User Prompt
+          ${escapeHtml(prop.full_user_prompt)}
+        </div>
+        <div>
+          Full LLM Messages
+          ${escapeHtml(JSON.stringify(prop.full_messages, null, 2))}
+        </div>
+      `;
+      details.appendChild(promptInfo);
+
       // Categories table
       details.innerHTML += `
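For downstream callers, the net effect of the evaluation_openrouter_v2.py and viewer changes above is a new result shape: evaluate_proposal_openrouter now returns a plain dict carrying the dumped EvaluationOutput plus the captured prompts. A minimal caller sketch, assuming only the signature and import path shown in these diffs; the proposal UUID and printed fields are illustrative:

# Minimal sketch of consuming the new dict return value of
# evaluate_proposal_openrouter. Imports and field names follow the diffs
# above; the proposal UUID below is hypothetical.
import asyncio
from uuid import UUID

from app.services.ai.simple_workflows.evaluation_openrouter_v2 import (
    EvaluationOutput,
    evaluate_proposal_openrouter,
)


async def main() -> None:
    result = await evaluate_proposal_openrouter(
        UUID("00000000-0000-0000-0000-000000000000"),  # hypothetical proposal id
        model=None,        # fall back to the workflow's default model
        temperature=0.7,
        reasoning=True,
    )
    if not result:
        print("evaluation failed")
        return

    # "evaluation_output" is already model_dump()-ed; re-validate it when a
    # typed object is needed, as test_proposal_evaluation_v3.py does.
    evaluation = EvaluationOutput(**result["evaluation_output"])

    # The captured prompts ride alongside the parsed output for eval tooling.
    print(len(result["full_system_prompt"]), "chars of system prompt")
    print(len(result["full_messages"]), "messages sent to the LLM")
    print(evaluation)


if __name__ == "__main__":
    asyncio.run(main())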
diff --git a/scripts/generate_evals_manifest.py b/scripts/generate_evals_manifest.py index 8b0bf57d..3b94608f 100644 --- a/scripts/generate_evals_manifest.py +++ b/scripts/generate_evals_manifest.py @@ -1,29 +1,44 @@ #!/usr/bin/env python3 + +import os + """ Utility script to generate or update evals-manifest.json based on contents of ./evals/. Scans for files matching *_summary.json and creates a manifest with path and name (timestamp). """ import json -import os from datetime import datetime +import re + def generate_manifest(evals_dir="./evals", manifest_path="./evals/evals-manifest.json"): - """Generate manifest from JSON files in evals_dir.""" + """Generate manifest from JSON files in evals_dir matching new pattern.""" + ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + evals_dir = os.path.join(ROOT_DIR, evals_dir) + manifest_path = os.path.join(ROOT_DIR, manifest_path) manifest = [] + timestamp_pattern = re.compile( + r"^(\d{8}_\d{6})_summary\.json$" + ) # Matches YYYYMMDD_HHMMSS_summary.json + for filename in os.listdir(evals_dir): - if filename.endswith("_summary.json"): - timestamp_str = filename.split("_")[0] # Extract YYYYMMDD_HHMMSS + match = timestamp_pattern.match(filename) + if match: + timestamp_str = match.group(1) # e.g., 20251118_160840 try: - timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S") - name = timestamp.strftime("%Y-%m-%d %H:%M:%S") + # Parse YYYYMMDD_HHMMSS + dt = datetime.strptime(timestamp_str.replace("_", ""), "%Y%m%d%H%M%S") + name = dt.strftime("%Y-%m-%d %H:%M:%S") # Display format except ValueError: name = filename manifest.append({"path": f"./evals/{filename}", "name": name}) - # Sort by timestamp descending - manifest.sort(key=lambda x: x["name"], reverse=True) + # Sort by parsed datetime descending + manifest.sort( + key=lambda x: datetime.strptime(x["name"], "%Y-%m-%d %H:%M:%S"), reverse=True + ) os.makedirs(os.path.dirname(manifest_path), exist_ok=True) with open(manifest_path, "w") as f: diff --git a/scripts/test_evaluation_openrouter_v2.py b/scripts/test_evaluation_openrouter_v2.py index 3c8aa65f..a96f17f1 100644 --- a/scripts/test_evaluation_openrouter_v2.py +++ b/scripts/test_evaluation_openrouter_v2.py @@ -15,21 +15,14 @@ from datetime import datetime from typing import Optional, List, Dict, Any -from urllib.parse import urlparse from uuid import UUID # Add the parent directory to the path to import from app sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from app.backend.factory import backend -from app.backend.models import ContractStatus, ProposalFilter from app.config import config -from app.services.ai.simple_workflows.evaluation_openrouter_v1 import ( - format_proposals_for_context_v2, -) -from app.services.ai.simple_workflows.prompts.evaluation_grok import ( - EVALUATION_GROK_SYSTEM_PROMPT, - EVALUATION_GROK_USER_PROMPT_TEMPLATE, +from app.services.ai.simple_workflows.evaluation_openrouter_v2 import ( + evaluate_proposal_openrouter, ) @@ -105,482 +98,57 @@ async def test_evaluation( if model: print(f"Using model from args: {model}") - # get the proposal from the backend - proposal = backend.get_proposal(proposal_uuid) - if not proposal: - print(f"❌ Proposal {proposal_id} not found in database") - return - - print(f"Found proposal: {proposal_id}") - - # extract proposal into a consistent object - proposal_info_for_evaluation = { - "proposal_number": proposal.proposal_id, - "title": proposal.title, - # "content": proposal.content, # includes metadata/tags - "summary": 
proposal.summary, # just post and ref link - "created_at_timestamp": proposal.created_at, - "created_at_btc_block": proposal.created_btc, - "executable_at_btc_block": proposal.exec_start, - "x_url": proposal.x_url, - # "tweet_id": proposal.tweet_id, # used internally - # "tags": proposal.tags, # noise at this point? - "tx_sender": proposal.tx_sender, - } - - for key, value in proposal_info_for_evaluation.items(): - print(f" {key}: {value[:80] if isinstance(value, str) else value}") - - # fetch dao info from proposal info - print("\n" + "=" * 80) - dao = None - dao_info_for_evaluation = None - if proposal.dao_id: - dao = backend.get_dao(proposal.dao_id) - - if not dao: - print(f"❌ DAO {proposal.dao_id} not found for proposal {proposal_id}") - return - - print(f"Found related DAO: {dao.name} (ID: {dao.id})") - - dao_info_for_evaluation = { - "dao_id": dao.id, - "name": dao.name, - "mission": dao.mission, - } - - for key, value in dao_info_for_evaluation.items(): - print(f" {key}: {value[:80] if isinstance(value, str) else value}") - - # fetch tweet info from DB - print("\n" + "=" * 80) - tweet_info_for_evaluation = None - - if proposal.tweet_id: - tweet_content = backend.get_x_tweet(proposal.tweet_id) - print(f"Fetched tweet info for tweet ID: {proposal.tweet_id}") - else: - tweet_content = None - print("No tweet ID associated with this proposal") - - if tweet_content: - tweet_info_for_evaluation = { - # "message": tweet_content.message, # already in summary - # "author_id": tweet_content.author_id, # local to our DB - "x_post_id": tweet_content.tweet_id, - # "conversation_id": tweet_content.conversation_id, # verify used? - "images": tweet_content.images, - "author_name": tweet_content.author_name, - "author_username": tweet_content.author_username, - "created_at": tweet_content.created_at_twitter, - "public_metrics": tweet_content.public_metrics, - # "entities": tweet_content.entities, - # "attachments": tweet_content.attachments, - "quoted_tweet_id": tweet_content.quoted_tweet_id, - "in_reply_to_user_id": tweet_content.in_reply_to_user_id, - "replied_to_tweet_id": tweet_content.replied_to_tweet_id, - } - - for key, value in (tweet_info_for_evaluation or {}).items(): - print(f" {key}: {value[:100] if isinstance(value, str) else value}") - - # fetch tweet author info from db (if present) - print("\n" + "=" * 80) - tweet_author_info_for_evaluation = None - - if tweet_content and tweet_content.author_id: - tweet_author_id = tweet_content.author_id - tweet_author_content = backend.get_x_user(tweet_author_id) - print(f"Fetched tweet author info for user ID: {tweet_author_id}") - - if tweet_author_content: - tweet_author_info_for_evaluation = { - "user_id": tweet_author_content.user_id, - "name": tweet_author_content.name, - "username": tweet_author_content.username, - "description": tweet_author_content.description, - "verified": tweet_author_content.verified, - "verified_type": tweet_author_content.verified_type, - "location": tweet_author_content.location, - } - - for key, value in tweet_author_info_for_evaluation.items(): - print( - f" {key}: {value[:100] if isinstance(value, str) else value}" - ) - else: - print("❌ Could not fetch tweet author content") - - # fetch quoted tweet from db (if present) - print("\n" + "=" * 80) - quote_tweet_info_for_evaluation = None - - if tweet_content and tweet_content.quoted_tweet_db_id: - quoted_tweet_id = tweet_content.quoted_tweet_db_id - quoted_tweet_content = backend.get_x_tweet(quoted_tweet_id) - print(f"Fetched quoted tweet info for tweet ID: 
{quoted_tweet_id}") - - if quoted_tweet_content: - quote_tweet_info_for_evaluation = { - "x_post_id": quoted_tweet_content.tweet_id, - # "conversation_id": quoted_tweet_content.conversation_id, - "images": quoted_tweet_content.images, - "author_name": quoted_tweet_content.author_name, - "author_username": quoted_tweet_content.author_username, - "created_at": quoted_tweet_content.created_at_twitter, - "public_metrics": quoted_tweet_content.public_metrics, - } - - for key, value in quote_tweet_info_for_evaluation.items(): - print( - f" {key}: {value[:100] if isinstance(value, str) else value}" - ) - else: - print("❌ Could not fetch quoted tweet content") - else: - print("No quoted tweet associated with this tweet") - - # fetch replied-to tweet from db (if present) - print("\n" + "=" * 80) - reply_tweet_info_for_evaluation = None - - if tweet_content and tweet_content.replied_to_tweet_db_id: - replied_to_tweet_id = tweet_content.replied_to_tweet_db_id - replied_to_tweet_content = backend.get_x_tweet(replied_to_tweet_id) - print(f"Fetched replied-to tweet info for tweet ID: {replied_to_tweet_id}") - - if replied_to_tweet_content: - reply_tweet_info_for_evaluation = { - "x_post_id": replied_to_tweet_content.tweet_id, - # "conversation_id": replied_to_tweet_content.conversation_id, - "images": replied_to_tweet_content.images, - "author_name": replied_to_tweet_content.author_name, - "author_username": replied_to_tweet_content.author_username, - "created_at": replied_to_tweet_content.created_at_twitter, - "public_metrics": replied_to_tweet_content.public_metrics, - } - - for key, value in reply_tweet_info_for_evaluation.items(): - print( - f" {key}: {value[:100] if isinstance(value, str) else value}" - ) - else: - print("❌ Could not fetch replied-to tweet content") - else: - print("No replied-to tweet associated with this tweet") - - # fetch past proposals for context - print("\n" + "=" * 80) - dao_past_proposals_categorized = None - dao_past_proposals_stats_for_evaluation = None - dao_draft_proposals_for_evaluation = None - dao_deployed_proposals_for_evaluation = None - - # get all proposals for the dao - dao_proposals = backend.list_proposals(ProposalFilter(dao_id=proposal.dao_id)) - # exclude the current proposal - dao_proposals = [p for p in dao_proposals if p.id != proposal.id] - print( - f"Fetched {len(dao_proposals)} past proposals for DAO ID: {proposal.dao_id}" - ) - - # print all proposals - # for p in dao_proposals: - # print( - # f" Proposal ID: {p.id}, Title: {p.title[:50] if isinstance(p.title, str) else p.title}..." 
- # ) - # print(p) - - # match past proposals by same tx_sender - user_past_proposals_for_evaluation = None - if proposal.tx_sender: - user_past_proposals = [ - p for p in dao_proposals if p.tx_sender == proposal.tx_sender - ] - user_past_proposals_for_evaluation = format_proposals_for_context_v2( - user_past_proposals - ) - print("\n" + "=" * 80) - print( - f"Found {len(user_past_proposals)} past proposals by same sender:\n{proposal.tx_sender}" - ) - - # remove tx-sender matched proposals from dao proposals - # if not present then default to full object - dao_past_proposals = [ - p - for p in dao_proposals - if user_past_proposals and p not in user_past_proposals - ] - - sorted_dao_past_proposals = sorted( - dao_past_proposals, - key=lambda p: getattr(p, "created_at", datetime.min), - reverse=True, + evaluation_result = await evaluate_proposal_openrouter( + proposal_uuid, model=model, temperature=0.7, reasoning=True ) - dao_past_proposals_categorized = { - "ALL": sorted_dao_past_proposals, - ContractStatus.DRAFT: [ - p for p in sorted_dao_past_proposals if p.status == ContractStatus.DRAFT - ], - ContractStatus.PENDING: [ - p - for p in sorted_dao_past_proposals - if p.status == ContractStatus.PENDING - ], - ContractStatus.DEPLOYED: [ - p - for p in sorted_dao_past_proposals - if p.status == ContractStatus.DEPLOYED - ], - ContractStatus.FAILED: [ - p - for p in sorted_dao_past_proposals - if p.status == ContractStatus.FAILED - ], - } - - dao_past_proposals_stats_for_evaluation = { - "ALL": len(sorted_dao_past_proposals), - ContractStatus.DRAFT: len( - dao_past_proposals_categorized[ContractStatus.DRAFT] - ), - ContractStatus.PENDING: len( - dao_past_proposals_categorized[ContractStatus.PENDING] - ), - ContractStatus.DEPLOYED: len( - dao_past_proposals_categorized[ContractStatus.DEPLOYED] - ), - ContractStatus.FAILED: len( - dao_past_proposals_categorized[ContractStatus.FAILED] - ), - } - - print("Stats:", dao_past_proposals_stats_for_evaluation) - - # limit to last 20 - dao_draft_proposals = dao_past_proposals_categorized[ContractStatus.DRAFT][:20] - - dao_draft_proposals_for_evaluation = format_proposals_for_context_v2( - dao_draft_proposals - ) - - print("\n" + "=" * 80) - print( - f"Using {len(dao_draft_proposals)} DAO draft proposals for evaluation context" - ) - - # limit to last 100 - dao_deployed_proposals = dao_past_proposals_categorized[ - ContractStatus.DEPLOYED - ][:100] - - dao_deployed_proposals_for_evaluation = format_proposals_for_context_v2( - dao_deployed_proposals - ) - - print("\n" + "=" * 80) - print( - f"Using {len(dao_deployed_proposals)} DAO deployed proposals for evaluation context" - ) - - # add images in format so grok will read them - # this should be appended to user chat object - print("\n" + "=" * 80) - images_for_evaluation = [] - - if tweet_content and tweet_content.images: - for img_url in tweet_content.images: - # Basic validation of URL - parsed_url = urlparse(img_url) - if parsed_url.scheme in ["http", "https"]: - images_for_evaluation.append( - { - "type": "image_url", - "image_url": {"url": img_url, "detail": "auto"}, - } - ) - else: - print(f"❌ Invalid image URL skipped: {img_url}") - - print(f"Prepared {len(images_for_evaluation)} images for AI evaluation") - - # determine prompt type based on DAO - print("\n" + "=" * 80) - system_prompt = EVALUATION_GROK_SYSTEM_PROMPT - user_prompt = EVALUATION_GROK_USER_PROMPT_TEMPLATE - - # fail if not found - if not system_prompt or not user_prompt: - print("❌ Could not load prompts for evaluation") + if not 
evaluation_result: + print("❌ Evaluation failed") return - # list of dict with str keys and values that are either str or list of dicts - # say that 10 times fast (using it based on format in Full LLM Messages in Eval)) - messages: list[dict[str, str | list[dict[str, str]]]] = [] - - system_content = system_prompt - - messages = [{"role": "system", "content": system_content}] - - formatted_user_content = user_prompt.format( - dao_info_for_evaluation=dao_info_for_evaluation, - proposal_content_for_evaluation=proposal_info_for_evaluation, - tweet_info_for_evaluation=tweet_info_for_evaluation, - tweet_author_info_for_evaluation=tweet_author_info_for_evaluation, - quote_tweet_info_for_evaluation=quote_tweet_info_for_evaluation, - reply_tweet_info_for_evaluation=reply_tweet_info_for_evaluation, - dao_past_proposals_stats_for_evaluation=dao_past_proposals_stats_for_evaluation, - user_past_proposals_for_evaluation=user_past_proposals_for_evaluation, - dao_draft_proposals_for_evaluation=dao_draft_proposals_for_evaluation, - dao_deployed_proposals_for_evaluation=dao_deployed_proposals_for_evaluation, - ) - - user_content = [{"type": "text", "text": formatted_user_content}] - - if len(images_for_evaluation) > 0: - for image in images_for_evaluation: - user_content.append(image) - - messages.append({"role": "user", "content": user_content}) - print("\n" + "=" * 80) - print("Final formatted messages to be sent to LLM:") - print(messages) - - # NEXT STEP: call OpenRouter with this data, see if it processes - # then we work backwards to how we want to implement it in the other areas - # also add these headers to any OpenRouter methods: - # 'default_headers': {'HTTP-Referer': 'https://aibtc.com', 'X-Title': 'AIBTC'} - - x_ai_tools = [{"type": "web_search"}, {"type": "x_search"}] - - openrouter_response = await call_openrouter( - messages=messages, model=model or None, temperature=0.7, tools=x_ai_tools - ) - - print("\n" + "=" * 80) - print("OpenRouter Full API Response:") - print(openrouter_response) - - print("\n" + "=" * 80) - print("OpenRouter Response Breakdown:") - - response_id = openrouter_response.get("id") - response_provider = openrouter_response.get("provider") - response_model = openrouter_response.get("model") - response_usage = openrouter_response.get("usage") - response_usage_total_tokens = ( - response_usage.get("total_tokens") if response_usage else None - ) + # print("Evaluation Result:") + # print(json.dumps(evaluation_result, indent=2)) - print(f"Response ID: {response_id}") - print(f"Response Provider: {response_provider}") - print(f"Response Model: {response_model}") - print(f"Response Usage: {response_usage_total_tokens} total tokens") - - response_choices = openrouter_response.get("choices", []) - - if len(response_choices) == 0: - print("❌ No choices returned in OpenRouter response") - return - - if len(response_choices) > 1: - print( - f"⚠️ Multiple choices returned ({len(response_choices)}), using the first one." 
- ) - - first_choice = response_choices[0] - - choice_finish_reason = first_choice.get("finish_reason") - choice_native_finish_reason = first_choice.get("native_finish_reason") - - print(f"Choice Finish Reason: {choice_finish_reason}") - print(f"Choice Native Finish Reason: {choice_native_finish_reason}") - - print("\n" + "=" * 80) - print("Parsing JSON from message content") - - choice_message = first_choice.get("message") - if not choice_message: - print("❌ No message found in the first choice") - return - - if not isinstance(choice_message, dict): - print("❌ Choice message is not a dictionary") - - choice_message_role = choice_message.get("role") - choice_message_content = choice_message.get("content") - choice_message_refusal = choice_message.get("refusal") - choice_message_reasoning = choice_message.get("reasoning") - choice_message_reasoning_details = choice_message.get("reasoning_details") - - if ( - choice_message_reasoning_details - and len(choice_message_reasoning_details) > 1 - ): - print( - f"⚠️ Multiple reasoning details returned ({len(choice_message_reasoning_details)}), using the first one." + if save_output: + output_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "evals", ) + os.makedirs(output_dir, exist_ok=True) + + # New naming: YYYYMMDD-HHMMSS_{short_proposal_id}_summary.json + now = datetime.now() + timestamp = now.strftime("%Y%m%d-%H%M%S") # e.g., 20251118-160840 + short_id = proposal_id[:8] # First 8 chars of UUID + output_filename = f"{timestamp}_{short_id}_summary.json" + output_path = os.path.join(output_dir, output_filename) + + evaluation_output = evaluation_result["evaluation_output"] + if hasattr(evaluation_output, "model_dump"): + evaluation_output_dumped = evaluation_output.model_dump() + else: + evaluation_output_dumped = evaluation_output # already dumped + + output_data = { + "timestamp": datetime.now().isoformat(), + "results": [ + { + "proposal_id": proposal_id, + "evaluation_output": evaluation_output_dumped, + "full_system_prompt": evaluation_result.get( + "full_system_prompt" + ), + "full_user_prompt": evaluation_result.get("full_user_prompt"), + "full_messages": evaluation_result.get("full_messages"), + } + ], + } - choice_annotations = choice_message.get("annotations") - - print(f"Choice Message Role: {choice_message_role}") - print(f"Choice Message Refusal: {choice_message_refusal}") - print(f"Choice Message Reasoning: {choice_message_reasoning}") - print( - f"Choice Annotations: {len(choice_annotations) if choice_annotations else 0}" - ) - choice_annotations_urls = [] - if choice_annotations: - for annotation in choice_annotations: - if annotation.get("type") == "url_citation": - url_citation = annotation.get("url_citation") - if url_citation: - url = url_citation.get("url") - if url: - choice_annotations_urls.append(url) - else: - print(f"Unknown annotation type: {annotation.get('type')}") - - if len(choice_annotations_urls) > 0: - print(" URLs cited in annotations:") - for url in choice_annotations_urls: - print(f" {url}") - - if not choice_message_content: - print("❌ No content found in the choice message") - - if not isinstance(choice_message_content, str): - print("❌ Choice message content is not a string") - - try: - choice_message_json = json.loads(choice_message_content) - print("\n" + "=" * 80) - print("Successfully parsed JSON from message content") - print(json.dumps(choice_message_json, indent=2)) - - if save_output: - # save to file - output_dir = os.path.join( - 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - "evals", - ) - os.makedirs(output_dir, exist_ok=True) - output_filename = f"evaluation_openrouter_{proposal_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" - output_path = os.path.join(output_dir, output_filename) - - with open(output_path, "w") as f: - json.dump(choice_message_json, f, indent=2) - - print(f"\nSaved evaluation output to: {output_path}") - except json.JSONDecodeError as e: - print(f"❌ JSON decoding error: {e}") - return + with open(output_path, "w") as f: + json.dump(output_data, f, indent=2) - return + print(f"\nSaved evaluation output to: {output_path}") except ValueError as e: print(f"❌ Invalid UUID format: {e}") diff --git a/scripts/test_proposal_evaluation_v3.py b/scripts/test_proposal_evaluation_v3.py index b37fa6b5..f6490d5d 100644 --- a/scripts/test_proposal_evaluation_v3.py +++ b/scripts/test_proposal_evaluation_v3.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 + +import os + """ CLI test script for proposal evaluations using the v3 strict workflow. @@ -14,7 +17,6 @@ import asyncio import json import logging -import os import sys from datetime import datetime from typing import Dict, Any @@ -24,10 +26,15 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app.lib.logger import StructuredFormatter, setup_uvicorn_logging -from app.services.ai.simple_workflows.orchestrator import evaluate_proposal_strict +from app.services.ai.simple_workflows.evaluation_openrouter_v2 import ( + evaluate_proposal_openrouter, + EvaluationOutput, +) from app.backend.factory import get_backend from scripts.generate_evals_manifest import generate_manifest +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + class Tee(object): def __init__(self, *files): @@ -84,7 +91,9 @@ async def evaluate_single_proposal( tee_stderr = original_stderr if args.save_output: prop_short_id = short_uuid(proposal_id) - log_filename = f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_log.txt" + log_filename = os.path.join( + ROOT_DIR, f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_log.txt" + ) log_f = open(log_filename, "w") tee_stdout = Tee(original_stdout, log_f) tee_stderr = Tee(original_stderr, log_f) @@ -112,7 +121,7 @@ async def evaluate_single_proposal( proposal_uuid = UUID(proposal_id) print(f"📋 Evaluating proposal {index}: {proposal_id}") - result = await evaluate_proposal_strict( + result = await evaluate_proposal_openrouter( proposal_id=proposal_uuid, model=args.model, temperature=args.temperature, @@ -121,7 +130,14 @@ async def evaluate_single_proposal( if not result: error_msg = f"Evaluation failed for proposal {proposal_id}" - print(result) + print(error_msg) + return {"proposal_id": proposal_id, "error": error_msg} + + # Validate EvaluationOutput + try: + EvaluationOutput(**result["evaluation_output"]) + except ValueError as ve: + error_msg = f"Validation failed for EvaluationOutput: {str(ve)}" print(error_msg) return {"proposal_id": proposal_id, "error": error_msg} @@ -135,13 +151,16 @@ async def evaluate_single_proposal( result_dict = { "proposal_id": proposal_id, "expected_decision": expected_dec, - "evaluation_output": result.model_dump(), # Raw as dict + "evaluation_output": result["evaluation_output"], # Raw as dict + "full_system_prompt": result.get("full_system_prompt", "N/A"), + "full_user_prompt": result.get("full_user_prompt", "N/A"), + "full_messages": result.get("full_messages", []), } # Save JSON if requested if args.save_output: - json_filename = ( - 
f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_raw.json" + json_filename = os.path.join( + ROOT_DIR, f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_raw.json" ) with open(json_filename, "w") as f: json.dump(result_dict, f, indent=2, default=str) @@ -180,7 +199,7 @@ def generate_summary( print("=" * 60) if save_output: - summary_json = f"evals/{timestamp}_summary.json" + summary_json = os.path.join(ROOT_DIR, f"evals/{timestamp}_summary.json") with open(summary_json, "w") as f: json.dump(summary, f, indent=2, default=str) print(f"✅ Summary saved to {summary_json}") @@ -264,7 +283,7 @@ def main(): timestamp = now.strftime("%Y%m%d_%H%M%S") if args.save_output: - os.makedirs("evals", exist_ok=True) + os.makedirs(os.path.join(ROOT_DIR, "evals"), exist_ok=True) print("🚀 Starting Proposal Evaluation Test V3") print("=" * 60)