diff --git a/app/services/ai/simple_workflows/evaluation_openrouter_v2.py b/app/services/ai/simple_workflows/evaluation_openrouter_v2.py index 1a127f40..a70f4be9 100644 --- a/app/services/ai/simple_workflows/evaluation_openrouter_v2.py +++ b/app/services/ai/simple_workflows/evaluation_openrouter_v2.py @@ -527,7 +527,7 @@ async def evaluate_proposal_openrouter( model: Optional[str] = None, temperature: Optional[float] = None, reasoning: Optional[bool] = None, -) -> Optional[EvaluationOutput]: +) -> Optional[Dict[str, Any]]: """ Evaluate a proposal using OpenRouter and Grok prompts. @@ -537,7 +537,7 @@ async def evaluate_proposal_openrouter( temperature: Generation temperature. Returns: - Parsed EvaluationOutput or None if evaluation fails. + Dict with evaluation output and captured prompts, or None if evaluation fails. """ try: # parse the uuid @@ -706,7 +706,12 @@ async def evaluate_proposal_openrouter( logger.info(f"Successfully evaluated proposal {proposal_id}") - return evaluation_output + return { + "evaluation_output": evaluation_output.model_dump(), + "full_system_prompt": system_prompt, + "full_user_prompt": formatted_user_content, + "full_messages": messages, + } except json.JSONDecodeError as e: logger.error(f"JSON decode error: {e}") diff --git a/eval_viewer_v2.html b/eval_viewer_v2.html index 51ed6f95..01196c25 100644 --- a/eval_viewer_v2.html +++ b/eval_viewer_v2.html @@ -258,6 +258,9 @@

           expected_decision: item.expected_decision || null,
           mismatch: false,
           usage: { input_tokens: "0", output_tokens: "0", est_cost: "$0.000000" },
+          full_system_prompt: item.full_system_prompt || "N/A",
+          full_user_prompt: item.full_user_prompt || "N/A",
+          full_messages: item.full_messages || [],
         };
       }

@@ -285,6 +288,9 @@

             output_tokens: evalOut.usage_output_tokens || "0",
             est_cost: evalOut.usage_est_cost || "$0.000000",
           },
+          full_system_prompt: item.full_system_prompt || "N/A",
+          full_user_prompt: item.full_user_prompt || "N/A",
+          full_messages: item.full_messages || [],
         };
       });

@@ -692,6 +698,25 @@

       `;
       details.appendChild(usageInfo);

+      // Prompt info (mirroring original viewer)
+      const promptInfo = document.createElement("div");
+      promptInfo.className = "space-y-2 mb-4";
+      promptInfo.innerHTML = `
+        <div>
+          Full System Prompt
+          ${escapeHtml(prop.full_system_prompt)}
+        </div>
+        <div>
+          Full User Prompt
+          ${escapeHtml(prop.full_user_prompt)}
+        </div>
+        <div>
+          Full LLM Messages
+          ${escapeHtml(JSON.stringify(prop.full_messages, null, 2))}
+        </div>
+      `;
+      details.appendChild(promptInfo);
+
       // Categories table
       details.innerHTML += `
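For downstream callers, the net effect of the evaluation_openrouter_v2.py and viewer changes above is a new result shape: evaluate_proposal_openrouter now returns a plain dict carrying the dumped EvaluationOutput plus the captured prompts. A minimal caller sketch, assuming only the signature and import path shown in these diffs; the proposal UUID and printed fields are illustrative:

# Minimal sketch of consuming the new dict return value of
# evaluate_proposal_openrouter. Imports and field names follow the diffs
# above; the proposal UUID below is hypothetical.
import asyncio
from uuid import UUID

from app.services.ai.simple_workflows.evaluation_openrouter_v2 import (
    EvaluationOutput,
    evaluate_proposal_openrouter,
)


async def main() -> None:
    result = await evaluate_proposal_openrouter(
        UUID("00000000-0000-0000-0000-000000000000"),  # hypothetical proposal id
        model=None,        # fall back to the workflow's default model
        temperature=0.7,
        reasoning=True,
    )
    if not result:
        print("evaluation failed")
        return

    # "evaluation_output" is already model_dump()-ed; re-validate it when a
    # typed object is needed, as test_proposal_evaluation_v3.py does.
    evaluation = EvaluationOutput(**result["evaluation_output"])

    # The captured prompts ride alongside the parsed output for eval tooling.
    print(len(result["full_system_prompt"]), "chars of system prompt")
    print(len(result["full_messages"]), "messages sent to the LLM")
    print(evaluation)


if __name__ == "__main__":
    asyncio.run(main())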
diff --git a/scripts/generate_evals_manifest.py b/scripts/generate_evals_manifest.py index 8b0bf57d..3b94608f 100644 --- a/scripts/generate_evals_manifest.py +++ b/scripts/generate_evals_manifest.py @@ -1,29 +1,44 @@ #!/usr/bin/env python3 + +import os + """ Utility script to generate or update evals-manifest.json based on contents of ./evals/. Scans for files matching *_summary.json and creates a manifest with path and name (timestamp). """ import json -import os from datetime import datetime +import re + def generate_manifest(evals_dir="./evals", manifest_path="./evals/evals-manifest.json"): - """Generate manifest from JSON files in evals_dir.""" + """Generate manifest from JSON files in evals_dir matching new pattern.""" + ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + evals_dir = os.path.join(ROOT_DIR, evals_dir) + manifest_path = os.path.join(ROOT_DIR, manifest_path) manifest = [] + timestamp_pattern = re.compile( + r"^(\d{8}_\d{6})_summary\.json$" + ) # Matches YYYYMMDD_HHMMSS_summary.json + for filename in os.listdir(evals_dir): - if filename.endswith("_summary.json"): - timestamp_str = filename.split("_")[0] # Extract YYYYMMDD_HHMMSS + match = timestamp_pattern.match(filename) + if match: + timestamp_str = match.group(1) # e.g., 20251118_160840 try: - timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S") - name = timestamp.strftime("%Y-%m-%d %H:%M:%S") + # Parse YYYYMMDD_HHMMSS + dt = datetime.strptime(timestamp_str.replace("_", ""), "%Y%m%d%H%M%S") + name = dt.strftime("%Y-%m-%d %H:%M:%S") # Display format except ValueError: name = filename manifest.append({"path": f"./evals/{filename}", "name": name}) - # Sort by timestamp descending - manifest.sort(key=lambda x: x["name"], reverse=True) + # Sort by parsed datetime descending + manifest.sort( + key=lambda x: datetime.strptime(x["name"], "%Y-%m-%d %H:%M:%S"), reverse=True + ) os.makedirs(os.path.dirname(manifest_path), exist_ok=True) with open(manifest_path, "w") as f: diff --git a/scripts/test_evaluation_openrouter_v2.py b/scripts/test_evaluation_openrouter_v2.py index 3c8aa65f..a96f17f1 100644 --- a/scripts/test_evaluation_openrouter_v2.py +++ b/scripts/test_evaluation_openrouter_v2.py @@ -15,21 +15,14 @@ from datetime import datetime from typing import Optional, List, Dict, Any -from urllib.parse import urlparse from uuid import UUID # Add the parent directory to the path to import from app sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from app.backend.factory import backend -from app.backend.models import ContractStatus, ProposalFilter from app.config import config -from app.services.ai.simple_workflows.evaluation_openrouter_v1 import ( - format_proposals_for_context_v2, -) -from app.services.ai.simple_workflows.prompts.evaluation_grok import ( - EVALUATION_GROK_SYSTEM_PROMPT, - EVALUATION_GROK_USER_PROMPT_TEMPLATE, +from app.services.ai.simple_workflows.evaluation_openrouter_v2 import ( + evaluate_proposal_openrouter, ) @@ -105,482 +98,57 @@ async def test_evaluation( if model: print(f"Using model from args: {model}") - # get the proposal from the backend - proposal = backend.get_proposal(proposal_uuid) - if not proposal: - print(f"❌ Proposal {proposal_id} not found in database") - return - - print(f"Found proposal: {proposal_id}") - - # extract proposal into a consistent object - proposal_info_for_evaluation = { - "proposal_number": proposal.proposal_id, - "title": proposal.title, - # "content": proposal.content, # includes metadata/tags - "summary": 
proposal.summary, # just post and ref link - "created_at_timestamp": proposal.created_at, - "created_at_btc_block": proposal.created_btc, - "executable_at_btc_block": proposal.exec_start, - "x_url": proposal.x_url, - # "tweet_id": proposal.tweet_id, # used internally - # "tags": proposal.tags, # noise at this point? - "tx_sender": proposal.tx_sender, - } - - for key, value in proposal_info_for_evaluation.items(): - print(f" {key}: {value[:80] if isinstance(value, str) else value}") - - # fetch dao info from proposal info - print("\n" + "=" * 80) - dao = None - dao_info_for_evaluation = None - if proposal.dao_id: - dao = backend.get_dao(proposal.dao_id) - - if not dao: - print(f"❌ DAO {proposal.dao_id} not found for proposal {proposal_id}") - return - - print(f"Found related DAO: {dao.name} (ID: {dao.id})") - - dao_info_for_evaluation = { - "dao_id": dao.id, - "name": dao.name, - "mission": dao.mission, - } - - for key, value in dao_info_for_evaluation.items(): - print(f" {key}: {value[:80] if isinstance(value, str) else value}") - - # fetch tweet info from DB - print("\n" + "=" * 80) - tweet_info_for_evaluation = None - - if proposal.tweet_id: - tweet_content = backend.get_x_tweet(proposal.tweet_id) - print(f"Fetched tweet info for tweet ID: {proposal.tweet_id}") - else: - tweet_content = None - print("No tweet ID associated with this proposal") - - if tweet_content: - tweet_info_for_evaluation = { - # "message": tweet_content.message, # already in summary - # "author_id": tweet_content.author_id, # local to our DB - "x_post_id": tweet_content.tweet_id, - # "conversation_id": tweet_content.conversation_id, # verify used? - "images": tweet_content.images, - "author_name": tweet_content.author_name, - "author_username": tweet_content.author_username, - "created_at": tweet_content.created_at_twitter, - "public_metrics": tweet_content.public_metrics, - # "entities": tweet_content.entities, - # "attachments": tweet_content.attachments, - "quoted_tweet_id": tweet_content.quoted_tweet_id, - "in_reply_to_user_id": tweet_content.in_reply_to_user_id, - "replied_to_tweet_id": tweet_content.replied_to_tweet_id, - } - - for key, value in (tweet_info_for_evaluation or {}).items(): - print(f" {key}: {value[:100] if isinstance(value, str) else value}") - - # fetch tweet author info from db (if present) - print("\n" + "=" * 80) - tweet_author_info_for_evaluation = None - - if tweet_content and tweet_content.author_id: - tweet_author_id = tweet_content.author_id - tweet_author_content = backend.get_x_user(tweet_author_id) - print(f"Fetched tweet author info for user ID: {tweet_author_id}") - - if tweet_author_content: - tweet_author_info_for_evaluation = { - "user_id": tweet_author_content.user_id, - "name": tweet_author_content.name, - "username": tweet_author_content.username, - "description": tweet_author_content.description, - "verified": tweet_author_content.verified, - "verified_type": tweet_author_content.verified_type, - "location": tweet_author_content.location, - } - - for key, value in tweet_author_info_for_evaluation.items(): - print( - f" {key}: {value[:100] if isinstance(value, str) else value}" - ) - else: - print("❌ Could not fetch tweet author content") - - # fetch quoted tweet from db (if present) - print("\n" + "=" * 80) - quote_tweet_info_for_evaluation = None - - if tweet_content and tweet_content.quoted_tweet_db_id: - quoted_tweet_id = tweet_content.quoted_tweet_db_id - quoted_tweet_content = backend.get_x_tweet(quoted_tweet_id) - print(f"Fetched quoted tweet info for tweet ID: 
{quoted_tweet_id}") - - if quoted_tweet_content: - quote_tweet_info_for_evaluation = { - "x_post_id": quoted_tweet_content.tweet_id, - # "conversation_id": quoted_tweet_content.conversation_id, - "images": quoted_tweet_content.images, - "author_name": quoted_tweet_content.author_name, - "author_username": quoted_tweet_content.author_username, - "created_at": quoted_tweet_content.created_at_twitter, - "public_metrics": quoted_tweet_content.public_metrics, - } - - for key, value in quote_tweet_info_for_evaluation.items(): - print( - f" {key}: {value[:100] if isinstance(value, str) else value}" - ) - else: - print("❌ Could not fetch quoted tweet content") - else: - print("No quoted tweet associated with this tweet") - - # fetch replied-to tweet from db (if present) - print("\n" + "=" * 80) - reply_tweet_info_for_evaluation = None - - if tweet_content and tweet_content.replied_to_tweet_db_id: - replied_to_tweet_id = tweet_content.replied_to_tweet_db_id - replied_to_tweet_content = backend.get_x_tweet(replied_to_tweet_id) - print(f"Fetched replied-to tweet info for tweet ID: {replied_to_tweet_id}") - - if replied_to_tweet_content: - reply_tweet_info_for_evaluation = { - "x_post_id": replied_to_tweet_content.tweet_id, - # "conversation_id": replied_to_tweet_content.conversation_id, - "images": replied_to_tweet_content.images, - "author_name": replied_to_tweet_content.author_name, - "author_username": replied_to_tweet_content.author_username, - "created_at": replied_to_tweet_content.created_at_twitter, - "public_metrics": replied_to_tweet_content.public_metrics, - } - - for key, value in reply_tweet_info_for_evaluation.items(): - print( - f" {key}: {value[:100] if isinstance(value, str) else value}" - ) - else: - print("❌ Could not fetch replied-to tweet content") - else: - print("No replied-to tweet associated with this tweet") - - # fetch past proposals for context - print("\n" + "=" * 80) - dao_past_proposals_categorized = None - dao_past_proposals_stats_for_evaluation = None - dao_draft_proposals_for_evaluation = None - dao_deployed_proposals_for_evaluation = None - - # get all proposals for the dao - dao_proposals = backend.list_proposals(ProposalFilter(dao_id=proposal.dao_id)) - # exclude the current proposal - dao_proposals = [p for p in dao_proposals if p.id != proposal.id] - print( - f"Fetched {len(dao_proposals)} past proposals for DAO ID: {proposal.dao_id}" - ) - - # print all proposals - # for p in dao_proposals: - # print( - # f" Proposal ID: {p.id}, Title: {p.title[:50] if isinstance(p.title, str) else p.title}..." 
- # ) - # print(p) - - # match past proposals by same tx_sender - user_past_proposals_for_evaluation = None - if proposal.tx_sender: - user_past_proposals = [ - p for p in dao_proposals if p.tx_sender == proposal.tx_sender - ] - user_past_proposals_for_evaluation = format_proposals_for_context_v2( - user_past_proposals - ) - print("\n" + "=" * 80) - print( - f"Found {len(user_past_proposals)} past proposals by same sender:\n{proposal.tx_sender}" - ) - - # remove tx-sender matched proposals from dao proposals - # if not present then default to full object - dao_past_proposals = [ - p - for p in dao_proposals - if user_past_proposals and p not in user_past_proposals - ] - - sorted_dao_past_proposals = sorted( - dao_past_proposals, - key=lambda p: getattr(p, "created_at", datetime.min), - reverse=True, + evaluation_result = await evaluate_proposal_openrouter( + proposal_uuid, model=model, temperature=0.7, reasoning=True ) - dao_past_proposals_categorized = { - "ALL": sorted_dao_past_proposals, - ContractStatus.DRAFT: [ - p for p in sorted_dao_past_proposals if p.status == ContractStatus.DRAFT - ], - ContractStatus.PENDING: [ - p - for p in sorted_dao_past_proposals - if p.status == ContractStatus.PENDING - ], - ContractStatus.DEPLOYED: [ - p - for p in sorted_dao_past_proposals - if p.status == ContractStatus.DEPLOYED - ], - ContractStatus.FAILED: [ - p - for p in sorted_dao_past_proposals - if p.status == ContractStatus.FAILED - ], - } - - dao_past_proposals_stats_for_evaluation = { - "ALL": len(sorted_dao_past_proposals), - ContractStatus.DRAFT: len( - dao_past_proposals_categorized[ContractStatus.DRAFT] - ), - ContractStatus.PENDING: len( - dao_past_proposals_categorized[ContractStatus.PENDING] - ), - ContractStatus.DEPLOYED: len( - dao_past_proposals_categorized[ContractStatus.DEPLOYED] - ), - ContractStatus.FAILED: len( - dao_past_proposals_categorized[ContractStatus.FAILED] - ), - } - - print("Stats:", dao_past_proposals_stats_for_evaluation) - - # limit to last 20 - dao_draft_proposals = dao_past_proposals_categorized[ContractStatus.DRAFT][:20] - - dao_draft_proposals_for_evaluation = format_proposals_for_context_v2( - dao_draft_proposals - ) - - print("\n" + "=" * 80) - print( - f"Using {len(dao_draft_proposals)} DAO draft proposals for evaluation context" - ) - - # limit to last 100 - dao_deployed_proposals = dao_past_proposals_categorized[ - ContractStatus.DEPLOYED - ][:100] - - dao_deployed_proposals_for_evaluation = format_proposals_for_context_v2( - dao_deployed_proposals - ) - - print("\n" + "=" * 80) - print( - f"Using {len(dao_deployed_proposals)} DAO deployed proposals for evaluation context" - ) - - # add images in format so grok will read them - # this should be appended to user chat object - print("\n" + "=" * 80) - images_for_evaluation = [] - - if tweet_content and tweet_content.images: - for img_url in tweet_content.images: - # Basic validation of URL - parsed_url = urlparse(img_url) - if parsed_url.scheme in ["http", "https"]: - images_for_evaluation.append( - { - "type": "image_url", - "image_url": {"url": img_url, "detail": "auto"}, - } - ) - else: - print(f"❌ Invalid image URL skipped: {img_url}") - - print(f"Prepared {len(images_for_evaluation)} images for AI evaluation") - - # determine prompt type based on DAO - print("\n" + "=" * 80) - system_prompt = EVALUATION_GROK_SYSTEM_PROMPT - user_prompt = EVALUATION_GROK_USER_PROMPT_TEMPLATE - - # fail if not found - if not system_prompt or not user_prompt: - print("❌ Could not load prompts for evaluation") + if not 
evaluation_result: + print("❌ Evaluation failed") return - # list of dict with str keys and values that are either str or list of dicts - # say that 10 times fast (using it based on format in Full LLM Messages in Eval)) - messages: list[dict[str, str | list[dict[str, str]]]] = [] - - system_content = system_prompt - - messages = [{"role": "system", "content": system_content}] - - formatted_user_content = user_prompt.format( - dao_info_for_evaluation=dao_info_for_evaluation, - proposal_content_for_evaluation=proposal_info_for_evaluation, - tweet_info_for_evaluation=tweet_info_for_evaluation, - tweet_author_info_for_evaluation=tweet_author_info_for_evaluation, - quote_tweet_info_for_evaluation=quote_tweet_info_for_evaluation, - reply_tweet_info_for_evaluation=reply_tweet_info_for_evaluation, - dao_past_proposals_stats_for_evaluation=dao_past_proposals_stats_for_evaluation, - user_past_proposals_for_evaluation=user_past_proposals_for_evaluation, - dao_draft_proposals_for_evaluation=dao_draft_proposals_for_evaluation, - dao_deployed_proposals_for_evaluation=dao_deployed_proposals_for_evaluation, - ) - - user_content = [{"type": "text", "text": formatted_user_content}] - - if len(images_for_evaluation) > 0: - for image in images_for_evaluation: - user_content.append(image) - - messages.append({"role": "user", "content": user_content}) - print("\n" + "=" * 80) - print("Final formatted messages to be sent to LLM:") - print(messages) - - # NEXT STEP: call OpenRouter with this data, see if it processes - # then we work backwards to how we want to implement it in the other areas - # also add these headers to any OpenRouter methods: - # 'default_headers': {'HTTP-Referer': 'https://aibtc.com', 'X-Title': 'AIBTC'} - - x_ai_tools = [{"type": "web_search"}, {"type": "x_search"}] - - openrouter_response = await call_openrouter( - messages=messages, model=model or None, temperature=0.7, tools=x_ai_tools - ) - - print("\n" + "=" * 80) - print("OpenRouter Full API Response:") - print(openrouter_response) - - print("\n" + "=" * 80) - print("OpenRouter Response Breakdown:") - - response_id = openrouter_response.get("id") - response_provider = openrouter_response.get("provider") - response_model = openrouter_response.get("model") - response_usage = openrouter_response.get("usage") - response_usage_total_tokens = ( - response_usage.get("total_tokens") if response_usage else None - ) + # print("Evaluation Result:") + # print(json.dumps(evaluation_result, indent=2)) - print(f"Response ID: {response_id}") - print(f"Response Provider: {response_provider}") - print(f"Response Model: {response_model}") - print(f"Response Usage: {response_usage_total_tokens} total tokens") - - response_choices = openrouter_response.get("choices", []) - - if len(response_choices) == 0: - print("❌ No choices returned in OpenRouter response") - return - - if len(response_choices) > 1: - print( - f"⚠️ Multiple choices returned ({len(response_choices)}), using the first one." 
- ) - - first_choice = response_choices[0] - - choice_finish_reason = first_choice.get("finish_reason") - choice_native_finish_reason = first_choice.get("native_finish_reason") - - print(f"Choice Finish Reason: {choice_finish_reason}") - print(f"Choice Native Finish Reason: {choice_native_finish_reason}") - - print("\n" + "=" * 80) - print("Parsing JSON from message content") - - choice_message = first_choice.get("message") - if not choice_message: - print("❌ No message found in the first choice") - return - - if not isinstance(choice_message, dict): - print("❌ Choice message is not a dictionary") - - choice_message_role = choice_message.get("role") - choice_message_content = choice_message.get("content") - choice_message_refusal = choice_message.get("refusal") - choice_message_reasoning = choice_message.get("reasoning") - choice_message_reasoning_details = choice_message.get("reasoning_details") - - if ( - choice_message_reasoning_details - and len(choice_message_reasoning_details) > 1 - ): - print( - f"⚠️ Multiple reasoning details returned ({len(choice_message_reasoning_details)}), using the first one." + if save_output: + output_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "evals", ) + os.makedirs(output_dir, exist_ok=True) + + # New naming: YYYYMMDD-HHMMSS_{short_proposal_id}_summary.json + now = datetime.now() + timestamp = now.strftime("%Y%m%d-%H%M%S") # e.g., 20251118-160840 + short_id = proposal_id[:8] # First 8 chars of UUID + output_filename = f"{timestamp}_{short_id}_summary.json" + output_path = os.path.join(output_dir, output_filename) + + evaluation_output = evaluation_result["evaluation_output"] + if hasattr(evaluation_output, "model_dump"): + evaluation_output_dumped = evaluation_output.model_dump() + else: + evaluation_output_dumped = evaluation_output # already dumped + + output_data = { + "timestamp": datetime.now().isoformat(), + "results": [ + { + "proposal_id": proposal_id, + "evaluation_output": evaluation_output_dumped, + "full_system_prompt": evaluation_result.get( + "full_system_prompt" + ), + "full_user_prompt": evaluation_result.get("full_user_prompt"), + "full_messages": evaluation_result.get("full_messages"), + } + ], + } - choice_annotations = choice_message.get("annotations") - - print(f"Choice Message Role: {choice_message_role}") - print(f"Choice Message Refusal: {choice_message_refusal}") - print(f"Choice Message Reasoning: {choice_message_reasoning}") - print( - f"Choice Annotations: {len(choice_annotations) if choice_annotations else 0}" - ) - choice_annotations_urls = [] - if choice_annotations: - for annotation in choice_annotations: - if annotation.get("type") == "url_citation": - url_citation = annotation.get("url_citation") - if url_citation: - url = url_citation.get("url") - if url: - choice_annotations_urls.append(url) - else: - print(f"Unknown annotation type: {annotation.get('type')}") - - if len(choice_annotations_urls) > 0: - print(" URLs cited in annotations:") - for url in choice_annotations_urls: - print(f" {url}") - - if not choice_message_content: - print("❌ No content found in the choice message") - - if not isinstance(choice_message_content, str): - print("❌ Choice message content is not a string") - - try: - choice_message_json = json.loads(choice_message_content) - print("\n" + "=" * 80) - print("Successfully parsed JSON from message content") - print(json.dumps(choice_message_json, indent=2)) - - if save_output: - # save to file - output_dir = os.path.join( - 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - "evals", - ) - os.makedirs(output_dir, exist_ok=True) - output_filename = f"evaluation_openrouter_{proposal_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" - output_path = os.path.join(output_dir, output_filename) - - with open(output_path, "w") as f: - json.dump(choice_message_json, f, indent=2) - - print(f"\nSaved evaluation output to: {output_path}") - except json.JSONDecodeError as e: - print(f"❌ JSON decoding error: {e}") - return + with open(output_path, "w") as f: + json.dump(output_data, f, indent=2) - return + print(f"\nSaved evaluation output to: {output_path}") except ValueError as e: print(f"❌ Invalid UUID format: {e}") diff --git a/scripts/test_proposal_evaluation_v3.py b/scripts/test_proposal_evaluation_v3.py index b37fa6b5..f6490d5d 100644 --- a/scripts/test_proposal_evaluation_v3.py +++ b/scripts/test_proposal_evaluation_v3.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 + +import os + """ CLI test script for proposal evaluations using the v3 strict workflow. @@ -14,7 +17,6 @@ import asyncio import json import logging -import os import sys from datetime import datetime from typing import Dict, Any @@ -24,10 +26,15 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app.lib.logger import StructuredFormatter, setup_uvicorn_logging -from app.services.ai.simple_workflows.orchestrator import evaluate_proposal_strict +from app.services.ai.simple_workflows.evaluation_openrouter_v2 import ( + evaluate_proposal_openrouter, + EvaluationOutput, +) from app.backend.factory import get_backend from scripts.generate_evals_manifest import generate_manifest +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + class Tee(object): def __init__(self, *files): @@ -84,7 +91,9 @@ async def evaluate_single_proposal( tee_stderr = original_stderr if args.save_output: prop_short_id = short_uuid(proposal_id) - log_filename = f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_log.txt" + log_filename = os.path.join( + ROOT_DIR, f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_log.txt" + ) log_f = open(log_filename, "w") tee_stdout = Tee(original_stdout, log_f) tee_stderr = Tee(original_stderr, log_f) @@ -112,7 +121,7 @@ async def evaluate_single_proposal( proposal_uuid = UUID(proposal_id) print(f"📋 Evaluating proposal {index}: {proposal_id}") - result = await evaluate_proposal_strict( + result = await evaluate_proposal_openrouter( proposal_id=proposal_uuid, model=args.model, temperature=args.temperature, @@ -121,7 +130,14 @@ async def evaluate_single_proposal( if not result: error_msg = f"Evaluation failed for proposal {proposal_id}" - print(result) + print(error_msg) + return {"proposal_id": proposal_id, "error": error_msg} + + # Validate EvaluationOutput + try: + EvaluationOutput(**result["evaluation_output"]) + except ValueError as ve: + error_msg = f"Validation failed for EvaluationOutput: {str(ve)}" print(error_msg) return {"proposal_id": proposal_id, "error": error_msg} @@ -135,13 +151,16 @@ async def evaluate_single_proposal( result_dict = { "proposal_id": proposal_id, "expected_decision": expected_dec, - "evaluation_output": result.model_dump(), # Raw as dict + "evaluation_output": result["evaluation_output"], # Raw as dict + "full_system_prompt": result.get("full_system_prompt", "N/A"), + "full_user_prompt": result.get("full_user_prompt", "N/A"), + "full_messages": result.get("full_messages", []), } # Save JSON if requested if args.save_output: - json_filename = ( - 
f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_raw.json" + json_filename = os.path.join( + ROOT_DIR, f"evals/{timestamp}_prop{index:02d}_{prop_short_id}_raw.json" ) with open(json_filename, "w") as f: json.dump(result_dict, f, indent=2, default=str) @@ -180,7 +199,7 @@ def generate_summary( print("=" * 60) if save_output: - summary_json = f"evals/{timestamp}_summary.json" + summary_json = os.path.join(ROOT_DIR, f"evals/{timestamp}_summary.json") with open(summary_json, "w") as f: json.dump(summary, f, indent=2, default=str) print(f"✅ Summary saved to {summary_json}") @@ -264,7 +283,7 @@ def main(): timestamp = now.strftime("%Y%m%d_%H%M%S") if args.save_output: - os.makedirs("evals", exist_ok=True) + os.makedirs(os.path.join(ROOT_DIR, "evals"), exist_ok=True) print("🚀 Starting Proposal Evaluation Test V3") print("=" * 60)