Skip to content
11 changes: 8 additions & 3 deletions app/services/ai/simple_workflows/evaluation_openrouter_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ async def evaluate_proposal_openrouter(
model: Optional[str] = None,
temperature: Optional[float] = None,
reasoning: Optional[bool] = None,
) -> Optional[EvaluationOutput]:
) -> Optional[Dict[str, Any]]:
"""
Evaluate a proposal using OpenRouter and Grok prompts.

Expand All @@ -537,7 +537,7 @@ async def evaluate_proposal_openrouter(
temperature: Generation temperature.

Returns:
Parsed EvaluationOutput or None if evaluation fails.
Dict with evaluation output and captured prompts, or None if evaluation fails.
"""
try:
# parse the uuid
Expand Down Expand Up @@ -706,7 +706,12 @@ async def evaluate_proposal_openrouter(

logger.info(f"Successfully evaluated proposal {proposal_id}")

return evaluation_output
return {
"evaluation_output": evaluation_output.model_dump(),
"full_system_prompt": system_prompt,
"full_user_prompt": formatted_user_content,
"full_messages": messages,
}

except json.JSONDecodeError as e:
logger.error(f"JSON decode error: {e}")
Expand Down
25 changes: 25 additions & 0 deletions eval_viewer_v2.html
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,9 @@ <h1 class="text-3xl font-bold text-gray-900">Eval Viewer V2</h1>
expected_decision: item.expected_decision || null,
mismatch: false,
usage: { input_tokens: "0", output_tokens: "0", est_cost: "$0.000000" },
full_system_prompt: item.full_system_prompt || "N/A",
full_user_prompt: item.full_user_prompt || "N/A",
full_messages: item.full_messages || [],
};
}

Expand Down Expand Up @@ -285,6 +288,9 @@ <h1 class="text-3xl font-bold text-gray-900">Eval Viewer V2</h1>
output_tokens: evalOut.usage_output_tokens || "0",
est_cost: evalOut.usage_est_cost || "$0.000000",
},
full_system_prompt: item.full_system_prompt || "N/A",
full_user_prompt: item.full_user_prompt || "N/A",
full_messages: item.full_messages || [],
};
});

Expand Down Expand Up @@ -692,6 +698,25 @@ <h3 class="font-bold mb-2">Category Averages</h3>
`;
details.appendChild(usageInfo);

// Prompt info (mirroring original viewer)
const promptInfo = document.createElement("div");
promptInfo.className = "space-y-2 mb-4";
promptInfo.innerHTML = `
<details class="bg-gray-50 p-3 rounded-md">
<summary class="font-medium text-gray-700 cursor-pointer">Full System Prompt</summary>
<pre class="mt-2 text-sm text-gray-600 whitespace-pre-wrap overflow-x-auto">${escapeHtml(prop.full_system_prompt)}</pre>
</details>
<details class="bg-gray-50 p-3 rounded-md">
<summary class="font-medium text-gray-700 cursor-pointer">Full User Prompt</summary>
<pre class="mt-2 text-sm text-gray-600 whitespace-pre-wrap overflow-x-auto">${escapeHtml(prop.full_user_prompt)}</pre>
</details>
<details class="bg-gray-50 p-3 rounded-md">
<summary class="font-medium text-gray-700 cursor-pointer">Full LLM Messages</summary>
<pre class="mt-2 text-sm text-gray-600 whitespace-pre-wrap overflow-x-auto">${escapeHtml(JSON.stringify(prop.full_messages, null, 2))}</pre>
</details>
`;
details.appendChild(promptInfo);

// Categories table
details.innerHTML += `
<div class="border-t border-gray-200 pt-4 overflow-x-auto">
Expand Down
31 changes: 23 additions & 8 deletions scripts/generate_evals_manifest.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,44 @@
#!/usr/bin/env python3

import os

"""
Utility script to generate or update evals-manifest.json based on contents of ./evals/.
Scans for files named YYYYMMDD_HHMMSS_summary.json and creates a manifest entry for each,
with its path and a display name (the timestamp formatted as YYYY-MM-DD HH:MM:SS).
"""

import json
import os
from datetime import datetime

import re


def generate_manifest(evals_dir="./evals", manifest_path="./evals/evals-manifest.json"):
"""Generate manifest from JSON files in evals_dir."""
"""Generate manifest from JSON files in evals_dir matching new pattern."""
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
evals_dir = os.path.join(ROOT_DIR, evals_dir)
manifest_path = os.path.join(ROOT_DIR, manifest_path)
manifest = []
timestamp_pattern = re.compile(
r"^(\d{8}_\d{6})_summary\.json$"
) # Matches YYYYMMDD_HHMMSS_summary.json

for filename in os.listdir(evals_dir):
if filename.endswith("_summary.json"):
timestamp_str = filename.split("_")[0] # Extract YYYYMMDD_HHMMSS
match = timestamp_pattern.match(filename)
if match:
timestamp_str = match.group(1) # e.g., 20251118_160840
try:
timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
name = timestamp.strftime("%Y-%m-%d %H:%M:%S")
# Parse YYYYMMDD_HHMMSS
dt = datetime.strptime(timestamp_str.replace("_", ""), "%Y%m%d%H%M%S")
name = dt.strftime("%Y-%m-%d %H:%M:%S") # Display format
except ValueError:
name = filename
manifest.append({"path": f"./evals/{filename}", "name": name})

# Sort by timestamp descending
manifest.sort(key=lambda x: x["name"], reverse=True)
# Sort by parsed datetime descending
manifest.sort(
key=lambda x: datetime.strptime(x["name"], "%Y-%m-%d %H:%M:%S"), reverse=True
)

os.makedirs(os.path.dirname(manifest_path), exist_ok=True)
with open(manifest_path, "w") as f:
Expand Down
Loading