diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fee90a1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.pyc
+.env
+venv/
+evaluation_results/
diff --git a/eval.ipynb b/eval.ipynb
deleted file mode 100644
index 7f83fd6..0000000
--- a/eval.ipynb
+++ /dev/null
@@ -1,407 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "id": "GsE2oHdaOQg0"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Your browser has been opened to visit:\n",
- "\n",
- " https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=PQQrVSNVxh0pI6M4Botw6XtdMgRhrP&access_type=offline&code_challenge=mkaFsQA8NMnJ8JWIBhXIpvqHOfi10QVmdqHRxG0QspU&code_challenge_method=S256\n",
- "\n",
- "\n",
- "You are now logged in as [duyguider@google.com].\n",
- "Your current project is [claims-assistant-fsa]. You can change this setting by running:\n",
- " $ gcloud config set project PROJECT_ID\n",
- "\n",
- "\n",
- "Updates are available for some Google Cloud CLI components. To install them,\n",
- "please run:\n",
- " $ gcloud components update\n",
- "\n",
- "\n",
- "\n",
- "To take a quick anonymous survey, run:\n",
- " $ gcloud survey\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!gcloud auth login"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 356
- },
- "id": "UmoJCyc_OGmT",
- "outputId": "8a266eaa-0bf0-4517-a964-1963ca3b95be"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import re\n",
- "import requests\n",
- "import json\n",
- "from agents import Runner, Agent\n",
- "from datetime import datetime\n",
- "from agents.sessions.in_memory_session_service import InMemorySessionService\n",
- "from agents.artifacts.in_memory_artifact_service import InMemoryArtifactService\n",
- "from google.generativeai import GenerativeModel, Part, Content, GenerationConfig # Vertex AI Gemini\n",
- "import google.cloud.aiplatform as aiplatform\n",
- "\n",
- "\n",
- "# --- Utility Functions ---\n",
- "\n",
- "def extract_responses_from_code(agent_code: str) -> list:\n",
- " \"\"\"\n",
- " Extracts sample responses from the agent's code (Simplified Placeholder).\n",
- " This is VERY difficult to do reliably without running the code.\n",
- " This version uses a very simple regex, which will likely fail in many cases.\n",
- " \"\"\"\n",
- " # VERY rudimentary attempt to find 'text' parts in responses.\n",
- " matches = re.findall(r\"text=['\\\"](.*?)['\\\"]\", agent_code, re.DOTALL)\n",
- " return matches\n",
- "\n",
- "\n",
- "def parse_evaluation(evaluation_text: str) -> dict:\n",
- " \"\"\"Parses the evaluation text from the Gemini model (Placeholder).\"\"\"\n",
- " scores = {}\n",
- " try:\n",
- " # Attempt to parse as JSON (ideal case)\n",
- " scores = json.loads(evaluation_text)\n",
- " except json.JSONDecodeError:\n",
- " # Fallback: Use regex to find scores (less reliable)\n",
- " match = re.search(r\"fluency:\\s*(\\d+).*coherence:\\s*(\\d+).*relevance:\\s*(\\d+).*helpfulness:\\s*(\\d+)\", evaluation_text, re.IGNORECASE)\n",
- " if match:\n",
- " scores = {\n",
- " \"fluency\": int(match.group(1)),\n",
- " \"coherence\": int(match.group(2)),\n",
- " \"relevance\": int(match.group(3)),\n",
- " \"helpfulness\": int(match.group(4)),\n",
- " }\n",
- " return scores\n",
- "\n",
- "\n",
- "def google_search(query: str) -> dict:\n",
- " \"\"\"Performs a Google Search (using the Custom Search JSON API).\"\"\"\n",
- " try:\n",
- " url = f\"https://www.googleapis.com/customsearch/v1?key={os.environ['GOOGLE_SEARCH_API_KEY']}&cx={os.environ['SEARCH_ENGINE_ID']}&q={query}\"\n",
- " response = requests.get(url)\n",
- " response.raise_for_status() # Raise an exception for bad status codes\n",
- " return response.json()\n",
- " except requests.exceptions.RequestException as e:\n",
- " print(f\"Error during Google Search: {e}\") # Log the error\n",
- " return {} # Return an empty dict\n",
- " except KeyError as e:\n",
- " print(f\"Missing environment variable: {e}\")\n",
- " return {}\n",
- "\n",
- "\n",
- "def extract_claims(text: str) -> list:\n",
- " \"\"\"Extracts factual claims from the agent's response (Placeholder).\"\"\"\n",
- " # Very basic sentence splitting. Real-world claim extraction is MUCH harder.\n",
- " sentences = re.split(r'(? bool:\n",
- " \"\"\"Verifies a claim against search results (Placeholder).\"\"\"\n",
- " # Extremely simplified verification. Real-world verification is complex.\n",
- " if not search_results or 'items' not in search_results:\n",
- " return False #Unable to verify\n",
- "\n",
- " for item in search_results['items']:\n",
- " if claim.lower() in item.get('title', '').lower() or claim.lower() in item.get('snippet', '').lower():\n",
- " return True\n",
- " return False\n",
- "\n",
- "\n",
- "# --- Tool Definitions ---\n",
- "\n",
- "def generation_evaluator(agent_code: str, sample_interactions: list = None) -> dict:\n",
- " \"\"\"Evaluates the generation quality of the agent's responses.\"\"\"\n",
- " model = GenerativeModel(\"gemini-1.5-pro-002\")\n",
- "\n",
- " if sample_interactions is None:\n",
- " sample_responses = extract_responses_from_code(agent_code)\n",
- " else:\n",
- " sample_responses = [interaction['output'] for interaction in sample_interactions]\n",
- "\n",
- " scores = {}\n",
- " for i, response in enumerate(sample_responses):\n",
- " prompt = f\"\"\"Evaluate the following response based on fluency, coherence, relevance, and helpfulness. Provide a score from 0 to 10 for each, where 10 is best.\n",
- "\n",
- " Response:\n",
- " {response}\n",
- "\n",
- " Scores (in JSON format):\n",
- " \"\"\"\n",
- " try:\n",
- " evaluation = model.generate_content(prompt)\n",
- " scores[f\"response_{i}\"] = parse_evaluation(evaluation.text)\n",
- " except Exception as e:\n",
- " print(f\"Error during generation evaluation: {e}\") #Error handeling\n",
- " scores[f\"response_{i}\"] = {\"fluency\": 0, \"coherence\": 0, \"relevance\": 0, \"helpfulness\": 0}\n",
- " return scores\n",
- "\n",
- "\n",
- "def context_evaluator(agent_code: str, sample_interactions: list) -> float:\n",
- " \"\"\"Evaluates the agent's ability to maintain and use context.\"\"\"\n",
- " # Placeholder: In a real implementation, this would involve running the agent's code.\n",
- " # For now, we'll just return a placeholder score based on whether interactions are provided.\n",
- "\n",
- " if not sample_interactions:\n",
- " return 0.0\n",
- "\n",
- " model = GenerativeModel(\"gemini-1.5-pro-002\")\n",
- "\n",
- " conversation_history = \"\"\n",
- " total_context_score = 0\n",
- "\n",
- " for interaction in sample_interactions:\n",
- " user_input = interaction['input']\n",
- " agent_response = interaction['output']\n",
- "\n",
- " prompt = f\"\"\"\n",
- " You are evaluating an AI agent's ability to maintain context in a conversation.\n",
- " Here is the conversation history so far:\n",
- " {conversation_history}\n",
- "\n",
- " The user's latest input is:\n",
- " {user_input}\n",
- "\n",
- " The agent's response is:\n",
- " {agent_response}\n",
- "\n",
- " Rate the agent's response on a scale of 0 to 10 (10 being best) based on how well it uses and maintains context from the previous conversation. Return a single number.\n",
- " \"\"\"\n",
- "\n",
- " try:\n",
- " evaluation = model.generate_content(prompt)\n",
- " score = int(evaluation.text.strip()) # Try to convert to integer\n",
- " total_context_score += score\n",
- " except Exception as e:\n",
- " print(f\"Error during evaluation: {e}\")\n",
- " total_context_score += 0\n",
- "\n",
- " # Update conversation history\n",
- " conversation_history += f\"User: {user_input}\\nAgent: {agent_response}\\n\"\n",
- "\n",
- "\n",
- " return total_context_score / len(sample_interactions) if sample_interactions else 0.0\n",
- "\n",
- "\n",
- "\n",
- "def groundness_evaluator(agent_code: str, sample_interactions: list) -> float:\n",
- " \"\"\"Evaluates the factual accuracy of the agent's responses.\"\"\"\n",
- " if not sample_interactions:\n",
- " return 0.0\n",
- "\n",
- " scores = []\n",
- " for interaction in sample_interactions:\n",
- " agent_response = interaction['output']\n",
- " claims = extract_claims(agent_response)\n",
- "\n",
- " for claim in claims:\n",
- " try:\n",
- " search_results = google_search(claim)\n",
- " if verify_claim(claim, search_results):\n",
- " scores.append(10)\n",
- " else:\n",
- " scores.append(0)\n",
- " except Exception as e:\n",
- " print(f\"Error during groundness evaluation: {e}\")\n",
- " scores.append(0) # Assume ungrounded if error\n",
- "\n",
- " return sum(scores) / len(scores) if scores else 0.0\n",
- "\n",
- "\n",
- "def function_caller_evaluator(agent_code: str, sample_interactions: list) -> float:\n",
- " \"\"\"Evaluates the agent's ability to select and use the correct functions.\"\"\"\n",
- " # Placeholder: Requires dynamic execution of the agent code.\n",
- " if not sample_interactions:\n",
- " return 0.0\n",
- "\n",
- " model = GenerativeModel(\"gemini-1.5-pro-002\")\n",
- " total_function_score = 0\n",
- "\n",
- " for interaction in sample_interactions:\n",
- " user_input = interaction['input']\n",
- " agent_response = interaction['output']\n",
- "\n",
- " prompt = f\"\"\"You are evaluating an AI agent's ability to use function calls.\n",
- " User Input: {user_input}\n",
- " Agent response: {agent_response}\n",
- " Based on the user input, analyze agent's response and rate from 0 to 10 if used the correct functions to get the response (where 10 is best). Return a single number.\n",
- " \"\"\"\n",
- " try:\n",
- " evaluation = model.generate_content(prompt)\n",
- " score = int(evaluation.text.strip()) # Try to convert to integer\n",
- " total_function_score += score\n",
- " except Exception as e:\n",
- " print(f\"Error during evaluation: {e}\")\n",
- " total_function_score += 0\n",
- " return total_function_score / len(sample_interactions) if sample_interactions else 0.0\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "# --- Agent Definition ---\n",
- "judge_agent = Agent(\n",
- " model=\"gemini-1.5-pro-002\", # Use a strong model\n",
- " name=\"AgentEvaluator\",\n",
- " instruction=\"\"\"\n",
- " You are an agent designed to evaluate the quality and correctness of other AI agents.\n",
- " You will receive the source code of a target agent as input, and optionally, sample interactions.\n",
- " Analyze the code and use the provided tools to assess its:\n",
- " - Generation quality (fluency, coherence, relevance, helpfulness)\n",
- " - Context handling (ability to maintain conversation state)\n",
- " - Groundness (factual accuracy)\n",
- " - Function selection (correct use of tools)\n",
- "\n",
- " Report the individual scores from each tool and the final score. The final score should be the average of generation, context, groundness and function caller scores. Each of those scores should be a single number between 0 and 10.\n",
- " \"\"\",\n",
- " tools=[\n",
- " generation_evaluator,\n",
- " context_evaluator,\n",
- " groundness_evaluator,\n",
- " function_caller_evaluator,\n",
- " google_search # Added to be used by groundness_evaluator\n",
- " ],\n",
- " flow='sequential',\n",
- ")\n",
- "\n",
- "# --- Agent Initialization ---\n",
- "session_service = InMemorySessionService()\n",
- "artifact_service = InMemoryArtifactService()\n",
- "runner = Runner(app_name=\"AgentEvaluator\", agent=judge_agent, artifact_service=artifact_service, session_service=session_service)\n",
- "session = session_service.create(app_name=\"AgentEvaluator\", user_id=\"1\")\n",
- "\n",
- "\n",
- "def run_prompt(agent_code: str, sample_interactions: list = None):\n",
- " # Use Vertex AI Content objects\n",
- " content = Content(role='user', parts=[Part.from_text(agent_code)])\n",
- " #print(content) # Remove to avoid noisy output.\n",
- " final_response = None #To get the last response\n",
- " for event in runner.run(\n",
- " session=session,\n",
- " new_message=content,\n",
- " tool_args={\"sample_interactions\": sample_interactions}\n",
- " ):\n",
- " if event.content:\n",
- " #print(event.content) # Optional: Print intermediate steps (Vertex AI format)\n",
- " final_response = event.content\n",
- "\n",
- " return final_response\n",
- "\n",
- "\n",
- "# --- Main Execution ---\n",
- "if __name__ == \"__main__\":\n",
- " # --- Setup API Keys and Vertex AI ---\n",
- " # Best practice: Load from environment variables\n",
- " os.environ[\"GOOGLE_API_KEY\"] = \"YOUR_GOOGLE_API_KEY\" # For Gemini API (still needed for google_search)\n",
- " os.environ[\"GOOGLE_SEARCH_API_KEY\"] = \"YOUR_GOOGLE_SEARCH_API_KEY\" # For Google Custom Search API\n",
- " os.environ[\"SEARCH_ENGINE_ID\"] = \"YOUR_SEARCH_ENGINE_ID\" # Your Custom Search Engine ID\n",
- " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = \"YOUR_GOOGLE_CLOUD_PROJECT\" #Your project id\n",
- " os.environ[\"GOOGLE_CLOUD_LOCATION\"] = \"us-central1\" #The region\n",
- "\n",
- " # Initialize Vertex AI\n",
- " aiplatform.init(project=os.environ[\"GOOGLE_CLOUD_PROJECT\"], location=os.environ[\"GOOGLE_CLOUD_LOCATION\"])\n",
- "\n",
- "\n",
- " # --- Load Target Agent's Code and sample interactions ---\n",
- " # Example usage (replace with the path to your agent's code):\n",
- " try:\n",
- " with open(\"target_agent.py\", \"r\") as f:\n",
- " target_agent_code = f.read()\n",
- " except FileNotFoundError:\n",
- " print(\"Error: target_agent.py not found. Please provide the correct path.\")\n",
- " exit(1) #Exit if can't continue\n",
- "\n",
- " sample_interactions = [\n",
- " {'input': \"What's the weather like today?\",\n",
- " 'output': \"I'm sorry, I don't have the ability to look up weather information.\"},\n",
- " {'input': \"What's 2 + 2?\", 'output': \"2 + 2 = 4\"},\n",
- " {'input': \"Can you tell me the capital of France?\", 'output': \"The capital of France is Paris.\"},\n",
- " {'input': \"And what is the population of Paris?\", 'output': \"The population of Paris is about 2.1 million people.\"}, #Context question\n",
- " {'input':\"What is the exchange rate from USD to EUR?\", 'output': \"The exchange rate of USD to EUR is bla bla.\"}, # Test the function caller\n",
- "\n",
- " ]\n",
- "\n",
- " # --- Run the Evaluation ---\n",
- " evaluation_result = run_prompt(target_agent_code, sample_interactions)\n",
- "\n",
- " # --- Process and Print Results ---\n",
- " if evaluation_result and evaluation_result.parts:\n",
- " # Extract the final text response\n",
- " final_text = evaluation_result.parts[0].text\n",
- " print(\"Evaluation Results:\")\n",
- " print(final_text)\n",
- "\n",
- "\n",
- " #Attempt to extract scores.\n",
- " try:\n",
- "\n",
- " match = re.search(r\"Generation quality score:\\s*([\\d\\.]+).*Context handling score:\\s*([\\d\\.]+).*Groundness score:\\s*([\\d\\.]+).*Function selection score:\\s*([\\d\\.]+).*Final score:\\s*([\\d\\.]+)\", final_text, re.IGNORECASE)\n",
- " if match:\n",
- " print(\"\\nExtracted Scores:\")\n",
- " print(f\" Generation Quality: {float(match.group(1))}\")\n",
- " print(f\" Context Handling: {float(match.group(2))}\")\n",
- " print(f\" Groundness: {float(match.group(3))}\")\n",
- " print(f\" Function Selection: {float(match.group(4))}\")\n",
- " print(f\" Final Score: {float(match.group(5))}\")\n",
- " except Exception as e:\n",
- " print(f\"\\nCould not extract individual scores: {e}\")\n",
- "\n",
- "\n",
- " else:\n",
- " print(\"No evaluation results returned.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Zq_I6jBfOI00"
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "colab": {
- "provenance": []
- },
- "kernelspec": {
- "display_name": "3.12.0",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/main.py b/main.py
deleted file mode 100644
index df44328..0000000
--- a/main.py
+++ /dev/null
@@ -1,337 +0,0 @@
-import streamlit as st
-import json
-import pandas as pd
-import os
-from datetime import datetime
-
-# Set page configuration
-st.set_page_config(
- page_title="AI Response Evaluator",
- page_icon="🤖",
- layout="wide"
-)
-
-# Custom color scheme
-st.markdown("""
-
-""", unsafe_allow_html=True)
-
-# Function to load data from JSONL file
-def load_data(file_path):
- data = []
- try:
- with open(file_path, 'r') as f:
- for line in f:
- data.append(json.loads(line))
- return data
- except FileNotFoundError:
- st.error(f"File not found: {file_path}")
- return []
- except json.JSONDecodeError:
- st.error(f"Invalid JSON format in file: {file_path}")
- return []
-
-# Function to save evaluation results
-def save_evaluation(results, output_file):
- # Create directory if it doesn't exist
- os.makedirs(os.path.dirname(output_file), exist_ok=True)
-
- # Save results to CSV
- df = pd.DataFrame(results)
- df.to_csv(output_file, index=False)
-
- # Also save as JSONL for compatibility
- with open(output_file.replace('.csv', '.jsonl'), 'w') as f:
- for result in results:
- f.write(json.dumps(result) + '\n')
-
-# Main app
-def main():
- # Sidebar styling
- st.sidebar.markdown("""
- Configuration
- """, unsafe_allow_html=True)
-
- # Sidebar content
- with st.sidebar:
- data_file = st.text_input("Evaluation dataset path", "eval_dataset.jsonl")
- output_path = st.text_input("Output directory", "evaluation_results")
- evaluator_name = st.text_input("Evaluator Name", "Evaluator")
- include_alternative = st.checkbox("Include alternative responses", True)
-
- if st.button("Load Data", key="load_data"):
- st.session_state.data = load_data(data_file)
- st.session_state.current_index = 0
- st.session_state.results = []
- st.session_state.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- st.session_state.output_file = f"{output_path}/{evaluator_name}_{st.session_state.timestamp}.csv"
-
- # Initialize a list to track option order for each question
- st.session_state.option_orders = {}
-
- # Initialize session state for selected option
- if 'selected_option' not in st.session_state:
- st.session_state.selected_option = None
-
- # Initialize option orders if not present
- if 'option_orders' not in st.session_state:
- st.session_state.option_orders = {}
-
- # Main content
- st.markdown("""
- Human-in-the-Loop: AI Response Evaluator
- """, unsafe_allow_html=True)
-
- if 'data' not in st.session_state:
- st.info("👈 Please configure the evaluation settings and load data from the sidebar.")
- return
-
- if not st.session_state.data:
- st.warning("No data loaded or empty dataset.")
- return
-
- # Display progress
- total_items = len(st.session_state.data)
- current = st.session_state.current_index + 1
- st.progress(current / total_items)
- st.markdown(f"Question {current} of {total_items}
", unsafe_allow_html=True)
-
- # Get current question
- current_item = st.session_state.data[st.session_state.current_index]
- question_id = f"question_{st.session_state.current_index}"
-
- # Display question
- st.markdown('Question:', unsafe_allow_html=True)
- st.markdown(f'{current_item["input"]}', unsafe_allow_html=True)
-
- # Create options
- options = [
- {"label": "A", "response": current_item["output"]},
- ]
-
- # Add alternative option if enabled
- if include_alternative:
- # For demo, we'll use the next item's output as an alternative
- # In a real scenario, you'd want to use actual alternative responses
- alt_index = (st.session_state.current_index + 1) % len(st.session_state.data)
- alt_response = st.session_state.data[alt_index]["output"]
- options.append({"label": "B", "response": alt_response})
-
- # Display options
- st.markdown('Select the best answer:', unsafe_allow_html=True)
-
- # Create columns for options
- cols = st.columns(len(options))
-
- # Display options with clickable boxes
- for i, option in enumerate(options):
- with cols[i]:
- # Create a unique key for each button
- key = f"option_{option['label']}_{st.session_state.current_index}"
-
- # Check if this option is selected
- is_selected = st.session_state.selected_option == option["label"]
- selected_class = "selected" if is_selected else "unselected"
-
- # Create a clickable container with the option text
- if st.button(
- f"Option {option['label']}",
- key=key,
- help=f"Click to select Option {option['label']}",
- use_container_width=True
- ):
- st.session_state.selected_option = option["label"]
- st.rerun()
-
- # Display the option content
- st.markdown(
- f'Option {option["label"]}: {option["response"]}',
- unsafe_allow_html=True
- )
-
- # Comments with white background
- comments = st.text_area("Additional comments (optional)")
-
- # Navigation buttons
- col1, col2, col3 = st.columns([1, 1, 1])
-
- with col1:
- st.markdown('', unsafe_allow_html=True)
- if st.button("Previous", disabled=st.session_state.current_index == 0):
- # Reset selection for new question
- st.session_state.selected_option = None
- st.session_state.current_index -= 1
- st.rerun()
- st.markdown('', unsafe_allow_html=True)
-
- with col2:
- st.markdown('', unsafe_allow_html=True)
-
- with col3:
- st.markdown('', unsafe_allow_html=True)
- if st.button("Next", disabled=st.session_state.current_index == len(st.session_state.data) - 1):
- # Check if an option is selected
- if st.session_state.selected_option is None:
- st.error("Please select an option before proceeding")
- return
-
- # Save current evaluation
- selected_option = next((option for option in options if option["label"] == st.session_state.selected_option), None)
- is_correct = selected_option["response"] == current_item["output"] if selected_option else False
-
- result = {
- "question_index": st.session_state.current_index,
- "question": current_item["input"],
- "ground_truth": current_item["output"],
- "selected_option": st.session_state.selected_option,
- "selected_response": selected_option["response"] if selected_option else "",
- "is_correct": is_correct,
- "comments": comments,
- "timestamp": datetime.now().isoformat()
- }
-
- st.session_state.results.append(result)
-
- # Reset selection for next question
- st.session_state.selected_option = None
- st.session_state.current_index += 1
- st.rerun()
- st.markdown('', unsafe_allow_html=True)
-
-if __name__ == "__main__":
- main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..686a388
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+streamlit
+pandas
+google-generativeai
+requests
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/app.py b/src/app.py
new file mode 100644
index 0000000..9941b7c
--- /dev/null
+++ b/src/app.py
@@ -0,0 +1,278 @@
+
+import os
+import streamlit as st
+import json
+import pandas as pd
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+
+# --- Configuration & Styling ---
+def configure_page():
+ st.set_page_config(
+ page_title="AI Response Evaluator",
+ page_icon="🤖",
+ layout="wide"
+ )
+
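+    # Custom color scheme: app-wide CSS can be injected via the st.markdown call below.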
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+# --- Data Handling ---
+def load_data(file_path: str) -> List[Dict[str, Any]]:
+ """Loads data from a JSONL file."""
+ if not os.path.exists(file_path):
+ st.error(f"File not found: {file_path}")
+ return []
+
+ data = []
+ try:
+ with open(file_path, 'r', encoding='utf-8') as f:
+ for line_number, line in enumerate(f, 1):
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ data.append(json.loads(line))
+ except json.JSONDecodeError:
+ st.warning(f"Skipping invalid JSON at line {line_number} in {file_path}")
+ except Exception as e:
+ st.error(f"Error reading file: {e}")
+ return []
+
+ return data
+
+def save_evaluation(results: List[Dict[str, Any]], output_file: str):
+ """Saves evaluation results to CSV and JSONL."""
+ try:
+ os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+ # Save to CSV
+ df = pd.DataFrame(results)
+ df.to_csv(output_file, index=False)
+
+ # Save to JSONL
+ jsonl_file = output_file.rsplit('.', 1)[0] + '.jsonl'
+ with open(jsonl_file, 'w', encoding='utf-8') as f:
+ for result in results:
+ f.write(json.dumps(result) + '\n')
+
+ except Exception as e:
+ st.error(f"Failed to save results: {e}")
+
+# --- UI Components ---
+def render_sidebar():
+ st.sidebar.markdown("""
+    Configuration
+ """, unsafe_allow_html=True)
+
+ with st.sidebar:
+ data_file = st.text_input("Evaluation dataset path", "eval_dataset.jsonl")
+ output_path = st.text_input("Output directory", "evaluation_results")
+ evaluator_name = st.text_input("Evaluator Name", "Evaluator")
+
+ # Store configuration in session state
+ st.session_state.include_alternative = st.checkbox("Include alternative responses", True)
+
+ if st.button("Load Data", key="load_data"):
+ data = load_data(data_file)
+ if data:
+ st.session_state.data = data
+ st.session_state.current_index = 0
+ st.session_state.results = []
+ st.session_state.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ st.session_state.output_file = os.path.join(output_path, f"{evaluator_name}_{st.session_state.timestamp}.csv")
+ st.session_state.selected_option = None
+ st.success(f"Loaded {len(data)} items.")
+ else:
+ st.error("No data loaded.")
+
+def render_question(current_item: Dict[str, Any], current_index: int, total_items: int):
+ # Progress
+ st.progress((current_index + 1) / total_items)
+ st.markdown(f"Question {current_index + 1} of {total_items}
", unsafe_allow_html=True)
+
+ # Question
+    st.markdown('Question:', unsafe_allow_html=True)
+    st.markdown(f'{current_item.get("input", "")}', unsafe_allow_html=True)
+
+def get_options(current_item: Dict[str, Any], data: List[Dict[str, Any]], current_index: int, include_alternative: bool):
+ options = [{"label": "A", "response": current_item.get("output", "")}]
+
+ if include_alternative and len(data) > 1:
+ # Simple logic for alternative: next item's output.
+ # In production, this should likely be smarter or pre-computed.
+ alt_index = (current_index + 1) % len(data)
+ alt_response = data[alt_index].get("output", "")
+ options.append({"label": "B", "response": alt_response})
+
+ return options
+
+def render_options(options: List[Dict[str, str]]):
+    st.markdown('Select the best answer:', unsafe_allow_html=True)
+
+ cols = st.columns(len(options))
+
+ for i, option in enumerate(options):
+ with cols[i]:
+ key = f"option_{option['label']}_{st.session_state.current_index}"
+ is_selected = st.session_state.selected_option == option["label"]
+ selected_class = "selected" if is_selected else "unselected"
+
+ if st.button(
+ f"Option {option['label']}",
+ key=key,
+ use_container_width=True
+ ):
+ st.session_state.selected_option = option["label"]
+ st.rerun()
+
+ st.markdown(
+                f'Option {option["label"]}: {option["response"]}',
+ unsafe_allow_html=True
+ )
+
+def save_current_result(current_item: Dict[str, Any], options: List[Dict[str, str]], comments: str):
+ selected_option_data = next((opt for opt in options if opt["label"] == st.session_state.selected_option), None)
+
+ # "is_correct" logic is naive here (matches ground truth).
+ # Adjust if "ground_truth" is not the only correct answer.
+ ground_truth = current_item.get("output", "")
+ is_correct = False
+ if selected_option_data:
+ is_correct = (selected_option_data["response"] == ground_truth)
+
+ result = {
+ "question_index": st.session_state.current_index,
+ "question": current_item.get("input", ""),
+ "ground_truth": ground_truth,
+ "selected_option": st.session_state.selected_option,
+ "selected_response": selected_option_data["response"] if selected_option_data else "",
+ "is_correct": is_correct,
+ "comments": comments,
+ "timestamp": datetime.now().isoformat()
+ }
+
+ # Update or append result
+ # We will just append for now to keep log history.
+ st.session_state.results.append(result)
+ save_evaluation(st.session_state.results, st.session_state.output_file)
+ return True
+
+# --- Main Application ---
+def main():
+ configure_page()
+ render_sidebar()
+
+ # Header
+ st.markdown("""
+ Human-in-the-Loop: AI Response Evaluator
+ """, unsafe_allow_html=True)
+
+ if 'data' not in st.session_state or not st.session_state.data:
+ st.info("👈 Please configure the evaluation settings and load data from the sidebar.")
+ return
+
+ # Helper variables
+ data = st.session_state.data
+ current_index = st.session_state.current_index
+ current_item = data[current_index]
+
+ render_question(current_item, current_index, len(data))
+
+ # Options
+ # Use include_alternative from session_state
+ include_alternative = st.session_state.get('include_alternative', True)
+
+ options = get_options(current_item, data, current_index, include_alternative=include_alternative)
+ render_options(options)
+
+ # Comments
+ comments = st.text_area("Additional comments (optional)")
+
+ # Navigation
+ col1, col2, col3 = st.columns([1, 1, 1])
+
+ with col1:
+ if st.button("Previous", disabled=current_index == 0):
+ st.session_state.selected_option = None
+ st.session_state.current_index -= 1
+ st.rerun()
+
+ with col2:
+ if st.button("Save & Finish"):
+ if st.session_state.selected_option is None:
+ st.error("Please select an option before saving")
+ else:
+ save_current_result(current_item, options, comments)
+
+ # Success Message
+ st.markdown(f"""
+
+                Results saved to: {st.session_state.output_file}
+ """, unsafe_allow_html=True)
+
+ # Summary
+ correct_count = sum(1 for r in st.session_state.results if r["is_correct"])
+ total_evaluated = len(st.session_state.results)
+
+ st.markdown(f"""
+                Evaluation Summary
+
+                Total questions evaluated: {total_evaluated}
+                Correct selections: {correct_count} ({correct_count/total_evaluated*100:.1f}%)
+ """, unsafe_allow_html=True)
+
+ with col3:
+ if st.button("Next", disabled=current_index == len(data) - 1):
+ if st.session_state.selected_option is None:
+ st.error("Please select an option before proceeding")
+ else:
+ save_current_result(current_item, options, comments)
+ st.session_state.selected_option = None
+ st.session_state.current_index += 1
+ st.rerun()
+
+if __name__ == "__main__":
+ main()
diff --git a/src/evaluator.py b/src/evaluator.py
new file mode 100644
index 0000000..adbe577
--- /dev/null
+++ b/src/evaluator.py
@@ -0,0 +1,284 @@
+
+import os
+import re
+import requests
+import json
+import sys
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+
+# Add the parent directory to sys.path to allow imports from src
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+try:
+ from agents import Runner, Agent
+ from agents.sessions.in_memory_session_service import InMemorySessionService
+ from agents.artifacts.in_memory_artifact_service import InMemoryArtifactService
+except ImportError:
+ print("Warning: 'agents' library not found. Using mock implementation.")
+ from src.mock_agents import Runner, Agent, InMemorySessionService, InMemoryArtifactService
+
+import google.generativeai as genai
+from google.generativeai import GenerativeModel, GenerationConfig
+from google.generativeai.protos import Part, Content
+
+# --- Utility Functions ---
+
+def extract_responses_from_code(agent_code: str) -> list:
+ """
+ Extracts sample responses from the agent's code (Simplified Placeholder).
+ """
+ matches = re.findall(r"text=['\"](.*?)['\"]", agent_code, re.DOTALL)
+ return matches
+
+def parse_evaluation(evaluation_text: str) -> dict:
+ """Parses the evaluation text from the Gemini model."""
+ scores = {}
+ try:
+ scores = json.loads(evaluation_text)
+ except json.JSONDecodeError:
+ match = re.search(r"fluency:\s*(\d+).*coherence:\s*(\d+).*relevance:\s*(\d+).*helpfulness:\s*(\d+)", evaluation_text, re.IGNORECASE)
+ if match:
+ scores = {
+ "fluency": int(match.group(1)),
+ "coherence": int(match.group(2)),
+ "relevance": int(match.group(3)),
+ "helpfulness": int(match.group(4)),
+ }
+ return scores
+
+def google_search(query: str) -> dict:
+ """Performs a Google Search (using the Custom Search JSON API)."""
+ api_key = os.environ.get('GOOGLE_SEARCH_API_KEY')
+ cx = os.environ.get('SEARCH_ENGINE_ID')
+
+ if not api_key or not cx:
+ print("Missing GOOGLE_SEARCH_API_KEY or SEARCH_ENGINE_ID environment variables.")
+ return {}
+
+ try:
+ url = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={cx}&q={query}"
+ response = requests.get(url)
+ response.raise_for_status()
+ return response.json()
+ except requests.exceptions.RequestException as e:
+ print(f"Error during Google Search: {e}")
+ return {}
+
+def extract_claims(text: str) -> list:
+ """Extracts factual claims from the agent's response (Placeholder)."""
+    # Very basic sentence splitting; real-world claim extraction is much harder.
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    return [s.strip() for s in sentences if s.strip()]
+
+def verify_claim(claim: str, search_results: dict) -> bool:
+ """Verifies a claim against search results (Placeholder)."""
+ if not search_results or 'items' not in search_results:
+ return False
+
+ for item in search_results['items']:
+ if claim.lower() in item.get('title', '').lower() or claim.lower() in item.get('snippet', '').lower():
+ return True
+ return False
+
+# --- Tool Definitions ---
+
+def generation_evaluator(agent_code: str, sample_interactions: list = None) -> dict:
+ """Evaluates the generation quality of the agent's responses."""
+ try:
+ model = GenerativeModel("gemini-1.5-pro-002")
+ except Exception:
+ print("Warning: Could not initialize GenerativeModel. Check API keys.")
+ return {}
+
+ if sample_interactions is None:
+ sample_responses = extract_responses_from_code(agent_code)
+ else:
+ sample_responses = [interaction['output'] for interaction in sample_interactions]
+
+ scores = {}
+ for i, response in enumerate(sample_responses):
+ prompt = f"""Evaluate the following response based on fluency, coherence, relevance, and helpfulness. Provide a score from 0 to 10 for each, where 10 is best.
+
+ Response:
+ {response}
+
+ Scores (in JSON format):
+ """
+ try:
+ evaluation = model.generate_content(prompt)
+ scores[f"response_{i}"] = parse_evaluation(evaluation.text)
+ except Exception as e:
+ print(f"Error during generation evaluation: {e}")
+ scores[f"response_{i}"] = {"fluency": 0, "coherence": 0, "relevance": 0, "helpfulness": 0}
+ return scores
+
+def context_evaluator(agent_code: str, sample_interactions: list) -> float:
+ """Evaluates the agent's ability to maintain and use context."""
+ if not sample_interactions:
+ return 0.0
+
+ try:
+ model = GenerativeModel("gemini-1.5-pro-002")
+ except Exception:
+ return 0.0
+
+ conversation_history = ""
+ total_context_score = 0
+
+ for interaction in sample_interactions:
+ user_input = interaction['input']
+ agent_response = interaction['output']
+
+ prompt = f"""
+ You are evaluating an AI agent's ability to maintain context in a conversation.
+ Here is the conversation history so far:
+ {conversation_history}
+
+ The user's latest input is:
+ {user_input}
+
+ The agent's response is:
+ {agent_response}
+
+ Rate the agent's response on a scale of 0 to 10 (10 being best) based on how well it uses and maintains context from the previous conversation. Return a single number.
+ """
+
+ try:
+ evaluation = model.generate_content(prompt)
+ score = int(evaluation.text.strip())
+ total_context_score += score
+ except Exception as e:
+ print(f"Error during evaluation: {e}")
+ total_context_score += 0
+
+ conversation_history += f"User: {user_input}\nAgent: {agent_response}\n"
+
+ return total_context_score / len(sample_interactions) if sample_interactions else 0.0
+
+def groundness_evaluator(agent_code: str, sample_interactions: list) -> float:
+ """Evaluates the factual accuracy of the agent's responses."""
+ if not sample_interactions:
+ return 0.0
+
+ scores = []
+ for interaction in sample_interactions:
+ agent_response = interaction['output']
+ claims = extract_claims(agent_response)
+
+ for claim in claims:
+ try:
+ search_results = google_search(claim)
+ if verify_claim(claim, search_results):
+ scores.append(10)
+ else:
+ scores.append(0)
+ except Exception as e:
+ print(f"Error during groundness evaluation: {e}")
+ scores.append(0)
+
+ return sum(scores) / len(scores) if scores else 0.0
+
+def function_caller_evaluator(agent_code: str, sample_interactions: list) -> float:
+ """Evaluates the agent's ability to select and use the correct functions."""
+ if not sample_interactions:
+ return 0.0
+
+ try:
+ model = GenerativeModel("gemini-1.5-pro-002")
+ except Exception:
+ return 0.0
+
+ total_function_score = 0
+
+ for interaction in sample_interactions:
+ user_input = interaction['input']
+ agent_response = interaction['output']
+
+ prompt = f"""You are evaluating an AI agent's ability to use function calls.
+ User Input: {user_input}
+ Agent response: {agent_response}
+    Based on the user input, analyze the agent's response and rate from 0 to 10 how well it used the correct functions to produce the response (where 10 is best). Return a single number.
+ """
+ try:
+ evaluation = model.generate_content(prompt)
+ score = int(evaluation.text.strip())
+ total_function_score += score
+ except Exception as e:
+ print(f"Error during evaluation: {e}")
+ total_function_score += 0
+ return total_function_score / len(sample_interactions) if sample_interactions else 0.0
+
+# --- Main Evaluation Logic ---
+
+def run_evaluation(target_agent_code: str, sample_interactions: list):
+ # Agent Definition
+ judge_agent = Agent(
+ model="gemini-1.5-pro-002",
+ name="AgentEvaluator",
+ instruction="""
+ You are an agent designed to evaluate the quality and correctness of other AI agents.
+ """,
+ tools=[
+ generation_evaluator,
+ context_evaluator,
+ groundness_evaluator,
+ function_caller_evaluator,
+ google_search
+ ],
+ flow='sequential',
+ )
+
+ session_service = InMemorySessionService()
+ artifact_service = InMemoryArtifactService()
+ runner = Runner(app_name="AgentEvaluator", agent=judge_agent, artifact_service=artifact_service, session_service=session_service)
+ session = session_service.create(app_name="AgentEvaluator", user_id="1")
+
+    # The 'agents' library may expect Vertex AI Content/Part objects for messages;
+    # we send plain text here so the call also works with the mock implementation.
+
+ content = f"Evaluate this code: {target_agent_code}"
+
+ final_response = None
+ for event in runner.run(
+ session=session,
+ new_message=content,
+ tool_args={"sample_interactions": sample_interactions}
+ ):
+ if hasattr(event, 'content') and event.content:
+ final_response = event.content
+ print(final_response)
+
+ return final_response
+
+if __name__ == "__main__":
+ # Check for API keys
+ required_env_vars = ["GOOGLE_API_KEY", "GOOGLE_SEARCH_API_KEY", "SEARCH_ENGINE_ID", "GOOGLE_CLOUD_PROJECT"]
+ missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
+
+ if missing_vars:
+ print(f"Warning: Missing environment variables: {', '.join(missing_vars)}")
+ print("Functionality will be limited.")
+
+ # Load target agent code
+ target_agent_path = "target_agent.py"
+ target_agent_code = ""
+ if os.path.exists(target_agent_path):
+ with open(target_agent_path, "r") as f:
+ target_agent_code = f.read()
+ else:
+ print(f"Warning: {target_agent_path} not found. Using default mock agent.")
+ target_agent_code = """
+ def respond(input):
+ return "I am a simple agent."
+ """
+
+ sample_interactions = [
+ {'input': "What's the weather like today?",
+ 'output': "I'm sorry, I don't have the ability to look up weather information."},
+ {'input': "What's 2 + 2?", 'output': "2 + 2 = 4"},
+ ]
+
+ print("Starting evaluation...")
+ result = run_evaluation(target_agent_code, sample_interactions)
+ print("Evaluation complete.")
diff --git a/src/mock_agents.py b/src/mock_agents.py
new file mode 100644
index 0000000..e93c79b
--- /dev/null
+++ b/src/mock_agents.py
@@ -0,0 +1,31 @@
+
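+# Minimal in-memory stand-ins for the 'agents' library, used by src/evaluator.py
+# when the real package is not installed (see the ImportError fallback there).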
+class Agent:
+ def __init__(self, model, name, instruction, tools, flow):
+ self.model = model
+ self.name = name
+ self.instruction = instruction
+ self.tools = tools
+ self.flow = flow
+
+class Runner:
+ def __init__(self, app_name, agent, artifact_service, session_service):
+ self.app_name = app_name
+ self.agent = agent
+ self.artifact_service = artifact_service
+ self.session_service = session_service
+
+ def run(self, session, new_message, tool_args):
+ # Mock run method
+ print(f"Runner: Mocking run for message: {new_message}")
+ yield MockEvent("Mock response from agent")
+
+class MockEvent:
+ def __init__(self, content):
+ self.content = content
+
+class InMemorySessionService:
+ def create(self, app_name, user_id):
+ return "mock_session"
+
+class InMemoryArtifactService:
+ pass
diff --git a/tests/test_app.py b/tests/test_app.py
new file mode 100644
index 0000000..cf9d680
--- /dev/null
+++ b/tests/test_app.py
@@ -0,0 +1,42 @@
+
+import os
+import sys
+
+# Add src to python path for tests
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
+
+import unittest
+from app import load_data, get_options
+import json
+
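+# These tests exercise only the pure data helpers (load_data, get_options);
+# the Streamlit UI itself is not covered.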
+class TestApp(unittest.TestCase):
+ def setUp(self):
+ self.test_file = "test_data.jsonl"
+ with open(self.test_file, "w") as f:
+ f.write(json.dumps({"input": "Q1", "output": "A1"}) + "\n")
+ f.write(json.dumps({"input": "Q2", "output": "A2"}) + "\n")
+
+ def tearDown(self):
+ if os.path.exists(self.test_file):
+ os.remove(self.test_file)
+
+ def test_load_data(self):
+ data = load_data(self.test_file)
+ self.assertEqual(len(data), 2)
+ self.assertEqual(data[0]["input"], "Q1")
+
+ def test_load_data_missing(self):
+ data = load_data("non_existent.jsonl")
+ self.assertEqual(len(data), 0)
+
+ def test_get_options(self):
+ data = [{"input": "Q1", "output": "A1"}, {"input": "Q2", "output": "A2"}]
+ options = get_options(data[0], data, 0, include_alternative=True)
+ self.assertEqual(len(options), 2)
+ self.assertEqual(options[0]["label"], "A")
+ self.assertEqual(options[0]["response"], "A1")
+ self.assertEqual(options[1]["label"], "B")
+ self.assertEqual(options[1]["response"], "A2")
+
+if __name__ == '__main__':
+ unittest.main()