From c8ab69365cf8612889933429e847ab68b23cb3e6 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 27 Aug 2025 15:30:01 -0700 Subject: [PATCH] test_import_logs --- eval_protocol/models.py | 4 +-- eval_protocol/pytest/evaluation_test.py | 4 +-- tests/pytest/data/import_data.jsonl | 2 ++ .../test_pytest_import_external_logs.py | 33 +++++++++++++++++++ 4 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 tests/pytest/data/import_data.jsonl create mode 100644 tests/pytest/test_pytest_import_external_logs.py diff --git a/eval_protocol/models.py b/eval_protocol/models.py index f930c717..c6c76b88 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -422,10 +422,10 @@ class InputMetadata(BaseModel): default_factory=dict, description="Completion endpoint parameters used" ) dataset_info: Optional[Dict[str, Any]] = Field( - None, description="Dataset row details: seed, system_prompt, environment_context, etc" + default=None, description="Dataset row details: seed, system_prompt, environment_context, etc" ) session_data: Optional[Dict[str, Any]] = Field( - None, description="Session metadata like timestamp (input only, no duration/usage)" + default=None, description="Session metadata like timestamp (input only, no duration/usage)" ) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 47d98eb6..dc152252 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -242,7 +242,7 @@ def postprocess( def evaluation_test( # noqa: C901 *, - completion_params: List[CompletionParams], + completion_params: List[CompletionParams] = [None], input_messages: Optional[List[InputMessagesParam]] = None, input_dataset: Optional[List[DatasetPathParam]] = None, input_rows: Optional[List[EvaluationRow]] = None, @@ -359,8 +359,6 @@ def decorator( threshold = None sig = inspect.signature(test_func) - if not completion_params: - raise ValueError("completion_params is required") # For pointwise/groupwise mode, we expect a different signature # we expect single row to be passed in as the original row diff --git a/tests/pytest/data/import_data.jsonl b/tests/pytest/data/import_data.jsonl new file mode 100644 index 00000000..06a4ad9d --- /dev/null +++ b/tests/pytest/data/import_data.jsonl @@ -0,0 +1,2 @@ +{"step": 0, "prompt_id": 0, "generation_id": 4, "status": "success", "start_time": 1755699131.2267165, "end_time": 1755699159.6630864, "total_duration": 28.43636989593506, "conversation_turns": 1, "tool_calls_count": 0, "tool_calls_success": 0, "tool_calls_failed": 0, "final_score": 0.05, "grading_details": "The agent completely failed to utilize any of the available tools despite having relevant options like batch_web_search and scholar_search that could have provided authoritative sources on the aluminum/aluminium distinction. The answer provided appears to be based on pre-existing knowledge rather than tool-assisted research, which directly contradicts the task requirement of evaluating a 'tool-assisted problem-solving process.' The agent shows no tool selection strategy, no execution of any tools, and no integration of tool outputs. However, the final answer itself is factually correct and well-structured, explaining the spelling difference, IUPAC conventions, and regional usage patterns. This demonstrates subject knowledge but represents a fundamental misunderstanding of the task requirements. The agent needed to search for sources, potentially crawl relevant websites, and synthesize findings from multiple tools to properly complete this task. The complete absence of tool usage when tools were explicitly available and expected makes this a clear failure in the context of tool-assisted problem-solving.", "grading_success": true, "grading_error": null, "grading_duration": 0.0, "reasoning_steps": [], "original_query": "Differences Between Aluminum and Aluminium Explainedx", "final_answer": "The difference between \"aluminum\" and \"aluminium\" is primarily one of **spelling convention rather than any intrinsic chemical difference**. The International Union of Pure and Applied Chemistry (IUPAC) officially adopts **\"aluminium\"** with the '-ium' suffix, aligning with other metallic elements like sodium, potassium, and magnesium. However, **\"aluminum\"** is the standard spelling in American English, reflecting historical spelling conventions that were prevalent in the United States during the 19th century. Both terms refer to the same element (atomic number 13), and there is **no difference in physical or chemical properties** between the element described as aluminum and that described as aluminium.\n\nWould you like a more detailed breakdown of the historical and cultural factors behind this spelling divergence?", "error_type": null, "error_message": null, "error_traceback": null, "llm_calls": 1, "llm_tokens_input": 0, "llm_tokens_output": 0, "llm_timeouts": 0, "llm_duration": 4.259281158447266, "tool_call_details": [], "conversation_messages": [{"role": "user", "content": "Differences Between Aluminum and Aluminium Explainedx"}, {"role": "assistant", "content": "The difference between \"aluminum\" and \"aluminium\" is primarily one of **spelling convention rather than any intrinsic chemical difference**. The International Union of Pure and Applied Chemistry (IUPAC) officially adopts **\"aluminium\"** with the '-ium' suffix, aligning with other metallic elements like sodium, potassium, and magnesium. However, **\"aluminum\"** is the standard spelling in American English, reflecting historical spelling conventions that were prevalent in the United States during the 19th century. Both terms refer to the same element (atomic number 13), and there is **no difference in physical or chemical properties** between the element described as aluminum and that described as aluminium.\n\nWould you like a more detailed breakdown of the historical and cultural factors behind this spelling divergence?"}], "response_length": 828, "tools": null} +{"step": 0, "prompt_id": 1, "generation_id": 4, "status": "success", "start_time": 1755699131.2445304, "end_time": 1755699161.9821692, "total_duration": 30.73763871192932, "conversation_turns": 1, "tool_calls_count": 0, "tool_calls_success": 0, "tool_calls_failed": 0, "final_score": 0.15, "grading_details": "The agent completely failed to use any available tools despite having access to relevant search capabilities. When presented with the ambiguous query 'Man with Timid Sword Reluctantly Pulling Away,' the agent did not attempt any web searches, scholarly searches, or URL crawling to gather information about this phrase's potential meanings, origins, or cultural significance. Instead, it immediately defaulted to asking clarifying questions without first exploring what information might be available. The agent showed good judgment in recognizing the phrase could have multiple interpretations (artistic, literary, psychological), but this recognition should have prompted tool usage to gather context. A competent approach would have involved using batch_web_search to explore different interpretations, possibly scholar_search for literary or cultural analysis, and then synthesizing findings before requesting clarification. The complete absence of tool usage represents a fundamental failure in the task execution, though the agent's interpretive thinking about the phrase shows some analytical capability.", "grading_success": true, "grading_error": null, "grading_duration": 0.0, "reasoning_steps": [], "original_query": "Man with Timid Sword Reluctantly Pulling Away", "final_answer": "I recognize this phrase as a powerful visual metaphor that could represent various emotional or psychological states - from withdrawal from conflict to hesitation about taking action. To help you better, could you clarify what you're looking for?\n\nAre you wanting:\n- **Art/image creation** to visualize this scene?\n- **Literary analysis** of this concept?\n- **Historical/cultural context** about similar motifs?\n- **Psychological interpretation** of what this represents?\n- **Something else entirely**?\n\nIf you're looking for visual art, I can search for similar artistic representations or help create something inspired by this evocative phrase.", "error_type": null, "error_message": null, "error_traceback": null, "llm_calls": 1, "llm_tokens_input": 0, "llm_tokens_output": 0, "llm_timeouts": 0, "llm_duration": 3.486870288848877, "tool_call_details": [], "conversation_messages": [{"role": "user", "content": "Man with Timid Sword Reluctantly Pulling Away"}, {"role": "assistant", "content": "I recognize this phrase as a powerful visual metaphor that could represent various emotional or psychological states - from withdrawal from conflict to hesitation about taking action. To help you better, could you clarify what you're looking for?\n\nAre you wanting:\n- **Art/image creation** to visualize this scene?\n- **Literary analysis** of this concept?\n- **Historical/cultural context** about similar motifs?\n- **Psychological interpretation** of what this represents?\n- **Something else entirely**?\n\nIf you're looking for visual art, I can search for similar artistic representations or help create something inspired by this evocative phrase."}], "response_length": 647, "tools": null} diff --git a/tests/pytest/test_pytest_import_external_logs.py b/tests/pytest/test_pytest_import_external_logs.py new file mode 100644 index 00000000..3bcb8e23 --- /dev/null +++ b/tests/pytest/test_pytest_import_external_logs.py @@ -0,0 +1,33 @@ +from eval_protocol.pytest import evaluation_test +import os +from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata +from typing import List +from typing import Dict, Any + + +def jsonl_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: + eval_rows: List[EvaluationRow] = [] + for row in rows: + eval_rows.append( + EvaluationRow( + messages=row["conversation_messages"], + input_metadata=InputMetadata( + row_id=str(row["prompt_id"]), + ), + evaluation_result=EvaluateResult(score=row["final_score"]), + ) + ) + return eval_rows + + +@evaluation_test( + input_dataset=[os.path.join(os.path.dirname(__file__), "data", "import_data.jsonl")], + dataset_adapter=jsonl_dataset_adapter, +) +def test_import_logs(row: EvaluationRow) -> EvaluationRow: + """ + The existence of this test ensures that importing external logs from + arbitrary jsonl files works by just specifying input_dataset to file path + and a dataset_adapter function. + """ + return row