Skip to content

Commit c8ab693

Browse files
author
Dylan Huang
committed
test_import_logs
1 parent c0bece6 commit c8ab693

File tree

4 files changed

+38
-5
lines changed

4 files changed

+38
-5
lines changed

eval_protocol/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -422,10 +422,10 @@ class InputMetadata(BaseModel):
422422
default_factory=dict, description="Completion endpoint parameters used"
423423
)
424424
dataset_info: Optional[Dict[str, Any]] = Field(
425-
None, description="Dataset row details: seed, system_prompt, environment_context, etc"
425+
default=None, description="Dataset row details: seed, system_prompt, environment_context, etc"
426426
)
427427
session_data: Optional[Dict[str, Any]] = Field(
428-
None, description="Session metadata like timestamp (input only, no duration/usage)"
428+
default=None, description="Session metadata like timestamp (input only, no duration/usage)"
429429
)
430430

431431

eval_protocol/pytest/evaluation_test.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def postprocess(
242242

243243
def evaluation_test( # noqa: C901
244244
*,
245-
completion_params: List[CompletionParams],
245+
completion_params: List[CompletionParams] = [None],
246246
input_messages: Optional[List[InputMessagesParam]] = None,
247247
input_dataset: Optional[List[DatasetPathParam]] = None,
248248
input_rows: Optional[List[EvaluationRow]] = None,
@@ -359,8 +359,6 @@ def decorator(
359359
threshold = None
360360

361361
sig = inspect.signature(test_func)
362-
if not completion_params:
363-
raise ValueError("completion_params is required")
364362

365363
# For pointwise/groupwise mode, we expect a different signature
366364
# we expect single row to be passed in as the original row
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"step": 0, "prompt_id": 0, "generation_id": 4, "status": "success", "start_time": 1755699131.2267165, "end_time": 1755699159.6630864, "total_duration": 28.43636989593506, "conversation_turns": 1, "tool_calls_count": 0, "tool_calls_success": 0, "tool_calls_failed": 0, "final_score": 0.05, "grading_details": "The agent completely failed to utilize any of the available tools despite having relevant options like batch_web_search and scholar_search that could have provided authoritative sources on the aluminum/aluminium distinction. The answer provided appears to be based on pre-existing knowledge rather than tool-assisted research, which directly contradicts the task requirement of evaluating a 'tool-assisted problem-solving process.' The agent shows no tool selection strategy, no execution of any tools, and no integration of tool outputs. However, the final answer itself is factually correct and well-structured, explaining the spelling difference, IUPAC conventions, and regional usage patterns. This demonstrates subject knowledge but represents a fundamental misunderstanding of the task requirements. The agent needed to search for sources, potentially crawl relevant websites, and synthesize findings from multiple tools to properly complete this task. The complete absence of tool usage when tools were explicitly available and expected makes this a clear failure in the context of tool-assisted problem-solving.", "grading_success": true, "grading_error": null, "grading_duration": 0.0, "reasoning_steps": [], "original_query": "Differences Between Aluminum and Aluminium Explainedx", "final_answer": "The difference between \"aluminum\" and \"aluminium\" is primarily one of **spelling convention rather than any intrinsic chemical difference**. The International Union of Pure and Applied Chemistry (IUPAC) officially adopts **\"aluminium\"** with the '-ium' suffix, aligning with other metallic elements like sodium, potassium, and magnesium. 
However, **\"aluminum\"** is the standard spelling in American English, reflecting historical spelling conventions that were prevalent in the United States during the 19th century. Both terms refer to the same element (atomic number 13), and there is **no difference in physical or chemical properties** between the element described as aluminum and that described as aluminium.\n\nWould you like a more detailed breakdown of the historical and cultural factors behind this spelling divergence?", "error_type": null, "error_message": null, "error_traceback": null, "llm_calls": 1, "llm_tokens_input": 0, "llm_tokens_output": 0, "llm_timeouts": 0, "llm_duration": 4.259281158447266, "tool_call_details": [], "conversation_messages": [{"role": "user", "content": "Differences Between Aluminum and Aluminium Explainedx"}, {"role": "assistant", "content": "The difference between \"aluminum\" and \"aluminium\" is primarily one of **spelling convention rather than any intrinsic chemical difference**. The International Union of Pure and Applied Chemistry (IUPAC) officially adopts **\"aluminium\"** with the '-ium' suffix, aligning with other metallic elements like sodium, potassium, and magnesium. However, **\"aluminum\"** is the standard spelling in American English, reflecting historical spelling conventions that were prevalent in the United States during the 19th century. Both terms refer to the same element (atomic number 13), and there is **no difference in physical or chemical properties** between the element described as aluminum and that described as aluminium.\n\nWould you like a more detailed breakdown of the historical and cultural factors behind this spelling divergence?"}], "response_length": 828, "tools": null}
2+
{"step": 0, "prompt_id": 1, "generation_id": 4, "status": "success", "start_time": 1755699131.2445304, "end_time": 1755699161.9821692, "total_duration": 30.73763871192932, "conversation_turns": 1, "tool_calls_count": 0, "tool_calls_success": 0, "tool_calls_failed": 0, "final_score": 0.15, "grading_details": "The agent completely failed to use any available tools despite having access to relevant search capabilities. When presented with the ambiguous query 'Man with Timid Sword Reluctantly Pulling Away,' the agent did not attempt any web searches, scholarly searches, or URL crawling to gather information about this phrase's potential meanings, origins, or cultural significance. Instead, it immediately defaulted to asking clarifying questions without first exploring what information might be available. The agent showed good judgment in recognizing the phrase could have multiple interpretations (artistic, literary, psychological), but this recognition should have prompted tool usage to gather context. A competent approach would have involved using batch_web_search to explore different interpretations, possibly scholar_search for literary or cultural analysis, and then synthesizing findings before requesting clarification. The complete absence of tool usage represents a fundamental failure in the task execution, though the agent's interpretive thinking about the phrase shows some analytical capability.", "grading_success": true, "grading_error": null, "grading_duration": 0.0, "reasoning_steps": [], "original_query": "Man with Timid Sword Reluctantly Pulling Away", "final_answer": "I recognize this phrase as a powerful visual metaphor that could represent various emotional or psychological states - from withdrawal from conflict to hesitation about taking action. 
To help you better, could you clarify what you're looking for?\n\nAre you wanting:\n- **Art/image creation** to visualize this scene?\n- **Literary analysis** of this concept?\n- **Historical/cultural context** about similar motifs?\n- **Psychological interpretation** of what this represents?\n- **Something else entirely**?\n\nIf you're looking for visual art, I can search for similar artistic representations or help create something inspired by this evocative phrase.", "error_type": null, "error_message": null, "error_traceback": null, "llm_calls": 1, "llm_tokens_input": 0, "llm_tokens_output": 0, "llm_timeouts": 0, "llm_duration": 3.486870288848877, "tool_call_details": [], "conversation_messages": [{"role": "user", "content": "Man with Timid Sword Reluctantly Pulling Away"}, {"role": "assistant", "content": "I recognize this phrase as a powerful visual metaphor that could represent various emotional or psychological states - from withdrawal from conflict to hesitation about taking action. To help you better, could you clarify what you're looking for?\n\nAre you wanting:\n- **Art/image creation** to visualize this scene?\n- **Literary analysis** of this concept?\n- **Historical/cultural context** about similar motifs?\n- **Psychological interpretation** of what this represents?\n- **Something else entirely**?\n\nIf you're looking for visual art, I can search for similar artistic representations or help create something inspired by this evocative phrase."}], "response_length": 647, "tools": null}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from eval_protocol.pytest import evaluation_test
2+
import os
3+
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata
4+
from typing import List
5+
from typing import Dict, Any
6+
7+
8+
def jsonl_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert parsed JSONL log records into ``EvaluationRow`` objects.

    For every record the conversation is taken from ``conversation_messages``,
    the row id is the stringified ``prompt_id``, and the already-computed
    ``final_score`` becomes the score of the attached ``EvaluateResult``.

    Args:
        rows: Parsed JSONL records, one dict per line of the source file.

    Returns:
        One ``EvaluationRow`` per input record, in input order.

    Raises:
        KeyError: If a record lacks ``conversation_messages``, ``prompt_id``
            or ``final_score``.
    """
    return [
        EvaluationRow(
            messages=record["conversation_messages"],
            input_metadata=InputMetadata(row_id=str(record["prompt_id"])),
            evaluation_result=EvaluateResult(score=record["final_score"]),
        )
        for record in rows
    ]
21+
22+
23+
@evaluation_test(
    input_dataset=[
        os.path.join(os.path.dirname(__file__), "data", "import_data.jsonl"),
    ],
    dataset_adapter=jsonl_dataset_adapter,
)
def test_import_logs(row: EvaluationRow) -> EvaluationRow:
    """Smoke-test importing external logs from an arbitrary JSONL file.

    The decorator receives only ``input_dataset`` (a path next to this test
    module) and a ``dataset_adapter`` function. The body hands the row back
    unchanged, so it is the existence of this test that guards the import
    path.
    """
    return row

0 commit comments

Comments
 (0)