Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,10 +422,10 @@ class InputMetadata(BaseModel):
default_factory=dict, description="Completion endpoint parameters used"
)
dataset_info: Optional[Dict[str, Any]] = Field(
None, description="Dataset row details: seed, system_prompt, environment_context, etc"
default=None, description="Dataset row details: seed, system_prompt, environment_context, etc"
)
session_data: Optional[Dict[str, Any]] = Field(
None, description="Session metadata like timestamp (input only, no duration/usage)"
default=None, description="Session metadata like timestamp (input only, no duration/usage)"
)


Expand Down
4 changes: 1 addition & 3 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def postprocess(

def evaluation_test( # noqa: C901
*,
completion_params: List[CompletionParams],
completion_params: List[CompletionParams] = [None],
input_messages: Optional[List[InputMessagesParam]] = None,
input_dataset: Optional[List[DatasetPathParam]] = None,
input_rows: Optional[List[EvaluationRow]] = None,
Expand Down Expand Up @@ -359,8 +359,6 @@ def decorator(
threshold = None

sig = inspect.signature(test_func)
if not completion_params:
raise ValueError("completion_params is required")

# For pointwise/groupwise mode, we expect a different signature
# we expect single row to be passed in as the original row
Expand Down
2 changes: 2 additions & 0 deletions tests/pytest/data/import_data.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"step": 0, "prompt_id": 0, "generation_id": 4, "status": "success", "start_time": 1755699131.2267165, "end_time": 1755699159.6630864, "total_duration": 28.43636989593506, "conversation_turns": 1, "tool_calls_count": 0, "tool_calls_success": 0, "tool_calls_failed": 0, "final_score": 0.05, "grading_details": "The agent completely failed to utilize any of the available tools despite having relevant options like batch_web_search and scholar_search that could have provided authoritative sources on the aluminum/aluminium distinction. The answer provided appears to be based on pre-existing knowledge rather than tool-assisted research, which directly contradicts the task requirement of evaluating a 'tool-assisted problem-solving process.' The agent shows no tool selection strategy, no execution of any tools, and no integration of tool outputs. However, the final answer itself is factually correct and well-structured, explaining the spelling difference, IUPAC conventions, and regional usage patterns. This demonstrates subject knowledge but represents a fundamental misunderstanding of the task requirements. The agent needed to search for sources, potentially crawl relevant websites, and synthesize findings from multiple tools to properly complete this task. The complete absence of tool usage when tools were explicitly available and expected makes this a clear failure in the context of tool-assisted problem-solving.", "grading_success": true, "grading_error": null, "grading_duration": 0.0, "reasoning_steps": [], "original_query": "Differences Between Aluminum and Aluminium Explainedx", "final_answer": "The difference between \"aluminum\" and \"aluminium\" is primarily one of **spelling convention rather than any intrinsic chemical difference**. The International Union of Pure and Applied Chemistry (IUPAC) officially adopts **\"aluminium\"** with the '-ium' suffix, aligning with other metallic elements like sodium, potassium, and magnesium. 
However, **\"aluminum\"** is the standard spelling in American English, reflecting historical spelling conventions that were prevalent in the United States during the 19th century. Both terms refer to the same element (atomic number 13), and there is **no difference in physical or chemical properties** between the element described as aluminum and that described as aluminium.\n\nWould you like a more detailed breakdown of the historical and cultural factors behind this spelling divergence?", "error_type": null, "error_message": null, "error_traceback": null, "llm_calls": 1, "llm_tokens_input": 0, "llm_tokens_output": 0, "llm_timeouts": 0, "llm_duration": 4.259281158447266, "tool_call_details": [], "conversation_messages": [{"role": "user", "content": "Differences Between Aluminum and Aluminium Explainedx"}, {"role": "assistant", "content": "The difference between \"aluminum\" and \"aluminium\" is primarily one of **spelling convention rather than any intrinsic chemical difference**. The International Union of Pure and Applied Chemistry (IUPAC) officially adopts **\"aluminium\"** with the '-ium' suffix, aligning with other metallic elements like sodium, potassium, and magnesium. However, **\"aluminum\"** is the standard spelling in American English, reflecting historical spelling conventions that were prevalent in the United States during the 19th century. Both terms refer to the same element (atomic number 13), and there is **no difference in physical or chemical properties** between the element described as aluminum and that described as aluminium.\n\nWould you like a more detailed breakdown of the historical and cultural factors behind this spelling divergence?"}], "response_length": 828, "tools": null}
{"step": 0, "prompt_id": 1, "generation_id": 4, "status": "success", "start_time": 1755699131.2445304, "end_time": 1755699161.9821692, "total_duration": 30.73763871192932, "conversation_turns": 1, "tool_calls_count": 0, "tool_calls_success": 0, "tool_calls_failed": 0, "final_score": 0.15, "grading_details": "The agent completely failed to use any available tools despite having access to relevant search capabilities. When presented with the ambiguous query 'Man with Timid Sword Reluctantly Pulling Away,' the agent did not attempt any web searches, scholarly searches, or URL crawling to gather information about this phrase's potential meanings, origins, or cultural significance. Instead, it immediately defaulted to asking clarifying questions without first exploring what information might be available. The agent showed good judgment in recognizing the phrase could have multiple interpretations (artistic, literary, psychological), but this recognition should have prompted tool usage to gather context. A competent approach would have involved using batch_web_search to explore different interpretations, possibly scholar_search for literary or cultural analysis, and then synthesizing findings before requesting clarification. The complete absence of tool usage represents a fundamental failure in the task execution, though the agent's interpretive thinking about the phrase shows some analytical capability.", "grading_success": true, "grading_error": null, "grading_duration": 0.0, "reasoning_steps": [], "original_query": "Man with Timid Sword Reluctantly Pulling Away", "final_answer": "I recognize this phrase as a powerful visual metaphor that could represent various emotional or psychological states - from withdrawal from conflict to hesitation about taking action. 
To help you better, could you clarify what you're looking for?\n\nAre you wanting:\n- **Art/image creation** to visualize this scene?\n- **Literary analysis** of this concept?\n- **Historical/cultural context** about similar motifs?\n- **Psychological interpretation** of what this represents?\n- **Something else entirely**?\n\nIf you're looking for visual art, I can search for similar artistic representations or help create something inspired by this evocative phrase.", "error_type": null, "error_message": null, "error_traceback": null, "llm_calls": 1, "llm_tokens_input": 0, "llm_tokens_output": 0, "llm_timeouts": 0, "llm_duration": 3.486870288848877, "tool_call_details": [], "conversation_messages": [{"role": "user", "content": "Man with Timid Sword Reluctantly Pulling Away"}, {"role": "assistant", "content": "I recognize this phrase as a powerful visual metaphor that could represent various emotional or psychological states - from withdrawal from conflict to hesitation about taking action. To help you better, could you clarify what you're looking for?\n\nAre you wanting:\n- **Art/image creation** to visualize this scene?\n- **Literary analysis** of this concept?\n- **Historical/cultural context** about similar motifs?\n- **Psychological interpretation** of what this represents?\n- **Something else entirely**?\n\nIf you're looking for visual art, I can search for similar artistic representations or help create something inspired by this evocative phrase."}], "response_length": 647, "tools": null}
33 changes: 33 additions & 0 deletions tests/pytest/test_pytest_import_external_logs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from eval_protocol.pytest import evaluation_test
import os
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata
from typing import List
from typing import Dict, Any


def jsonl_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert raw JSONL log records into ``EvaluationRow`` objects.

    Every record is read through three keys: ``conversation_messages``
    becomes the row's messages, ``prompt_id`` is stringified and used as
    the ``row_id`` of the row's ``InputMetadata``, and ``final_score``
    becomes the score of the attached ``EvaluateResult``. All other keys
    in a record are ignored.

    Args:
        rows: Parsed JSONL records, one dict per line of the log file.

    Returns:
        One ``EvaluationRow`` per input record, in input order.

    Raises:
        KeyError: If a record lacks one of the three keys read above.
    """
    return [
        EvaluationRow(
            messages=record["conversation_messages"],
            input_metadata=InputMetadata(row_id=str(record["prompt_id"])),
            evaluation_result=EvaluateResult(score=record["final_score"]),
        )
        for record in rows
    ]


@evaluation_test(
    input_dataset=[
        os.path.join(os.path.dirname(__file__), "data", "import_data.jsonl"),
    ],
    dataset_adapter=jsonl_dataset_adapter,
)
def test_import_logs(row: EvaluationRow) -> EvaluationRow:
    """Smoke-test importing externally produced logs.

    The decorator is configured with nothing but a path to a JSONL file
    (``input_dataset``) and a ``dataset_adapter`` that converts its
    records; no ``completion_params`` are supplied. The test body hands
    each row back unchanged, so the test's mere existence is what checks
    that arbitrary JSONL logs can be imported this way.
    """
    return row
Loading