Skip to content

Commit 91d627d

Browse files
committed
adjust test
1 parent f1f1355 commit 91d627d

File tree

2 files changed

+4
-56
lines changed

2 files changed

+4
-56
lines changed

eval_protocol/quickstart/svg_agent/evaluator/test_livesvgbench.py renamed to eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py

Lines changed: 3 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@
2020
from pathlib import Path
2121
from typing import Any, Dict, List
2222
import asyncio
23+
import pytest
2324

2425
import litellm
2526
from pydantic import BaseModel
2627

27-
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult
28+
from eval_protocol.models import EvaluateResult, EvaluationRow
2829
from eval_protocol.pytest import evaluation_test
2930
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
3031

@@ -38,60 +39,6 @@ class SVGBenchResponse(BaseModel):
3839
number_of_fulfilled_requirements: int
3940

4041

41-
class IntentMatchingResponse(BaseModel):
42-
"""Response structure for intent matching evaluation."""
43-
44-
intent_reasoning: str
45-
intent_matching_score: float # 0-1: Does the content match the intended purpose?
46-
47-
48-
def svgbench_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
49-
"""
50-
Convert SVGBench dataset entries to EvaluationRow objects.
51-
52-
Args:
53-
data: List of dictionaries containing prompt and requirements
54-
55-
Returns:
56-
List of EvaluationRow objects
57-
"""
58-
rows = []
59-
60-
for i, row in enumerate(data):
61-
# Format requirements as numbered list
62-
requirements = "\n".join([f"{i + 1}. {req}" for i, req in enumerate(row["requirements"])])
63-
64-
# Create the generation prompt following SVGBench format
65-
prompt = f"""{row["prompt"]} Wrap the SVG code in an SVG code block following the example below.
66-
67-
Example:
68-
```svg
69-
<svg viewBox="0 0 100 100" width="100" height="100">
70-
<circle cx="50" cy="50" r="40" fill="red" />
71-
</svg>
72-
```
73-
74-
Requirements:
75-
{requirements}"""
76-
77-
eval_row = EvaluationRow(
78-
messages=[Message(role="user", content=prompt)],
79-
input_metadata=InputMetadata(
80-
row_id=f"row_{i}",
81-
dataset_info={
82-
"original_prompt": row["prompt"],
83-
"requirements": row["requirements"],
84-
"total_requirements": len(row["requirements"]),
85-
"formatted_prompt": prompt,
86-
},
87-
),
88-
)
89-
90-
rows.append(eval_row)
91-
92-
return rows
93-
94-
9542
async def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[str, Any]:
9643
"""
9744
Use LLM judge to evaluate how many requirements are fulfilled.
@@ -161,9 +108,9 @@ async def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> D
161108
raise ValueError("Missing required field in response")
162109

163110

111+
@pytest.mark.skip(reason="Skipping SVG generation evaluation test")
164112
@evaluation_test(
165113
input_dataset=[str(Path(__file__).parent / "svgbench_dataset.jsonl")],
166-
dataset_adapter=svgbench_to_evaluation_row,
167114
completion_params=[
168115
{
169116
"temperature": 0.8,

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ markers =
33
asyncio
44
asyncio_mode = auto
55
asyncio_default_fixture_loop_scope = function
6+
testpaths = tests ./eval_protocol/quickstart
67
python_files = test_*.py llm_judge_*.py
78
python_classes = Test*
89
python_functions = test_*

0 commit comments

Comments
 (0)