2020from pathlib import Path
2121from typing import Any , Dict , List
2222import asyncio
23+ import pytest
2324
2425import litellm
2526from pydantic import BaseModel
2627
27- from eval_protocol .models import EvaluateResult , EvaluationRow , InputMetadata , Message , MetricResult
28+ from eval_protocol .models import EvaluateResult , EvaluationRow
2829from eval_protocol .pytest import evaluation_test
2930from eval_protocol .pytest .remote_rollout_processor import RemoteRolloutProcessor
3031
@@ -38,60 +39,6 @@ class SVGBenchResponse(BaseModel):
3839 number_of_fulfilled_requirements : int
3940
4041
class IntentMatchingResponse(BaseModel):
    """Structured response returned by the intent matching evaluation."""

    intent_reasoning: str
    # Score in the 0-1 range: does the content match the intended purpose?
    intent_matching_score: float
46-
47-
def svgbench_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Build one EvaluationRow per SVGBench dataset entry.

    Each entry's prompt is extended with an instruction to wrap the SVG in a
    code block, an example SVG, and a numbered list of the entry's
    requirements. The resulting text becomes the single user message of the
    row, and the source fields are kept in the row's dataset_info.

    Args:
        data: Dataset entries; each one is read via its "prompt" and
            "requirements" keys.

    Returns:
        EvaluationRow objects in input order, with row ids "row_0", "row_1", ...
    """

    def _build_row(index: int, entry: Dict[str, Any]) -> EvaluationRow:
        # Number the requirements starting from 1, one per line.
        reqs = entry["requirements"]
        numbered = "\n".join(f"{n}. {req}" for n, req in enumerate(reqs, start=1))

        # Generation prompt in the SVGBench format: task text, example, requirements.
        base_prompt = entry["prompt"]
        formatted = f"""{base_prompt} Wrap the SVG code in an SVG code block following the example below.

Example:
```svg
<svg viewBox="0 0 100 100" width="100" height="100">
<circle cx="50" cy="50" r="40" fill="red" />
</svg>
```

Requirements:
{numbered}"""

        metadata = InputMetadata(
            row_id=f"row_{index}",
            dataset_info={
                "original_prompt": base_prompt,
                "requirements": reqs,
                "total_requirements": len(reqs),
                "formatted_prompt": formatted,
            },
        )
        return EvaluationRow(
            messages=[Message(role="user", content=formatted)],
            input_metadata=metadata,
        )

    return [_build_row(index, entry) for index, entry in enumerate(data)]
93-
94-
9542async def evaluate_with_llm_judge (image_path : str , requirements : List [str ]) -> Dict [str , Any ]:
9643 """
9744 Use LLM judge to evaluate how many requirements are fulfilled.
@@ -161,9 +108,9 @@ async def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> D
161108 raise ValueError ("Missing required field in response" )
162109
163110
111+ @pytest .mark .skip (reason = "Skipping SVG generation evaluation test" )
164112@evaluation_test (
165113 input_dataset = [str (Path (__file__ ).parent / "svgbench_dataset.jsonl" )],
166- dataset_adapter = svgbench_to_evaluation_row ,
167114 completion_params = [
168115 {
169116 "temperature" : 0.8 ,
0 commit comments