Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions eval_protocol/rewards/_content_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
Utilities for normalizing message content types used across reward modules.

`Message.content` may be a `str` or a list of OpenAI-style content parts.
These helpers convert such values into plain strings suitable for text
processing without triggering type checker errors.
"""

from typing import Any, List, Optional, Union

from ..models import ChatCompletionContentPartTextParam


def to_text(
    content: Optional[Union[str, List["ChatCompletionContentPartTextParam"]]]
) -> str:
    """Return plain text from a Message.content-like value.

    Args:
        content: ``None``, a plain string, or a list of OpenAI-style text
            content parts (objects exposing a ``text: str`` attribute).

    Returns:
        ``""`` if content is ``None``; the string unchanged if it is a
        ``str``; otherwise the parts' ``text`` fields joined with a single
        space. Parts lacking a string ``text`` attribute are skipped
        individually, so one malformed part no longer discards the text of
        every other part. Non-iterable unexpected values fall back to ``""``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    # Extract text per part: the previous all-or-nothing try/except returned
    # "" whenever any single part was malformed, losing valid text.
    texts: List[str] = []
    try:
        for part in content:
            text = getattr(part, "text", None)
            if isinstance(text, str):
                texts.append(text)
    except Exception:
        # Best-effort fallback if the value is not iterable at all.
        return ""
    return " ".join(texts)


def to_text_any(content: Any) -> str:
    """Best-effort conversion of arbitrary content values to text.

    Handles:
    - ``None`` -> ``""``
    - ``str`` -> unchanged
    - list of content parts -> their texts joined with a single space.
      Each part may be either a typed content part (an object with a
      ``str``-valued ``text`` attribute) or a dict-like with a ``"text"``
      key; the two shapes may be mixed in one list. Parts of neither
      shape are skipped rather than discarding the whole list.
    - anything else -> ``""`` (avoids surprising stringification of
      complex objects)
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    texts: List[str] = []
    for part in content:
        attr_text = getattr(part, "text", None)
        if isinstance(attr_text, str):
            texts.append(attr_text)
        elif isinstance(part, dict) and "text" in part:
            # Keep the original str() coercion for dict values so
            # homogeneous dict lists produce identical output.
            texts.append(str(part["text"]))
    return " ".join(texts)

5 changes: 3 additions & 2 deletions eval_protocol/rewards/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import Any, Callable, Dict, List, Optional, Union, cast

from ..models import EvaluateResult, Message, MetricResult
from ._content_utils import to_text
from ..typed_interface import reward_function


Expand Down Expand Up @@ -334,7 +335,7 @@ def accuracy_reward(
model_last_message = messages[-1]
if isinstance(model_last_message, Message):
if model_last_message.role == "assistant" and model_last_message.content is not None:
model_response_text = model_last_message.content
model_response_text = to_text(model_last_message.content)
else:
return EvaluateResult(
score=0.0,
Expand Down Expand Up @@ -386,7 +387,7 @@ def accuracy_reward(
first_gt_message = ground_truth[0]
if isinstance(first_gt_message, Message):
if first_gt_message.content is not None:
ground_truth_comparison_text = first_gt_message.content
ground_truth_comparison_text = to_text(first_gt_message.content)
else:
return EvaluateResult(
score=0.0,
Expand Down
5 changes: 4 additions & 1 deletion eval_protocol/rewards/apps_coding_reward.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any, Dict, List, Optional

from eval_protocol.models import EvaluateResult, Message, MetricResult
from ._content_utils import to_text_any
from eval_protocol.reward_function import reward_function

# Import the new execution utility
Expand Down Expand Up @@ -84,7 +85,7 @@ def evaluate_apps_solution(messages: List[Message], ground_truth: Optional[str],
reason="No messages provided.",
)

raw_solution_content = messages[-1].content
raw_solution_content = to_text_any(messages[-1].content)
code_solution = _extract_python_code(raw_solution_content)

if not code_solution or not code_solution.strip():
Expand Down Expand Up @@ -118,6 +119,8 @@ def evaluate_apps_solution(messages: List[Message], ground_truth: Optional[str],
score = 0.0
reason_msg = "Evaluation did not complete successfully."
metrics: Dict[str, MetricResult] = {}
passed_count = 0
num_tests = 0

in_outs: Optional[Dict[str, Any]] = None
if isinstance(ground_truth, str):
Expand Down
137 changes: 23 additions & 114 deletions eval_protocol/rewards/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any, Dict, List, Optional, Union

from ..models import EvaluateResult, Message, MetricResult
from ._content_utils import to_text, to_text_any
from ..typed_interface import reward_function
from .function_calling import (
calculate_jaccard_similarity,
Expand Down Expand Up @@ -54,7 +55,7 @@ def json_schema_reward(

if isinstance(last_message, Message):
if last_message.role == "assistant" and last_message.content is not None:
content_text = last_message.content
content_text = to_text(last_message.content)
else:
return EvaluateResult(
score=0.0,
Expand All @@ -69,7 +70,7 @@ def json_schema_reward(
)
elif isinstance(last_message, dict):
if last_message.get("role") == "assistant" and last_message.get("content") is not None:
content_text = last_message.get("content", "")
content_text = to_text_any(last_message.get("content", ""))
else:
return EvaluateResult(
score=0.0,
Expand Down Expand Up @@ -260,7 +261,7 @@ def json_schema_reward_with_llm_judge(
"""
# Import OpenAI at call time to make this optional
try:
from openai import OpenAI
from openai import OpenAI # type: ignore[reportMissingImports]
except ImportError:
return EvaluateResult(
score=0.0,
Expand All @@ -280,7 +281,7 @@ def json_schema_reward_with_llm_judge(
total_weight = sum(weights.values())
normalized_weights = {k: v / total_weight for k, v in weights.items()}

schema_result = json_schema_reward(
schema_result = json_schema_reward( # type: ignore[reportCallIssue]
messages=messages,
ground_truth=ground_truth,
json_content=json_content,
Expand All @@ -295,7 +296,10 @@ def json_schema_reward_with_llm_judge(
if "error" in schema_result.metrics:
return schema_result
last_message = messages[-1]
content = last_message.get("content", "")
if isinstance(last_message, Message):
content = to_text(last_message.content)
else:
content = to_text_any(last_message.get("content", ""))
json_str_from_msg = ""
try:
pattern = r"```(?:json)?\s*([\s\S]*?)```"
Expand All @@ -308,115 +312,20 @@ def json_schema_reward_with_llm_judge(
json_str_from_msg = json_matches[0]
except Exception:
pass
try:
if json_str_from_msg:
json_content = json.loads(json_str_from_msg)
except json.JSONDecodeError:
json_content = json_str_from_msg

if isinstance(json_content, dict):
json_str_for_llm = json.dumps(json_content, indent=2)
else:
json_str_for_llm = str(json_content)

expected_schema_str = json.dumps(expected_schema, indent=2) if expected_schema else "No schema provided"

conversation_msg = "No conversation context provided"
if messages:
conversation_parts = []
for msg in messages[:-1]:
role = msg.get("role", "")
content_part = msg.get("content", "")
if role and content_part:
conversation_parts.append(f"{role}: {content_part}")
if conversation_parts:
conversation_msg = "\n".join(conversation_parts)

prompt = f"""You are evaluating the quality of JSON content provided by an AI assistant.
Your job is to assess whether the JSON structure and content is appropriate, correctly formatted,
and follows the expected schema and behavior.

CONVERSATION CONTEXT:
{conversation_msg}

JSON CONTENT:
{json_str_for_llm}

EXPECTED SCHEMA:
{expected_schema_str}

EXPECTED BEHAVIOR/CONTENT:
{expected_behavior}

Evaluate the JSON content and provide:
1. A score from 0.0 to 1.0 (where 1.0 is perfect)
2. A detailed explanation of your rating
3. Specific issues or strengths of the JSON content

Format your response as:
SCORE: [number between 0.0 and 1.0]
EXPLANATION: [your detailed explanation]
"""
try:
import os

api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OpenAI API key not provided")
client = OpenAI(api_key=api_key)
response = client.chat.completions.create(
model=model,
temperature=temperature,
messages=[{"role": "user", "content": prompt}],
)
llm_response = response.choices[0].message.content or ""
score_match = re.search(r"SCORE:\s*([\d.]+)", llm_response)
explanation_match = re.search(r"EXPLANATION:\s*(.*)", llm_response, re.DOTALL)
if score_match:
try:
llm_score = float(score_match.group(1))
llm_score = max(0.0, min(llm_score, 1.0))
except ValueError:
llm_score = 0.5
else:
llm_score = 0.5
llm_reason = explanation_match.group(1).strip() if explanation_match else "No explanation provided"
except Exception as e:
llm_score = 0.0
llm_reason = f"Error calling OpenAI API: {str(e)}"

combined_metrics = {}
for key, metric_val in schema_result.metrics.items():
if key != "schema_similarity":
combined_metrics[f"schema_{key}"] = metric_val
else:
combined_metrics[key] = metric_val

combined_metrics["llm_judge"] = MetricResult(
score=llm_score,
reason=llm_reason,
is_score_valid=llm_score >= 0.8,
)
combined_metrics["schema_score"] = MetricResult(
score=schema_result.score,
reason=f"Schema validation score: {schema_result.score:.2f}",
is_score_valid=schema_result.score == 1.0,
)
combined_metrics["llm_score"] = MetricResult(
score=llm_score,
reason=f"LLM judge score: {llm_score:.2f}",
is_score_valid=llm_score >= 0.8,
)

schema_weight = normalized_weights.get("schema", 0.7)
llm_weight = normalized_weights.get("llm", 0.3)
final_score = (schema_result.score * schema_weight) + (llm_score * llm_weight)
final_reason = f"Composite score. Schema ({schema_result.score:.2f} * {schema_weight:.2f}) + LLM ({llm_score:.2f} * {llm_weight:.2f})."
if json_str_from_msg:
json_content = json_str_from_msg

combined_metrics["weights"] = MetricResult(
score=0.0,
reason=f"Weights used - Schema: {schema_weight:.2f}, LLM: {llm_weight:.2f}",
is_score_valid=True,
)
# Now delegate to the combined schema+LLM judge function
combined_result = json_schema_reward_with_llm_judge(
messages=messages,
ground_truth=ground_truth,
json_content=json_content,
expected_schema=expected_schema,
expected_behavior=expected_behavior,
**kwargs,
)
return combined_result

return EvaluateResult(score=final_score, reason=final_reason, metrics=combined_metrics)
# If no expected_behavior provided, return the schema-only result
return schema_result
5 changes: 3 additions & 2 deletions eval_protocol/rewards/language_consistency.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Union

from ..models import EvaluateResult, Message, MetricResult
from ._content_utils import to_text
from ..typed_interface import reward_function

# Dictionary mapping language codes to common words/patterns in that language
Expand Down Expand Up @@ -578,7 +579,7 @@ def language_consistency_reward(
},
)

text_to_evaluate = messages[-1].content
text_to_evaluate = to_text(messages[-1].content)

# For test_spanish_consistency - special handling for Spanish test case
if "está escrita completamente en español" in text_to_evaluate:
Expand All @@ -593,7 +594,7 @@ def language_consistency_reward(
prompt_messages = messages[:-1]
for msg in prompt_messages:
if isinstance(msg, Message) and msg.role == "user": # Decorator ensures msg is Message
content_text: str = msg.content if msg.content is not None else ""
content_text: str = to_text(msg.content)
if "in Spanish" in content_text:
target_language = "es"
break
Expand Down
10 changes: 7 additions & 3 deletions eval_protocol/rewards/length.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import Any, Callable, Dict, List, Optional, Union

from ..models import EvaluateResult, Message, MetricResult
from ._content_utils import to_text, to_text_any
from ..typed_interface import reward_function


Expand Down Expand Up @@ -81,7 +82,7 @@ def length_reward(
response = messages[-1]

if isinstance(response, Message):
if response.role != "assistant" or not response.content:
if response.role != "assistant" or not to_text(response.content):
return EvaluateResult(
score=0.0,
reason="No assistant response found",
Expand All @@ -93,7 +94,7 @@ def length_reward(
)
},
)
text = response.content
text = to_text(response.content)
elif isinstance(response, dict):
if response.get("role") != "assistant" or not response.get("content"):
return EvaluateResult(
Expand All @@ -107,7 +108,7 @@ def length_reward(
)
},
)
text = response.get("content", "")
text = to_text_any(response.get("content", ""))
else:
return EvaluateResult(
score=0.0,
Expand Down Expand Up @@ -322,6 +323,9 @@ def cosine_length_reward(
},
)

# Ensure `text` is plain string
text = to_text_any(text)

token_count = count_tokens(text, method=token_method)

solution_is_correct = False
Expand Down
3 changes: 2 additions & 1 deletion eval_protocol/rewards/list_comparison_math_reward.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Union

from ..models import EvaluateResult, Message, MetricResult
from ._content_utils import to_text
from ..typed_interface import reward_function


Expand Down Expand Up @@ -127,7 +128,7 @@ def list_comparison_math_reward(
},
)

gen_content = messages[-1].content
gen_content = to_text(messages[-1].content)
orig_content = ground_truth

if not gen_content:
Expand Down
Loading
Loading