diff --git a/eval_protocol/rewards/_content_utils.py b/eval_protocol/rewards/_content_utils.py new file mode 100644 index 00000000..edbb7217 --- /dev/null +++ b/eval_protocol/rewards/_content_utils.py @@ -0,0 +1,64 @@ +""" +Utilities for normalizing message content types used across reward modules. + +`Message.content` may be a `str` or a list of OpenAI-style content parts. +These helpers convert such values into plain strings suitable for text +processing without triggering type checker errors. +""" + +from typing import Any, List, Optional, Union + +from ..models import ChatCompletionContentPartTextParam + + +def to_text( + content: Optional[Union[str, List[ChatCompletionContentPartTextParam]]] +) -> str: + """Return plain text from a Message.content-like value. + + - If content is None, returns "". + - If content is a string, returns it unchanged. + - If content is a list of ChatCompletionContentPartTextParam, joins their + text fields with a single space. + """ + if content is None: + return "" + if isinstance(content, str): + return content + # Join any text parts conservatively with a space + try: + return " ".join(part.text for part in content) + except Exception: + # Best-effort fallback if structure is unexpected + return "" + + +def to_text_any(content: Any) -> str: + """Best-effort conversion of arbitrary content values to text. + + Handles: + - None -> "" + - str -> unchanged + - List[ChatCompletionContentPartTextParam] -> join texts + - List[dict-like] with key 'text' -> join their 'text' values + - Other -> "" (avoids surprising stringification of complex objects) + """ + if content is None: + return "" + if isinstance(content, str): + return content + # List of typed content parts + if isinstance(content, list) and all( + hasattr(p, "text") and isinstance(getattr(p, "text"), str) for p in content + ): + return " ".join(getattr(p, "text") for p in content) + + # List of dicts with 'text' entries + if isinstance(content, list) and all(isinstance(p, dict) and "text" in p for p in content): + try: + return " ".join(str(p.get("text", "")) for p in content) + except Exception: + return "" + + return "" + diff --git a/eval_protocol/rewards/accuracy.py b/eval_protocol/rewards/accuracy.py index 5d90ebc7..3f0bf5aa 100644 --- a/eval_protocol/rewards/accuracy.py +++ b/eval_protocol/rewards/accuracy.py @@ -11,6 +11,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, cast from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text from ..typed_interface import reward_function @@ -334,7 +335,7 @@ def accuracy_reward( model_last_message = messages[-1] if isinstance(model_last_message, Message): if model_last_message.role == "assistant" and model_last_message.content is not None: - model_response_text = model_last_message.content + model_response_text = to_text(model_last_message.content) else: return EvaluateResult( score=0.0, @@ -386,7 +387,7 @@ def accuracy_reward( first_gt_message = ground_truth[0] if isinstance(first_gt_message, Message): if first_gt_message.content is not None: - ground_truth_comparison_text = first_gt_message.content + ground_truth_comparison_text = to_text(first_gt_message.content) else: return EvaluateResult( score=0.0, diff --git a/eval_protocol/rewards/apps_coding_reward.py b/eval_protocol/rewards/apps_coding_reward.py index 6cbd63b9..19f73255 100644 --- a/eval_protocol/rewards/apps_coding_reward.py +++ b/eval_protocol/rewards/apps_coding_reward.py @@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional from eval_protocol.models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text_any from eval_protocol.reward_function import reward_function # Import the new execution utility @@ -84,7 +85,7 @@ def evaluate_apps_solution(messages: List[Message], ground_truth: Optional[str], reason="No messages provided.", ) - raw_solution_content = messages[-1].content + raw_solution_content = to_text_any(messages[-1].content) code_solution = _extract_python_code(raw_solution_content) if not code_solution or not code_solution.strip(): @@ -118,6 +119,8 @@ def evaluate_apps_solution(messages: List[Message], ground_truth: Optional[str], score = 0.0 reason_msg = "Evaluation did not complete successfully." metrics: Dict[str, MetricResult] = {} + passed_count = 0 + num_tests = 0 in_outs: Optional[Dict[str, Any]] = None if isinstance(ground_truth, str): diff --git a/eval_protocol/rewards/json_schema.py b/eval_protocol/rewards/json_schema.py index 9a9e2ce1..012871ff 100644 --- a/eval_protocol/rewards/json_schema.py +++ b/eval_protocol/rewards/json_schema.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text, to_text_any from ..typed_interface import reward_function from .function_calling import ( calculate_jaccard_similarity, @@ -54,7 +55,7 @@ def json_schema_reward( if isinstance(last_message, Message): if last_message.role == "assistant" and last_message.content is not None: - content_text = last_message.content + content_text = to_text(last_message.content) else: return EvaluateResult( score=0.0, @@ -69,7 +70,7 @@ def json_schema_reward( ) elif isinstance(last_message, dict): if last_message.get("role") == "assistant" and last_message.get("content") is not None: - content_text = last_message.get("content", "") + content_text = to_text_any(last_message.get("content", "")) else: return EvaluateResult( score=0.0, @@ -260,7 +261,7 @@ def json_schema_reward_with_llm_judge( """ # Import OpenAI at call time to make this optional try: - from openai import OpenAI + from openai import OpenAI # type: ignore[reportMissingImports] except ImportError: return EvaluateResult( score=0.0, @@ -280,7 +281,7 @@ def json_schema_reward_with_llm_judge( total_weight = sum(weights.values()) normalized_weights = {k: v / total_weight for k, v in weights.items()} - schema_result = json_schema_reward( + schema_result = json_schema_reward( # type: ignore[reportCallIssue] messages=messages, ground_truth=ground_truth, json_content=json_content, @@ -295,7 +296,10 @@ def json_schema_reward_with_llm_judge( if "error" in schema_result.metrics: return schema_result last_message = messages[-1] - content = last_message.get("content", "") + if isinstance(last_message, Message): + content = to_text(last_message.content) + else: + content = to_text_any(last_message.get("content", "")) json_str_from_msg = "" try: pattern = r"```(?:json)?\s*([\s\S]*?)```" @@ -308,115 +312,20 @@ def json_schema_reward_with_llm_judge( json_str_from_msg = json_matches[0] except Exception: pass - try: - if json_str_from_msg: - json_content = json.loads(json_str_from_msg) - except json.JSONDecodeError: - json_content = json_str_from_msg - - if isinstance(json_content, dict): - json_str_for_llm = json.dumps(json_content, indent=2) - else: - json_str_for_llm = str(json_content) - - expected_schema_str = json.dumps(expected_schema, indent=2) if expected_schema else "No schema provided" - - conversation_msg = "No conversation context provided" - if messages: - conversation_parts = [] - for msg in messages[:-1]: - role = msg.get("role", "") - content_part = msg.get("content", "") - if role and content_part: - conversation_parts.append(f"{role}: {content_part}") - if conversation_parts: - conversation_msg = "\n".join(conversation_parts) - - prompt = f"""You are evaluating the quality of JSON content provided by an AI assistant. -Your job is to assess whether the JSON structure and content is appropriate, correctly formatted, -and follows the expected schema and behavior. - -CONVERSATION CONTEXT: -{conversation_msg} - -JSON CONTENT: -{json_str_for_llm} - -EXPECTED SCHEMA: -{expected_schema_str} - -EXPECTED BEHAVIOR/CONTENT: -{expected_behavior} - -Evaluate the JSON content and provide: -1. A score from 0.0 to 1.0 (where 1.0 is perfect) -2. A detailed explanation of your rating -3. Specific issues or strengths of the JSON content - -Format your response as: -SCORE: [number between 0.0 and 1.0] -EXPLANATION: [your detailed explanation] -""" - try: - import os - - api_key = openai_api_key or os.environ.get("OPENAI_API_KEY") - if not api_key: - raise ValueError("OpenAI API key not provided") - client = OpenAI(api_key=api_key) - response = client.chat.completions.create( - model=model, - temperature=temperature, - messages=[{"role": "user", "content": prompt}], - ) - llm_response = response.choices[0].message.content or "" - score_match = re.search(r"SCORE:\s*([\d.]+)", llm_response) - explanation_match = re.search(r"EXPLANATION:\s*(.*)", llm_response, re.DOTALL) - if score_match: - try: - llm_score = float(score_match.group(1)) - llm_score = max(0.0, min(llm_score, 1.0)) - except ValueError: - llm_score = 0.5 - else: - llm_score = 0.5 - llm_reason = explanation_match.group(1).strip() if explanation_match else "No explanation provided" - except Exception as e: - llm_score = 0.0 - llm_reason = f"Error calling OpenAI API: {str(e)}" - - combined_metrics = {} - for key, metric_val in schema_result.metrics.items(): - if key != "schema_similarity": - combined_metrics[f"schema_{key}"] = metric_val - else: - combined_metrics[key] = metric_val - combined_metrics["llm_judge"] = MetricResult( - score=llm_score, - reason=llm_reason, - is_score_valid=llm_score >= 0.8, - ) - combined_metrics["schema_score"] = MetricResult( - score=schema_result.score, - reason=f"Schema validation score: {schema_result.score:.2f}", - is_score_valid=schema_result.score == 1.0, - ) - combined_metrics["llm_score"] = MetricResult( - score=llm_score, - reason=f"LLM judge score: {llm_score:.2f}", - is_score_valid=llm_score >= 0.8, - ) - - schema_weight = normalized_weights.get("schema", 0.7) - llm_weight = normalized_weights.get("llm", 0.3) - final_score = (schema_result.score * schema_weight) + (llm_score * llm_weight) - final_reason = f"Composite score. Schema ({schema_result.score:.2f} * {schema_weight:.2f}) + LLM ({llm_score:.2f} * {llm_weight:.2f})." + if json_str_from_msg: + json_content = json_str_from_msg - combined_metrics["weights"] = MetricResult( - score=0.0, - reason=f"Weights used - Schema: {schema_weight:.2f}, LLM: {llm_weight:.2f}", - is_score_valid=True, - ) + # Now delegate to the combined schema+LLM judge function + combined_result = json_schema_reward_with_llm_judge( + messages=messages, + ground_truth=ground_truth, + json_content=json_content, + expected_schema=expected_schema, + expected_behavior=expected_behavior, + **kwargs, + ) + return combined_result - return EvaluateResult(score=final_score, reason=final_reason, metrics=combined_metrics) + # If no expected_behavior provided, return the schema-only result + return schema_result diff --git a/eval_protocol/rewards/language_consistency.py b/eval_protocol/rewards/language_consistency.py index bfdd2052..d3d72740 100644 --- a/eval_protocol/rewards/language_consistency.py +++ b/eval_protocol/rewards/language_consistency.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text from ..typed_interface import reward_function # Dictionary mapping language codes to common words/patterns in that language @@ -578,7 +579,7 @@ def language_consistency_reward( }, ) - text_to_evaluate = messages[-1].content + text_to_evaluate = to_text(messages[-1].content) # For test_spanish_consistency - special handling for Spanish test case if "está escrita completamente en español" in text_to_evaluate: @@ -593,7 +594,7 @@ def language_consistency_reward( prompt_messages = messages[:-1] for msg in prompt_messages: if isinstance(msg, Message) and msg.role == "user": # Decorator ensures msg is Message - content_text: str = msg.content if msg.content is not None else "" + content_text: str = to_text(msg.content) if "in Spanish" in content_text: target_language = "es" break diff --git a/eval_protocol/rewards/length.py b/eval_protocol/rewards/length.py index 75798430..cdbc0d42 100644 --- a/eval_protocol/rewards/length.py +++ b/eval_protocol/rewards/length.py @@ -11,6 +11,7 @@ from typing import Any, Callable, Dict, List, Optional, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text, to_text_any from ..typed_interface import reward_function @@ -81,7 +82,7 @@ def length_reward( response = messages[-1] if isinstance(response, Message): - if response.role != "assistant" or not response.content: + if response.role != "assistant" or not to_text(response.content): return EvaluateResult( score=0.0, reason="No assistant response found", @@ -93,7 +94,7 @@ def length_reward( ) }, ) - text = response.content + text = to_text(response.content) elif isinstance(response, dict): if response.get("role") != "assistant" or not response.get("content"): return EvaluateResult( @@ -107,7 +108,7 @@ def length_reward( ) }, ) - text = response.get("content", "") + text = to_text_any(response.get("content", "")) else: return EvaluateResult( score=0.0, @@ -322,6 +323,9 @@ def cosine_length_reward( }, ) + # Ensure `text` is plain string + text = to_text_any(text) + token_count = count_tokens(text, method=token_method) solution_is_correct = False diff --git a/eval_protocol/rewards/list_comparison_math_reward.py b/eval_protocol/rewards/list_comparison_math_reward.py index ceaef012..15c54326 100644 --- a/eval_protocol/rewards/list_comparison_math_reward.py +++ b/eval_protocol/rewards/list_comparison_math_reward.py @@ -7,6 +7,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text from ..typed_interface import reward_function @@ -127,7 +128,7 @@ def list_comparison_math_reward( }, ) - gen_content = messages[-1].content + gen_content = to_text(messages[-1].content) orig_content = ground_truth if not gen_content: diff --git a/eval_protocol/rewards/math.py b/eval_protocol/rewards/math.py index e0406314..192cb192 100644 --- a/eval_protocol/rewards/math.py +++ b/eval_protocol/rewards/math.py @@ -11,6 +11,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text from ..typed_interface import reward_function _ALGEBRAIC_VARS_SET: Set[str] = { @@ -587,7 +588,7 @@ def math_reward( ) }, ) - model_response_content = messages[-1].content + model_response_content = to_text(messages[-1].content) if ground_truth is None or ground_truth == "": return EvaluateResult( score=0.0, @@ -603,7 +604,8 @@ def math_reward( gen_answers_extracted_initial = extract_numbers(model_response_content) orig_answers_extracted = extract_numbers(ground_truth) - gen_answers_extracted = list(gen_answers_extracted_initial) + # Keep a precise type to satisfy downstream function signatures + gen_answers_extracted: List[Tuple[str, Union[float, str]]] = list(gen_answers_extracted_initial) metrics: Dict[str, MetricResult] = {} def format_extracted(items: List[Tuple[str, Union[float, str]]]) -> str: diff --git a/eval_protocol/rewards/multiple_choice_math_reward.py b/eval_protocol/rewards/multiple_choice_math_reward.py index 5768de80..119769b7 100644 --- a/eval_protocol/rewards/multiple_choice_math_reward.py +++ b/eval_protocol/rewards/multiple_choice_math_reward.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text from ..typed_interface import reward_function @@ -134,7 +135,7 @@ def multiple_choice_math_reward( if messages and len(messages) > 0: gen_response_message = messages[-1] if gen_response_message.role == "assistant": - gen_content = gen_response_message.content or "" + gen_content = to_text(gen_response_message.content) if not gen_content: metrics["error_generated_message"] = MetricResult( @@ -152,7 +153,7 @@ def multiple_choice_math_reward( if ground_truth and len(ground_truth) > 0: orig_response_message = ground_truth[0] if orig_response_message.role == "assistant": - orig_content = orig_response_message.content or "" + orig_content = to_text(orig_response_message.content) if not orig_content: metrics["error_original_message"] = MetricResult( diff --git a/eval_protocol/rewards/reasoning_steps.py b/eval_protocol/rewards/reasoning_steps.py index 98da3d85..df77a69d 100644 --- a/eval_protocol/rewards/reasoning_steps.py +++ b/eval_protocol/rewards/reasoning_steps.py @@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Pattern, Set, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text from ..typed_interface import reward_function @@ -48,7 +49,7 @@ def reasoning_steps_reward( response = messages[-1] - if response.role != "assistant" or not response.content: + if response.role != "assistant" or not to_text(response.content): return EvaluateResult( score=0.0, reason="No assistant response found or response has no content", @@ -60,7 +61,7 @@ def reasoning_steps_reward( ) }, ) - text: str = response.content + text: str = to_text(response.content) # Default patterns for detecting reasoning steps default_patterns = [ @@ -187,7 +188,7 @@ def sequence_reward( response = messages[-1] - if response.role != "assistant" or not response.content: + if response.role != "assistant" or not to_text(response.content): return EvaluateResult( score=0.0, reason="No assistant response found or response has no content", @@ -199,7 +200,7 @@ def sequence_reward( ) }, ) - text: str = response.content + text: str = to_text(response.content) if not sequence_terms: sequence_terms = [ diff --git a/eval_protocol/rewards/tag_count.py b/eval_protocol/rewards/tag_count.py index 83acef6f..f2cb06d9 100644 --- a/eval_protocol/rewards/tag_count.py +++ b/eval_protocol/rewards/tag_count.py @@ -9,6 +9,7 @@ from typing import Any, Dict, List, Set, Union from ..models import EvaluateResult, Message, MetricResult +from ._content_utils import to_text from ..typed_interface import reward_function @@ -46,7 +47,7 @@ def tag_count_reward( response = messages[-1] - if response.role != "assistant" or not response.content: + if response.role != "assistant" or not to_text(response.content): return EvaluateResult( score=0.0, reason="No assistant response found or response has no content", @@ -58,7 +59,7 @@ def tag_count_reward( ) }, ) - text: str = response.content + text: str = to_text(response.content) tag_metrics = {} found_tags: Set[str] = set()