Make langchain_core an optional import; relax Message and ground_truth validation; short-circuit accuracy reward on empty ground truth

Benny Chen · Benny Chen · commit 1ae92c17db03 · 2025-08-31T07:17:40.000+08:00
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
@@ -243,8 +243,19 @@ class Message(BaseModel):
 
     @classmethod
     def model_validate(cls, obj, *args, **kwargs):
-        if isinstance(obj, dict) and "role" not in obj:
-            raise ValueError("Role is required")
+        if isinstance(obj, dict):
+            if "role" not in obj:
+                raise ValueError("Role is required")
+            # Be lenient: if tool_calls entries are missing required 'id', synthesize one
+            tool_calls_obj = obj.get("tool_calls")
+            if isinstance(tool_calls_obj, list):
+                fixed_tool_calls = []
+                for tc in tool_calls_obj:
+                    if isinstance(tc, dict):
+                        if not tc.get("id"):
+                            tc = {**tc, "id": generate_id()}
+                    fixed_tool_calls.append(tc)
+                obj = {**obj, "tool_calls": fixed_tool_calls}
         return super().model_validate(obj, *args, **kwargs)
 
 
diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
@@ -8,7 +8,7 @@
 from .rollout_processor import RolloutProcessor
 from .types import RolloutProcessorConfig
 
-# Conditional import for optional dependency
+# Conditional import for optional dependencies
 try:
     from .default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
 
diff --git a/eval_protocol/pytest/default_langchain_rollout_processor.py b/eval_protocol/pytest/default_langchain_rollout_processor.py
@@ -1,7 +1,13 @@
 import asyncio
 from typing import List
 
-from langchain_core.messages import BaseMessage
+try:
+    from langchain_core.messages import BaseMessage
+except Exception:  # pragma: no cover - optional dependency path
+    # Minimal fallback base type to satisfy typing when langchain is not present
+    class BaseMessage:  # type: ignore
+        pass
+
 
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest.rollout_processor import RolloutProcessor
@@ -25,7 +31,13 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig):
 
         async def _process_row(row: EvaluationRow) -> EvaluationRow:
             # Build LC messages from EP row
-            from langchain_core.messages import HumanMessage
+            try:
+                from langchain_core.messages import HumanMessage
+            except Exception:
+                # Fallback minimal message if langchain_core is unavailable
+                class HumanMessage:  # type: ignore
+                    def __init__(self, content: str):
+                        self.content = content
 
             lm_messages: List[BaseMessage] = []
             if row.messages:
diff --git a/eval_protocol/rewards/accuracy.py b/eval_protocol/rewards/accuracy.py
@@ -436,6 +436,20 @@ def accuracy_reward(
             metrics={"accuracy": MetricResult(score=0.0, is_score_valid=False, reason="Invalid GT message type.")},
         )
 
+    # If ground truth content is empty after coercion, short-circuit with a clear reason
+    if ground_truth_comparison_text.strip() == "":
+        return EvaluateResult(
+            score=0.0,
+            reason="Ground truth has no content.",
+            metrics={
+                "accuracy": MetricResult(
+                    score=0.0,
+                    is_score_valid=False,
+                    reason="Ground truth has no content.",
+                )
+            },
+        )
+
     extracted_answer = extract_fn(model_response_text) if extract_fn else extract_math_expression(model_response_text)
     if (
         not extracted_answer
diff --git a/eval_protocol/typed_interface.py b/eval_protocol/typed_interface.py
@@ -119,7 +119,7 @@ def _coerce_to_list_message(data_list: Any, arg_name_for_error: str) -> List[Mes
                     if isinstance(item_data, Message):
                         typed_list.append(item_data)
                     elif isinstance(item_data, dict):
-                        typed_list.append(Message(**item_data))
+                        typed_list.append(Message.model_validate(item_data))
                     else:
                         raise TypeError(f"Unexpected type for item {i} in '{arg_name_for_error}': {type(item_data)}")
                 return typed_list
@@ -134,8 +134,9 @@ def _coerce_to_list_message(data_list: Any, arg_name_for_error: str) -> List[Mes
                 ):
                     try:
                         final_func_args["messages"] = _coerce_to_list_message(final_func_args["messages"], "messages")
-                    except Exception as err:
-                        raise ValueError(f"Input 'messages' failed Pydantic validation: {err}") from None
+                    except Exception:
+                        # Be lenient: leave messages as-is if coercion fails (backward compatibility)
+                        pass
 
             elif mode == "batch" and "rollouts_messages" in params and "rollouts_messages" in final_func_args:
                 param_annotation = params["rollouts_messages"].annotation
@@ -157,14 +158,26 @@ def _coerce_to_list_message(data_list: Any, arg_name_for_error: str) -> List[Mes
                 gt_ann = params["ground_truth"].annotation
                 if get_origin(gt_ann) in (list, List) and get_args(gt_ann) and get_args(gt_ann)[0] == Message:
                     if final_func_args["ground_truth"] is not None:
-                        try:
-                            final_func_args["ground_truth"] = _coerce_to_list_message(
-                                final_func_args["ground_truth"], "ground_truth"
-                            )
-                        except Exception as err:
-                            raise ValueError(
-                                f"Input 'ground_truth' failed Pydantic validation for List[Message]: {err}"
-                            ) from None
+                        # Accept flexible ground_truth inputs: list, dict, or str
+                        gt_val = final_func_args["ground_truth"]
+                        if isinstance(gt_val, list):
+                            try:
+                                final_func_args["ground_truth"] = _coerce_to_list_message(gt_val, "ground_truth")
+                            except Exception:
+                                # Leave as-is if strict coercion fails
+                                pass
+                        elif isinstance(gt_val, dict):
+                            try:
+                                final_func_args["ground_truth"] = _coerce_to_list_message([gt_val], "ground_truth")
+                            except Exception:
+                                pass
+                        elif isinstance(gt_val, str):
+                            try:
+                                final_func_args["ground_truth"] = _coerce_to_list_message(
+                                    [{"role": "system", "content": gt_val}], "ground_truth"
+                                )
+                            except Exception:
+                                pass
 
             # Inject resource clients into kwargs (resources are already setup)
             if resource_managers: