Skip to content

Commit 9044855

Browse files
Benny ChenBenny Chen
authored and committed
fix more errors
1 parent 6459b31 commit 9044855

File tree

4 files changed

+54
-35
lines changed

4 files changed

+54
-35
lines changed

eval_protocol/mcp/execution/manager.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -287,9 +287,21 @@ def extract_text_content(msg_dict):
287287

288288
# Accumulate LLM usage stats from this turn, if there are any
289289
if usage_stats:
290-
trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
291-
trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
292-
trajectory.usage["total_tokens"] += usage_stats.total_tokens
290+
try:
291+
trajectory.usage["prompt_tokens"] += getattr(usage_stats, "prompt_tokens", 0)
292+
trajectory.usage["completion_tokens"] += getattr(usage_stats, "completion_tokens", 0)
293+
trajectory.usage["total_tokens"] += getattr(usage_stats, "total_tokens", 0)
294+
except Exception:
295+
# Fallback if usage_stats is a dict
296+
trajectory.usage["prompt_tokens"] += int(
297+
getattr(usage_stats, "get", lambda _k, _d=0: 0)("prompt_tokens", 0)
298+
)
299+
trajectory.usage["completion_tokens"] += int(
300+
getattr(usage_stats, "get", lambda _k, _d=0: 0)("completion_tokens", 0)
301+
)
302+
trajectory.usage["total_tokens"] += int(
303+
getattr(usage_stats, "get", lambda _k, _d=0: 0)("total_tokens", 0)
304+
)
293305

294306
# If no tool call is generated, turn is finished
295307
if len(tool_calls) == 1:
@@ -300,7 +312,7 @@ def extract_text_content(msg_dict):
300312
# If there's no user simulator, then it marks the end of the episode as LLM think there is no tool call needed.
301313
elif tool_calls[0].tool_name in ["_playback_terminate", "_no_tool_call"]:
302314
trajectory.terminated = True
303-
trajectory.termination_reason = TerminationReason.from_str(finish_reason)
315+
trajectory.termination_reason = TerminationReason.from_str(str(finish_reason))
304316
break
305317

306318
# Execute each tool call sequentially
@@ -404,11 +416,22 @@ def extract_text_content(msg_dict):
404416
)
405417
update_evaluation_row_messages()
406418
if usage_stats:
407-
trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
408-
trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
409-
trajectory.usage["total_tokens"] += usage_stats.total_tokens
419+
try:
420+
trajectory.usage["prompt_tokens"] += getattr(usage_stats, "prompt_tokens", 0)
421+
trajectory.usage["completion_tokens"] += getattr(usage_stats, "completion_tokens", 0)
422+
trajectory.usage["total_tokens"] += getattr(usage_stats, "total_tokens", 0)
423+
except Exception:
424+
trajectory.usage["prompt_tokens"] += int(
425+
getattr(usage_stats, "get", lambda _k, _d=0: 0)("prompt_tokens", 0)
426+
)
427+
trajectory.usage["completion_tokens"] += int(
428+
getattr(usage_stats, "get", lambda _k, _d=0: 0)("completion_tokens", 0)
429+
)
430+
trajectory.usage["total_tokens"] += int(
431+
getattr(usage_stats, "get", lambda _k, _d=0: 0)("total_tokens", 0)
432+
)
410433
trajectory.terminated = True
411-
trajectory.termination_reason = TerminationReason.from_str(finish_reason)
434+
trajectory.termination_reason = TerminationReason.from_str(str(finish_reason))
412435
trajectory.control_plane_summary.update(
413436
{
414437
"total_reward": trajectory.total_reward,

eval_protocol/mcp/mcp_multi_client.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from mcp.client.stdio import stdio_client
1010
from mcp.client.streamable_http import streamablehttp_client
1111
from mcp.types import CallToolResult
12-
from openai.types import FunctionDefinition
12+
from openai.types.shared_params.function_definition import FunctionDefinition
1313
from openai.types.chat import ChatCompletionToolParam
1414

1515
from eval_protocol.models import (
@@ -135,7 +135,7 @@ async def get_available_tools(self) -> List[ChatCompletionToolParam]:
135135
all_tools.append(
136136
ChatCompletionToolParam(
137137
function=FunctionDefinition(
138-
name=tool.name, # Prefix with server name
138+
name=tool.name,
139139
description=tool.description,
140140
parameters=tool.inputSchema,
141141
),
@@ -155,7 +155,8 @@ async def call_tool(self, tool_name: str, tool_args: Dict[str, Any]) -> CallTool
155155
result = await session.call_tool(tool_name, tool_args)
156156
return result
157157
except Exception as e:
158-
return f"Error calling tool {tool_name}: {e}"
158+
# Re-raise the exception so the return type remains CallToolResult
159+
raise RuntimeError(f"Error calling tool {tool_name}: {e}") from e
159160

160161
async def cleanup(self):
161162
"""Clean up resources"""

eval_protocol/mcp/mcpgym.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from concurrent.futures import ThreadPoolExecutor
2525
from datetime import date, datetime
2626
from enum import Enum
27-
from typing import Any, Callable, Dict, Optional, Tuple
27+
from typing import Any, Callable, Dict, Optional, Tuple, Literal, cast
2828

2929
import uvicorn
3030
from mcp.server.fastmcp import Context, FastMCP
@@ -152,12 +152,14 @@ def _get_session_id(self, ctx: Context) -> str:
152152
print(f"🔍 _get_session_id: hasattr(client_params, 'clientInfo'): {hasattr(client_params, 'clientInfo')}")
153153

154154
if hasattr(client_params, "clientInfo"):
155-
client_info = client_params.clientInfo
155+
client_info = getattr(client_params, "clientInfo", None)
156156
print(f"🔍 _get_session_id: client_info: {client_info}")
157-
print(f"🔍 _get_session_id: hasattr(client_info, '_extra'): {hasattr(client_info, '_extra')}")
157+
print(
158+
f"🔍 _get_session_id: hasattr(client_info, '_extra'): {hasattr(client_info, '_extra') if client_info is not None else False}"
159+
)
158160

159161
if client_info and hasattr(client_info, "_extra"):
160-
extra_data = client_info._extra
162+
extra_data = getattr(client_info, "_extra", None)
161163
print(f"🔍 _get_session_id: extra_data: {extra_data}")
162164
print(f"🔍 _get_session_id: extra_data type: {type(extra_data)}")
163165

@@ -547,7 +549,7 @@ def format_observation(self, obs: Any, env: Any) -> Dict[str, Any]:
547549
else:
548550
return {"observation": serialized_obs}
549551

550-
def run(self, transport: str = "streamable-http", **kwargs):
552+
def run(self, transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http", **kwargs):
551553
"""Run the unified MCP-Gym server with high concurrency settings."""
552554
if transport == "streamable-http":
553555
# Run with custom high-concurrency uvicorn config
@@ -558,7 +560,7 @@ async def run_with_high_concurrency():
558560
if not kwargs.get("redirect_slashes", True) and hasattr(starlette_app, "router"):
559561
starlette_app.router.redirect_slashes = False
560562

561-
starlette_app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")
563+
starlette_app.add_middleware(cast(Any, ProxyHeadersMiddleware), trusted_hosts="*")
562564

563565
config = uvicorn.Config(
564566
starlette_app,
@@ -606,7 +608,7 @@ def _to_json_serializable(self, obj: Any) -> Any:
606608
return obj.model_dump()
607609

608610
# Handle dataclasses
609-
elif dataclasses.is_dataclass(obj):
611+
elif dataclasses.is_dataclass(obj) and not isinstance(obj, type):
610612
return dataclasses.asdict(obj)
611613

612614
# Handle dictionaries

eval_protocol/pytest/default_agent_rollout_processor.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,13 @@ async def setup(self):
3838
async def _get_tools(self) -> Optional[List[dict[str, Any]]]:
3939
if self.evaluation_row.tools is None:
4040
if self.mcp_client:
41-
raw_tools = await self.mcp_client.get_available_tools()
41+
raw_tools = await self.mcp_client.connect_to_servers() or None # ensure servers are connected
42+
raw_tools = await self.mcp_client.get_available_tools() if self.mcp_client else None
4243
tools_dicts: List[dict[str, Any]] = []
4344
for t in raw_tools or []:
4445
if isinstance(t, dict):
45-
# Already a dict-like structure
4646
tools_dicts.append(t)
4747
continue
48-
# Fallback: extract attributes from OpenAI types
4948
tool_type = getattr(t, "type", "function")
5049
func = getattr(t, "function", None)
5150
name = getattr(func, "name", None)
@@ -104,12 +103,10 @@ async def call_agent(self) -> Optional[Union[str, List[ChatCompletionContentPart
104103
return message.content
105104

106105
async def _call_model(self, messages: list[Message], tools: Optional[List[dict[str, Any]]]) -> Message:
107-
# Convert Message models to plain dicts for LLM call
108106
messages_payload: List[Dict[str, Any]] = [
109-
message.model_dump() if hasattr(message, "model_dump") else message # type: ignore[misc]
107+
(message.model_dump() if hasattr(message, "model_dump") else message) # type: ignore[misc]
110108
for message in messages
111109
]
112-
# Normalize tool definitions into OpenAI-compatible dicts
113110
payload_tools: List[Dict[str, Any]] = []
114111
for tool in tools or []:
115112
if isinstance(tool, dict):
@@ -119,7 +116,6 @@ async def _call_model(self, messages: list[Message], tools: Optional[List[dict[s
119116
elif isinstance(fn, dict):
120117
fn_payload = fn
121118
else:
122-
# Best effort fallback
123119
name = getattr(fn, "name", None)
124120
params = getattr(fn, "parameters", None)
125121
if hasattr(params, "model_dump"):
@@ -131,7 +127,6 @@ async def _call_model(self, messages: list[Message], tools: Optional[List[dict[s
131127
fn_payload = {"name": name, "parameters": params_payload}
132128
payload_tools.append({"type": tool.get("type", "function"), "function": fn_payload})
133129
else:
134-
# Attribute-based fallback
135130
tool_type = getattr(tool, "type", "function")
136131
func = getattr(tool, "function", None)
137132
name = getattr(func, "name", None)
@@ -145,14 +140,17 @@ async def _call_model(self, messages: list[Message], tools: Optional[List[dict[s
145140
payload_tools.append({"type": tool_type, "function": {"name": name, "parameters": params_payload}})
146141

147142
response = await self._policy._make_llm_call(messages=messages_payload, tools=payload_tools)
148-
# Coerce content to a string to align with our Message model type expectations
149143
raw_content = response["choices"][0]["message"].get("content")
150144
if isinstance(raw_content, list):
151-
content_for_model = "".join([getattr(p, "text", str(p)) for p in raw_content])
145+
146+
def _part_to_text(p: Any) -> str:
147+
return getattr(p, "text", str(p))
148+
149+
content_for_model: Union[str, List[Any]] = "".join(_part_to_text(p) for p in raw_content)
152150
else:
153151
content_for_model = raw_content
154152
return Message(
155-
role=response["choices"][0]["message"]["role"],
153+
role=response["choices"][0]["message"].get("role", "assistant"),
156154
content=content_for_model,
157155
tool_calls=response["choices"][0]["message"].get("tool_calls"),
158156
)
@@ -184,14 +182,9 @@ def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[Tex
184182
def _format_tool_message_content(
185183
self, content: List[TextContent]
186184
) -> Union[str, List[ChatCompletionContentPartTextParam]]:
187-
"""Format tool result content for inclusion in a tool message.
188-
189-
- If a single text item, return plain string per OpenAI semantics.
190-
- If multiple items, return a list of text parts.
191-
"""
192185
if len(content) == 1 and isinstance(content[0], TextContent):
193186
return content[0].text
194-
return [ChatCompletionContentPartTextParam(text=c.text, type="text") for c in content]
187+
return [ChatCompletionContentPartTextParam(text=c.text, type="text") for c in content if hasattr(c, "text")]
195188

196189

197190
class AgentRolloutProcessor(RolloutProcessor):

0 commit comments

Comments
 (0)