Skip to content

Commit ed1dc59

Browse files
cursoragent authored and benjibc committed
Refactor type handling and improve error resilience across modules
Co-authored-by: bchen <bchen@fireworks.ai>
1 parent 3ec9a06 commit ed1dc59

File tree

9 files changed

+84
-92
lines changed

9 files changed

+84
-92
lines changed

eval_protocol/agent/orchestrator.py

Lines changed: 28 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -11,39 +11,10 @@
1111
import logging
1212
import os
1313
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, cast
14+
import importlib.util as _importlib_util
1415

15-
# Attempt to import OpenAI client
16-
try:
17-
from openai import AsyncOpenAI, OpenAI
18-
from openai.types.chat import ChatCompletionMessage, ChatCompletionToolParam
19-
from openai.types.chat.chat_completion_message_tool_call import (
20-
ChatCompletionMessageToolCall,
21-
)
22-
23-
OPENAI_AVAILABLE = True
24-
except ImportError:
25-
OPENAI_AVAILABLE = False
26-
# Define dummy types if openai is not installed, to avoid runtime errors on load
27-
from typing import Any, Dict, List, Optional, Union
28-
29-
# Use simple class definitions for runtime and type checking
30-
class OpenAI:
31-
def __init__(self, **kwargs: Any) -> None:
32-
pass
33-
34-
class AsyncOpenAI:
35-
def __init__(self, **kwargs: Any) -> None:
36-
pass
37-
38-
class ChatCompletionMessage:
39-
content: str = ""
40-
role: str = "assistant"
41-
42-
class ChatCompletionToolParam:
43-
pass
44-
45-
class ChatCompletionMessageToolCall:
46-
pass
16+
# Determine OpenAI availability without importing symbols for typing
17+
OPENAI_AVAILABLE = _importlib_util.find_spec("openai") is not None
4718

4819

4920
# Max steps for the inner loop within a single user turn
@@ -71,17 +42,19 @@ def __init__(self, task_definition: TaskDefinitionModel):
7142
self.logger = logging.getLogger(f"Orchestrator.{self.task_definition.name}")
7243
self.logger.setLevel(logging.DEBUG) # Ensure debug logs are processed
7344
self.logger.info(f"Orchestrator initialized for task: {self.task_definition.name}")
74-
self._openai_client: Optional[AsyncOpenAI] = None
45+
# Use Any here to avoid pyright stubs mismatches across openai versions
46+
self._openai_client: Optional[Any] = None
7547

7648
def _initialize_openai_client(self):
7749
"""Initializes the AsyncOpenAI client if available and not already initialized."""
7850
if not OPENAI_AVAILABLE:
7951
self.logger.warning("OpenAI library not available. Cannot use OpenAI models.")
8052
return
8153
if self._openai_client is None:
82-
# Consider adding error handling for missing API key
8354
try:
84-
self._openai_client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
55+
from openai import AsyncOpenAI # type: ignore[import-not-found]
56+
57+
self._openai_client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY")) # type: ignore[call-arg]
8558
self.logger.info("AsyncOpenAI client initialized.")
8659
except Exception as e:
8760
self.logger.error(f"Failed to initialize AsyncOpenAI client: {e}")
@@ -94,7 +67,9 @@ def _initialize_fireworks_client(self):
9467
return
9568
if self._openai_client is None:
9669
try:
97-
self._openai_client = AsyncOpenAI(
70+
from openai import AsyncOpenAI # type: ignore[import-not-found]
71+
72+
self._openai_client = AsyncOpenAI( # type: ignore[call-arg]
9873
api_key=os.environ.get("FIREWORKS_API_KEY"),
9974
base_url="https://api.fireworks.ai/inference/v1",
10075
)
@@ -469,18 +444,20 @@ async def execute_task_poc(self, sample_data: Optional[Dict[str, Any]] = None) -
469444
# Initialize the episode resource with sample data if provided
470445
if sample_data:
471446
self.logger.info(f"Initializing episode resource with sample data: {sample_data}")
472-
if hasattr(episode_resource, "initialize"):
473-
await episode_resource.initialize(**sample_data)
447+
initializer = getattr(episode_resource, "initialize", None)
448+
if callable(initializer):
449+
await initializer(**sample_data) # type: ignore[misc]
474450
else:
475451
self.logger.warning(
476452
f"Episode resource {type(episode_resource).__name__} does not have initialize method"
477453
)
478454

479455
# Get initial state for injection into first prompt (for HTTP rollout)
480456
initial_state_description = None
481-
if hasattr(episode_resource, "get_initial_state_description"):
457+
get_init_state = getattr(episode_resource, "get_initial_state_description", None)
458+
if callable(get_init_state):
482459
try:
483-
initial_state_description = await episode_resource.get_initial_state_description()
460+
initial_state_description = await get_init_state() # type: ignore[misc]
484461
self.logger.info("Retrieved initial state description for first prompt")
485462
except Exception as e:
486463
self.logger.warning(f"Failed to get initial state description: {e}")
@@ -577,21 +554,21 @@ async def execute_task_poc(self, sample_data: Optional[Dict[str, Any]] = None) -
577554
) # Get adapters for execution
578555

579556
# Format tools for OpenAI API (should be done once per user turn, or if tools change)
580-
openai_tools: List[ChatCompletionToolParam] = []
557+
openai_tools: List[Dict[str, Any]] = []
581558
if OPENAI_AVAILABLE:
582559
# First add tools from the resource
583560
for spec in resource_tool_specs:
584561
# Ensure spec has the structure with name and parameters
585562
if "name" in spec and "parameters" in spec:
586563
openai_tools.append(
587-
ChatCompletionToolParam(
588-
type="function",
589-
function={
564+
{
565+
"type": "function",
566+
"function": {
590567
"name": spec["name"],
591568
"description": spec.get("description", ""),
592-
"parameters": spec["parameters"], # Assuming this matches OpenAI schema
569+
"parameters": spec["parameters"], # Assuming OpenAI-compatible schema
593570
},
594-
)
571+
}
595572
)
596573
else:
597574
self.logger.warning(f"Skipping tool spec due to missing name/parameters: {spec}")
@@ -605,14 +582,14 @@ async def execute_task_poc(self, sample_data: Optional[Dict[str, Any]] = None) -
605582
registry_tools = self.tools_module.R.get_openai_tools()
606583
for tool_spec in registry_tools:
607584
openai_tools.append(
608-
ChatCompletionToolParam(
609-
type="function",
610-
function={
585+
{
586+
"type": "function",
587+
"function": {
611588
"name": tool_spec["name"],
612589
"description": tool_spec.get("description", ""),
613590
"parameters": tool_spec["parameters"],
614591
},
615-
)
592+
}
616593
)
617594
else:
618595
self.logger.warning("OpenAI not available, cannot format tools for API.")
@@ -642,6 +619,7 @@ async def execute_task_poc(self, sample_data: Optional[Dict[str, Any]] = None) -
642619
if not self._openai_client:
643620
raise Exception("OpenAI client not initialized")
644621

622+
# type: ignore[reportUnknownMemberType]
645623
response = await self._openai_client.chat.completions.create(
646624
model=agent_model_name,
647625
messages=conversation_messages, # type: ignore

eval_protocol/agent/resources/docker_resource.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,17 @@
1313
try:
1414
import docker
1515

16-
if TYPE_CHECKING:
17-
from docker.errors import APIError, DockerException, NotFound
18-
from docker.models.containers import Container
19-
else:
20-
from docker.errors import APIError, DockerException, NotFound
21-
from docker.models.containers import Container
16+
# Import for runtime; annotate as Any to avoid mismatched type aliasing across modules
17+
from docker.errors import APIError as _APIError, DockerException as _DockerException, NotFound as _NotFound
18+
from docker.models.containers import Container as _Container
2219

2320
DOCKER_SDK_AVAILABLE = True
2421
# Ensure these are available for type checking even if the runtime import fails
2522
# The `else` block for DOCKER_SDK_AVAILABLE = False will define runtime dummies.
26-
DockerException = DockerException
27-
NotFound = NotFound
28-
APIError = APIError
29-
Container = Container
23+
DockerException = _DockerException # type: ignore[assignment]
24+
NotFound = _NotFound # type: ignore[assignment]
25+
APIError = _APIError # type: ignore[assignment]
26+
Container = _Container # type: ignore[assignment]
3027
try:
3128
_daemon_check_client = docker.from_env()
3229
_daemon_check_client.ping()
@@ -99,7 +96,7 @@ def __init__(self) -> None:
9996
raise RuntimeError("Docker daemon not running or not accessible")
10097
self._client = docker.from_env()
10198
self._config: Dict[str, Any] = {}
102-
self._container: Optional[Container] = None
99+
self._container: Optional[Any] = None
103100
self._image_id_for_fork_or_checkpoint: Optional[str] = (
104101
None # Stores the ID of the image used for the current container
105102
)
@@ -108,14 +105,14 @@ def __init__(self) -> None:
108105
def _generate_name(self, prefix: str) -> str:
109106
return f"rk_{prefix}_{uuid.uuid4().hex}"
110107

111-
def _cleanup_container(self, container: Optional[Container]) -> None:
108+
def _cleanup_container(self, container: Optional[Any]) -> None:
112109
if container:
113110
try:
114111
container.remove(force=True, v=True) # v=True to remove volumes
115112
except NotFound:
116113
pass # Already removed
117114
except APIError as e:
118-
print(f"DockerResource: Error removing container {(container.id or '')[:12]}: {e}")
115+
print(f"DockerResource: Error removing container {(getattr(container, 'id', '') or '')[:12]}: {e}")
119116

120117
def _cleanup_image(self, image_id: Optional[str]) -> None:
121118
if image_id:

eval_protocol/agent/task_manager.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -492,10 +492,10 @@ async def execute_single_rollout(rollout_index: int):
492492
# Convert EvaluateResult to dict if needed
493493
if hasattr(result, "model_dump"):
494494
# Pydantic model - convert to dict
495-
result = result.model_dump()
495+
result = result.model_dump() # type: ignore[call-arg]
496496
elif hasattr(result, "dict"):
497497
# Older pydantic models
498-
result = result.dict()
498+
result = result.dict() # type: ignore[call-arg]
499499
# If it's already a dict, leave it as is
500500

501501
# Add reward function inputs to the result for JSONL trajectory storage
@@ -529,7 +529,14 @@ async def execute_single_rollout(rollout_index: int):
529529

530530
# Execute all rollouts concurrently
531531
rollout_tasks = [execute_single_rollout(i) for i in range(num_rollouts)]
532-
rollout_results = await asyncio.gather(*rollout_tasks)
532+
rollout_results_raw = await asyncio.gather(*rollout_tasks)
533+
# Normalize to list of dicts for typing purposes where possible
534+
rollout_results: List[Dict[str, Any]] = []
535+
for item in rollout_results_raw:
536+
if isinstance(item, dict):
537+
rollout_results.append(item)
538+
else:
539+
rollout_results.append({"result": item})
533540

534541
# Log failed rollouts but return all results for comprehensive analysis
535542
successful_results = [r for r in rollout_results if not (isinstance(r, dict) and "error" in r)]
@@ -665,10 +672,10 @@ async def execute_single_rollout(sample_index: int, rollout_index: int, sample_d
665672
# Convert EvaluateResult to dict if needed
666673
if hasattr(result, "model_dump"):
667674
# Pydantic model - convert to dict
668-
result = result.model_dump()
675+
result = result.model_dump() # type: ignore[call-arg]
669676
elif hasattr(result, "dict"):
670677
# Older pydantic models
671-
result = result.dict()
678+
result = result.dict() # type: ignore[call-arg]
672679
# If it's already a dict, leave it as is
673680

674681
# Add reward function inputs to the result for JSONL trajectory storage

eval_protocol/mcp/mcp_multi_client.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from mcp.client.streamable_http import streamablehttp_client
1111
from mcp.types import CallToolResult
1212
from openai.types import FunctionDefinition
13-
from openai.types.chat import ChatCompletionToolParam
1413

1514
from eval_protocol.models import (
1615
MCPConfigurationServerStdio,
@@ -125,29 +124,29 @@ async def _connect_to_server(
125124
[tool.name for tool in tools],
126125
)
127126

128-
async def get_available_tools(self) -> List[ChatCompletionToolParam]:
127+
async def get_available_tools(self) -> List[Dict[str, Any]]:
129128
"""Get all available tools from all connected servers"""
130129
all_tools = []
131130
for server_name, session in self.sessions.items():
132131
try:
133132
response = await session.list_tools()
134133
for tool in response.tools:
135134
all_tools.append(
136-
ChatCompletionToolParam(
137-
function=FunctionDefinition(
138-
name=tool.name, # Prefix with server name
139-
description=tool.description,
140-
parameters=tool.inputSchema,
141-
),
142-
type="function",
143-
)
135+
{
136+
"type": "function",
137+
"function": {
138+
"name": tool.name,
139+
"description": tool.description,
140+
"parameters": tool.inputSchema,
141+
},
142+
}
144143
)
145144
except Exception as e:
146145
print(f"Error listing tools from server '{server_name}': {e}")
147146

148147
return all_tools
149148

150-
async def call_tool(self, tool_name: str, tool_args: Dict[str, Any]) -> CallToolResult:
149+
async def call_tool(self, tool_name: str, tool_args: Dict[str, Any]) -> Union[CallToolResult, str]:
151150
"""Call a specific tool by name with arguments"""
152151

153152
session = self.tools_to_sessions[tool_name]

eval_protocol/mcp/simple_process_manager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import time
1414
import uuid
1515
from contextlib import AsyncExitStack
16-
from typing import Dict, Tuple
16+
from typing import Dict, Tuple, Optional
1717

1818
from mcp.client.session import ClientSession
1919
from mcp.client.streamable_http import streamablehttp_client
@@ -26,7 +26,7 @@ class SimpleServerProcessManager:
2626
def __init__(
2727
self,
2828
script_path: str,
29-
python_executable: str = None,
29+
python_executable: Optional[str] = None,
3030
port_range: Tuple[int, int] = (10000, 11000),
3131
):
3232
"""

eval_protocol/mcp_agent/main.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,12 @@ async def main_async(config_path: str, host: str, port: int):
9898

9999
# 2. Instantiate StreamableHTTPSessionManager
100100
# Pass the internal _mcp_server (the MCPServer instance) from our FastMCP subclass
101+
if _mcp_server_instance_ref is None:
102+
logger.error("Failed to initialize RewardKitIntermediaryServer")
103+
return
104+
101105
session_manager = StreamableHTTPSessionManager(
102-
app=_mcp_server_instance_ref._mcp_server,
106+
app=_mcp_server_instance_ref._mcp_server, # type: ignore[attr-defined]
103107
event_store=None,
104108
json_response=True, # Changed to True
105109
)

eval_protocol/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ class EvaluationRow(BaseModel):
552552
)
553553

554554
execution_metadata: ExecutionMetadata = Field(
555-
default_factory=lambda: ExecutionMetadata(),
555+
default_factory=lambda: ExecutionMetadata(run_id=None),
556556
description="Metadata about the execution of the evaluation.",
557557
)
558558

eval_protocol/playback_policy.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,12 +243,16 @@ async def __call__(
243243

244244
if messages is None:
245245
# No more recorded actions - signal early termination
246-
return [
247-
MCPToolCall(
248-
"_playback_terminate",
249-
{"reason": "no_more_recorded_actions"},
250-
)
251-
]
246+
return (
247+
[
248+
MCPToolCall(
249+
"_playback_terminate",
250+
{"reason": "no_more_recorded_actions"},
251+
)
252+
],
253+
None,
254+
None,
255+
)
252256

253257
# Return the recorded tool call
254258
return self._extract_tool_call_from_messages(messages, env_index), None, None

eval_protocol/pytest/default_agent_rollout_processor.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from mcp.types import CallToolResult, TextContent
88
from openai import NOT_GIVEN, NotGiven
9-
from openai.types.chat import ChatCompletionContentPartTextParam, ChatCompletionMessage, ChatCompletionToolParam
9+
from openai.types.chat import ChatCompletionContentPartTextParam
1010
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
1111

1212
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
@@ -166,13 +166,16 @@ async def _execute_tool_call(
166166
"""
167167
assert self.mcp_client is not None, "MCP client is not initialized"
168168
tool_result = await self.mcp_client.call_tool(tool_name, tool_args_dict)
169-
content = self._get_content_from_tool_result(tool_result)
169+
# Accept string errors from client and normalize to text content
170+
content = self._get_content_from_tool_result(tool_result) # type: ignore[arg-type]
170171
return tool_call_id, content
171172

172-
def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[TextContent]:
173+
def _get_content_from_tool_result(self, tool_result: CallToolResult | str) -> List[TextContent]:
173174
if getattr(tool_result, "structuredContent", None):
174175
return [TextContent(text=json.dumps(tool_result.structuredContent), type="text")]
175176
normalized: List[TextContent] = []
177+
if isinstance(tool_result, str):
178+
return [TextContent(text=tool_result, type="text")]
176179
for content in getattr(tool_result, "content", []) or []:
177180
if isinstance(content, TextContent):
178181
normalized.append(content)

0 commit comments

Comments (0)