Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
import logging
import importlib
from datetime import datetime
from enum import Enum
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
Expand All @@ -19,6 +21,9 @@
from eval_protocol.types import TerminationReason


# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class ErrorInfo(BaseModel):
"""
AIP-193 ErrorInfo model for structured error details.
Expand Down Expand Up @@ -163,6 +168,167 @@ def rollout_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]]
details.append(ErrorInfo.extra_info(extra_info).to_aip193_format())
return cls.error(error_message, details)

@classmethod
def rollout_error_from_exception(
    cls, exception: Exception, extra_info: Optional[Dict[str, Any]] = None
) -> "Status":
    """
    Build a failed-rollout status from a caught exception.

    The first details entry records the exception's fully qualified type
    name (``module.ClassName``) and its string form. A second entry carrying
    ``extra_info`` is appended only when ``extra_info`` is truthy.

    Args:
        exception: The exception that made the rollout fail.
        extra_info: Optional additional context to store alongside it.

    Returns:
        A status with code ``INTERNAL`` whose message is ``str(exception)``.
    """
    exc_cls = type(exception)
    text = str(exception)

    details: List[Dict[str, Any]] = [
        {
            "exception_type": f"{exc_cls.__module__}.{exc_cls.__name__}",
            "exception_message": text,
        }
    ]
    if extra_info:
        details.append({"extra_info": extra_info})

    return cls(code=cls.Code.INTERNAL, message=text, details=details)

@classmethod
def raise_from_status_details(cls, status_details: List[Dict[str, Any]]) -> bool:
    """
    Try to re-raise the original exception described in simple status details.

    Every entry that is a mapping carrying both ``exception_type`` and
    ``exception_message`` is resolved to a class with
    ``_import_exception_class`` and instantiated with
    ``_create_exception_instance``. The first entry that can be rebuilt is
    raised; entries that cannot be rebuilt are logged and skipped.

    Args:
        status_details: Detail entries of a status. ``None``, an empty list
            and entries that are not mappings are tolerated and ignored.

    Returns:
        False when no entry could be turned back into an exception.

    Raises:
        BaseException: The reconstructed exception, when one is found.
    """
    from collections.abc import Mapping

    for detail in status_details or ():
        # Malformed payloads must not crash the caller: on a str, `in` is a
        # substring test and the subscripts below would raise TypeError.
        if not isinstance(detail, Mapping):
            continue
        if "exception_type" not in detail or "exception_message" not in detail:
            continue

        exception_type = detail["exception_type"]
        exception_message = detail["exception_message"]
        logger.info("Found exception info: %s", exception_type)

        # Dynamically import the exception class named in the payload
        exception_class = cls._import_exception_class(exception_type)
        if not exception_class:
            logger.info("Could not import exception type: %s", exception_type)
            continue
        logger.info("Found exception class: %s", exception_class)

        # Try different constructor patterns
        exception_to_raise = cls._create_exception_instance(exception_class, exception_message)
        if not exception_to_raise:
            logger.info("Could not create instance of %s", exception_type)
            continue

        logger.info("Re-raising %s from status details", exception_type)
        raise exception_to_raise

    return False

@classmethod
def _create_exception_instance(cls, exception_class: type, message: str) -> Optional[Exception]:
    """
    Try to create an exception instance using different constructor patterns.

    The patterns are attempted in order. A pattern is considered failed when
    it raises ``TypeError``/``ValueError`` or when it yields ``None`` (which
    is how ``_create_openai_exception`` reports failure); in both cases the
    next pattern is tried.

    Args:
        exception_class: The exception class to instantiate
        message: The error message

    Returns:
        Exception instance if successful, None otherwise
    """
    # Common constructor patterns to try, in order
    patterns = [
        # Pattern 1: Just message
        lambda: exception_class(message),
        # Pattern 2: Message as named parameter
        lambda: exception_class(message=message),
        # Patterns 3-4: Message + common litellm parameters
        # NOTE: we are losing some diagnostic information here by not passing the model and llm_provider. We could try to capture full exception state in rollout_error_from_exception.
        lambda: exception_class(message, model="unknown", llm_provider="unknown"),
        lambda: exception_class(message=message, model="unknown", llm_provider="unknown"),
        # Pattern 5: OpenAI exceptions - create mock response object
        lambda: cls._create_openai_exception(exception_class, message),
        # Pattern 6: No arguments (fallback; the message is not carried over)
        lambda: exception_class(),
    ]

    for number, build in enumerate(patterns, start=1):
        try:
            instance = build()
        except (TypeError, ValueError) as e:
            logger.debug(f"Pattern {number} failed for {exception_class.__name__}: {e}")
            continue

        if instance is None:
            # A None result is a failure signal, not an instance; keep
            # going so the remaining fallbacks still get a chance.
            logger.debug(f"Pattern {number} produced no instance for {exception_class.__name__}")
            continue

        logger.debug(f"Successfully created {exception_class.__name__} using pattern {number}")
        return instance

    logger.debug(f"All constructor patterns failed for {exception_class.__name__}")
    return None

@classmethod
def _create_openai_exception(cls, exception_class: type, message: str) -> Optional[Exception]:
    """
    Create an OpenAI-style exception around a stand-in response object.

    The constructor is called as ``exception_class(message, response=...,
    body=None)``. The stand-in response exposes only ``status_code``,
    ``headers`` and ``request`` (with ``method`` and ``url``); no real HTTP
    objects are built, so no HTTP library is needed here.

    Args:
        exception_class: The exception class to instantiate.
        message: The error message.

    Returns:
        The exception instance, or None when the constructor rejects this
        call shape or fails for any other reason.
    """
    from types import SimpleNamespace

    # Use the status the class itself declares (e.g. 400 on a bad-request
    # class) so the rebuilt exception does not report a misleading code.
    # 404 stays the default when the class declares no integer status.
    # NOTE(review): assumes OpenAI status-error subclasses expose a
    # class-level integer `status_code` - confirm for the pinned version.
    declared_status = getattr(exception_class, "status_code", None)
    status_code = declared_status if isinstance(declared_status, int) else 404

    try:
        stand_in_request = SimpleNamespace(
            method="POST",
            url="https://api.openai.com/v1/chat/completions",
        )
        stand_in_response = SimpleNamespace(
            status_code=status_code,
            headers={"x-request-id": "mock-request-id"},
            request=stand_in_request,
        )
        return exception_class(message, response=stand_in_response, body=None)
    except Exception as e:
        # Best-effort helper: any constructor failure means "pattern does not fit".
        logging.getLogger(__name__).debug(f"Failed to create OpenAI exception with mock response: {e}")
        return None

@classmethod
def _import_exception_class(cls, exception_type: str) -> Optional[type]:
    """
    Dynamically import an exception class from a string.

    Args:
        exception_type: Exception type string like "litellm.exceptions.NotFoundError",
            "openai.BadRequestError", "requests.exceptions.ConnectionError", etc.
            Must be fully qualified; nothing is prefixed automatically.

    Returns:
        The exception class if found, None otherwise. None is also returned
        when the name is not a string, the module cannot be imported, or the
        resolved attribute is not a ``BaseException`` subclass.
    """
    log = logging.getLogger(__name__)

    # Require fully qualified names (no automatic prefixing)
    if not isinstance(exception_type, str) or "." not in exception_type:
        log.debug(f"Exception type must be fully qualified: {exception_type}")
        return None

    # Parse module and class name
    module_name, class_name = exception_type.rsplit(".", 1)

    # NOTE(review): the module name comes from a status payload; importing it
    # runs that module's top-level code. Consider an allow-list of packages
    # if the payload source is not fully trusted.
    try:
        module = importlib.import_module(module_name)
    except (ImportError, AttributeError, ValueError, TypeError) as e:
        # TypeError covers relative-looking names such as ".pkg.Error",
        # which import_module rejects when no package is given.
        log.debug(f"Could not import exception class {exception_type}: {e}")
        return None

    candidate = getattr(module, class_name, None)

    # issubclass() raises TypeError for non-classes (modules, functions, ...),
    # so make sure the attribute is a class before checking its ancestry.
    if isinstance(candidate, type) and issubclass(candidate, BaseException):
        return candidate

    return None

@classmethod
def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
"""Create a status indicating the rollout failed with an error."""
Expand Down
7 changes: 6 additions & 1 deletion eval_protocol/pytest/exception_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import litellm
import requests
import httpx
import openai


# Default exceptions that should be retried with backoff
Expand All @@ -29,13 +30,17 @@
httpx.TimeoutException,
httpx.NetworkError,
httpx.RemoteProtocolError,
# LiteLLM library exceptions
litellm.exceptions.RateLimitError,
litellm.exceptions.InternalServerError,
litellm.exceptions.Timeout,
litellm.exceptions.NotFoundError,
litellm.exceptions.BadRequestError, # remove this once we have a long term solution
litellm.exceptions.ServiceUnavailableError,
litellm.exceptions.APIError
litellm.exceptions.APIError,
# OpenAI library exceptions
openai.NotFoundError,
openai.BadRequestError, # remove this once we have a long term solution
Comment thread
xzrderek marked this conversation as resolved.
Outdated
}


Expand Down
2 changes: 2 additions & 0 deletions eval_protocol/pytest/remote_rollout_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ def _get_status() -> Dict[str, Any]:
f"Found Fireworks log for rollout {row.execution_metadata.rollout_id} with status code {status_code}"
)

Status.raise_from_status_details(status_details)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Exception Handling Causes Inconsistent Status Updates

The call to Status.raise_from_status_details(status_details) at line 169 can raise an exception if exception information is found in status_details. If an exception is raised, lines 171-175 (which set row.rollout_status) are never executed, leaving the row in an inconsistent state with extracted status information but no updated Status object. The status update should occur before attempting to raise the exception, or the exception should not be raised until after the status is properly set.

Fix in Cursor Fix in Web

row.rollout_status = Status(
code=Status.Code(status_code),
message=status_message,
Expand Down
Loading
Loading