From c1f77016b573200b0d062667b372079f26d8c7a1 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 03:34:39 -0700
Subject: [PATCH 01/16] adding status

---
 eval_protocol/models.py                       | 133 ++++++++++++++++++
 .../pytest/remote_rollout_processor.py        |   2 +
 2 files changed, 135 insertions(+)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index d3fe0f61..98491dff 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -1,4 +1,6 @@
 import os
+import logging
+import importlib
 from datetime import datetime
 from enum import Enum
 from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
@@ -19,6 +21,9 @@
 from eval_protocol.types import TerminationReason
 
 
+logger = logging.getLogger(__name__)
+
+
 class ErrorInfo(BaseModel):
     """
     AIP-193 ErrorInfo model for structured error details.
@@ -163,6 +168,134 @@ def rollout_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]]
             details.append(ErrorInfo.extra_info(extra_info).to_aip193_format())
         return cls.error(error_message, details)
 
+    @classmethod
+    def rollout_error_from_exception(
+        cls, exception: Exception, extra_info: Optional[Dict[str, Any]] = None
+    ) -> "Status":
+        """
+        Create a status indicating the rollout failed with an exception.
+        Simple approach that stores exception info directly in details.
+        """
+        details = []
+
+        details.append(
+            {
+                "exception_type": f"{type(exception).__module__}.{type(exception).__name__}",
+                "exception_message": str(exception),
+            }
+        )
+
+        if extra_info:
+            details.append({"extra_info": extra_info})
+
+        return cls(code=cls.Code.INTERNAL, message=str(exception), details=details)
+
+    @classmethod
+    def raise_from_status_details(cls, status_details: List[Dict[str, Any]]) -> bool:
+        """
+        Try to raise original exception from simple status details using dynamic imports.
+        """
+
+        for detail in status_details:
+            # Look for simple exception info
+            if "exception_type" in detail and "exception_message" in detail:
+                exception_type = detail["exception_type"]
+                exception_message = detail["exception_message"]
+
+                logger.info(f"Found exception info: {exception_type}")
+
+                # Dynamically import and raise the exception
+                exception_class = cls._import_exception_class(exception_type)
+                if exception_class:
+                    logger.info(f"Re-raising {exception_type} from status details")
+                    # Try different constructor patterns
+                    exception_to_raise = cls._create_exception_instance(exception_class, exception_message)
+                    if exception_to_raise:
+                        raise exception_to_raise
+                    else:
+                        logger.debug(f"Could not create instance of {exception_type}")
+                        continue
+                else:
+                    logger.debug(f"Could not import exception type: {exception_type}")
+                    continue
+
+        return False
+
+    @classmethod
+    def _create_exception_instance(cls, exception_class: type, message: str) -> Optional[Exception]:
+        """
+        Try to create an exception instance using different constructor patterns.
+
+        Args:
+            exception_class: The exception class to instantiate
+            message: The error message
+
+        Returns:
+            Exception instance if successful, None otherwise
+        """
+        # Common constructor patterns to try
+        patterns = [
+            # Pattern 1: Just message
+            lambda: exception_class(message),
+            # Pattern 2: Message as named parameter
+            lambda: exception_class(message=message),
+            # Pattern 3: Message + common litellm parameters
+            # NOTE: we are losing some diagnostic information here by not passing the model and llm_provider. We could try to capture full exception state in rollout_error_from_exception.
+            lambda: exception_class(message, model="unknown", llm_provider="unknown"),
+            lambda: exception_class(message=message, model="unknown", llm_provider="unknown"),
+            # Pattern 4: No arguments (fallback)
+            lambda: exception_class(),
+        ]
+
+        for i, pattern in enumerate(patterns):
+            try:
+                instance = pattern()
+                logger.debug(f"Successfully created {exception_class.__name__} using pattern {i + 1}")
+                return instance
+            except (TypeError, ValueError) as e:
+                logger.debug(f"Pattern {i + 1} failed for {exception_class.__name__}: {e}")
+                continue
+
+        logger.debug(f"All constructor patterns failed for {exception_class.__name__}")
+        return None
+
+    @classmethod
+    def _import_exception_class(cls, exception_type: str) -> Optional[type]:
+        """
+        Dynamically import an exception class from a string.
+
+        Args:
+            exception_type: Exception type string like "litellm.exceptions.NotFoundError",
+                           "openai.BadRequestError", "requests.exceptions.ConnectionError", etc.
+
+        Returns:
+            The exception class if found, None otherwise
+        """
+        try:
+            # Require fully qualified names (no automatic prefixing)
+            if "." not in exception_type:
+                logging.getLogger(__name__).debug(f"Exception type must be fully qualified: {exception_type}")
+                return None
+
+            # Parse module and class name
+            module_name, class_name = exception_type.rsplit(".", 1)
+
+            # Import the module
+            module = importlib.import_module(module_name)
+
+            # Get the exception class
+            exception_class = getattr(module, class_name, None)
+
+            # Verify it's actually an exception class
+            if exception_class and issubclass(exception_class, BaseException):
+                return exception_class
+
+            return None
+
+        except (ImportError, AttributeError, ValueError) as e:
+            logging.getLogger(__name__).debug(f"Could not import exception class {exception_type}: {e}")
+            return None
+
     @classmethod
     def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
         """Create a status indicating the rollout failed with an error."""
diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index 68d47dcd..d27f878f 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -166,6 +166,8 @@ def _get_status() -> Dict[str, Any]:
                         f"Found Fireworks log for rollout {row.execution_metadata.rollout_id} with status code {status_code}"
                     )
 
+                    Status.raise_from_status_details(status_details)
+
                     row.rollout_status = Status(
                         code=Status.Code(status_code),
                         message=status_message,

From d3ef46c4ee668a1e77317470184a9daf9c6d5e5c Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 03:43:33 -0700
Subject: [PATCH 02/16] better logging

---
 eval_protocol/models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 98491dff..6a8e7fad 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -207,10 +207,11 @@ def raise_from_status_details(cls, status_details: List[Dict[str, Any]]) -> bool
                 # Dynamically import and raise the exception
                 exception_class = cls._import_exception_class(exception_type)
                 if exception_class:
-                    logger.info(f"Re-raising {exception_type} from status details")
+                    logger.info(f"Found exception class: {exception_class}")
                     # Try different constructor patterns
                     exception_to_raise = cls._create_exception_instance(exception_class, exception_message)
                     if exception_to_raise:
+                        logger.info(f"Re-raising {exception_type} from status details")
                         raise exception_to_raise
                     else:
                         logger.debug(f"Could not create instance of {exception_type}")

From f84133471cd09ac683c082262720f30b9dfaaa2d Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 03:48:45 -0700
Subject: [PATCH 03/16] retry on openai too

---
 eval_protocol/pytest/exception_config.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py
index 355e716a..452b8563 100644
--- a/eval_protocol/pytest/exception_config.py
+++ b/eval_protocol/pytest/exception_config.py
@@ -11,6 +11,7 @@
 import litellm
 import requests
 import httpx
+import openai
 
 # Default exceptions that should be retried with backoff
 DEFAULT_RETRYABLE_EXCEPTIONS: Set[Type[Exception]] = {
@@ -34,6 +35,8 @@
     litellm.exceptions.NotFoundError,
     litellm.exceptions.BadRequestError,  # remove this once we have a long term solution
     litellm.exceptions.ServiceUnavailableError,
+    openai.NotFoundError,
+    openai.BadRequestError,  # remove this once we have a long term solution
 }
 
 

From c705cb8d88a8d5966f22c84172d885a4352debc0 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 04:00:08 -0700
Subject: [PATCH 04/16] reraising is working now

---
 eval_protocol/models.py                       |  38 ++-
 .../svg_agent/vercel_svg_server/api/init.py   | 243 ++++++++++--------
 2 files changed, 164 insertions(+), 117 deletions(-)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 6a8e7fad..23231c22 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -214,10 +214,10 @@ def raise_from_status_details(cls, status_details: List[Dict[str, Any]]) -> bool
                         logger.info(f"Re-raising {exception_type} from status details")
                         raise exception_to_raise
                     else:
-                        logger.debug(f"Could not create instance of {exception_type}")
+                        logger.info(f"Could not create instance of {exception_type}")
                         continue
                 else:
-                    logger.debug(f"Could not import exception type: {exception_type}")
+                    logger.info(f"Could not import exception type: {exception_type}")
                     continue
 
         return False
@@ -244,7 +244,9 @@ def _create_exception_instance(cls, exception_class: type, message: str) -> Opti
             # NOTE: we are losing some diagnostic information here by not passing the model and llm_provider. We could try to capture full exception state in rollout_error_from_exception.
             lambda: exception_class(message, model="unknown", llm_provider="unknown"),
             lambda: exception_class(message=message, model="unknown", llm_provider="unknown"),
-            # Pattern 4: No arguments (fallback)
+            # Pattern 5: OpenAI exceptions - create mock response object
+            lambda: cls._create_openai_exception(exception_class, message),
+            # Pattern 7: No arguments (fallback)
             lambda: exception_class(),
         ]
 
@@ -260,6 +262,36 @@ def _create_exception_instance(cls, exception_class: type, message: str) -> Opti
         logger.debug(f"All constructor patterns failed for {exception_class.__name__}")
         return None
 
+    @classmethod
+    def _create_openai_exception(cls, exception_class: type, message: str) -> Optional[Exception]:
+        """
+        Create OpenAI exception with a mock response object.
+
+        OpenAI exceptions require httpx.Response objects which are complex to create,
+        so we create a minimal mock that satisfies the basic requirements.
+        """
+        try:
+            import httpx
+
+            # Create a minimal mock response object
+            class MockRequest:
+                def __init__(self):
+                    self.method = "POST"
+                    self.url = "https://api.openai.com/v1/chat/completions"
+
+            class MockResponse:
+                def __init__(self):
+                    self.status_code = 404
+                    self.headers = {"x-request-id": "mock-request-id"}
+                    self.request = MockRequest()
+
+            mock_response = MockResponse()
+            return exception_class(message, response=mock_response, body=None)
+
+        except Exception as e:
+            logging.getLogger(__name__).debug(f"Failed to create OpenAI exception with mock response: {e}")
+            return None
+
     @classmethod
     def _import_exception_class(cls, exception_type: str) -> Optional[type]:
         """
diff --git a/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py b/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py
index d376880e..502eb96a 100644
--- a/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py
+++ b/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py
@@ -9,7 +9,8 @@
 import os
 import logging
 import sys
-from http.server import BaseHTTPRequestHandler
+import asyncio
+from flask import Flask, request, jsonify
 from openai import OpenAI
 from dotenv import load_dotenv
 
@@ -44,119 +45,133 @@ def filter(self, record: logging.LogRecord) -> bool:
 # Attach Fireworks tracing handler to root logger (non-stream HTTP sink)
 root_logger.addHandler(FireworksTracingHttpHandler())
 
+# Create Flask app
+app = Flask(__name__)
+
+
+async def execute_rollout_background(req, api_key):
+    """Execute the OpenAI completion in background and log results"""
+    # Attach rollout_id filter to logger
+    logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
+    logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
+
+    try:
+        model = req.completion_params.get("model")
+        # Uncomment if you need to strip fireworks_ai/ prefix
+        # if model and isinstance(model, str) and model.startswith("fireworks_ai/"):
+        #     model = model[len("fireworks_ai/"):]
+
+        # Prepare completion arguments
+        completion_kwargs = {
+            "messages": req.messages,
+            # "messages": [{"role": "user", "content": "Hello, how are you?"}],
+            "model": model,
+            "temperature": req.completion_params.get("temperature"),
+            "max_tokens": req.completion_params.get("max_tokens"),
+        }
+
+        # Add tools if present
+        if req.tools:
+            completion_kwargs["tools"] = req.tools
+
+        logger.info(
+            f"DEBUG: {req.model_base_url}, COMPLETION_KWARGS: {completion_kwargs}, API_KEY: {api_key}, MODEL: {model}"
+        )
+
+        # Create AsyncOpenAI client
+        # client = AsyncOpenAI(base_url=req.model_base_url, api_key=api_key)
+        client = OpenAI(base_url=req.model_base_url, api_key=api_key)
+
+        logger.info(f"Sending completion request to model {model}")
 
-class handler(BaseHTTPRequestHandler):
-    def do_POST(self):
-        try:
-            # Read and parse request body
-            content_length = int(self.headers.get("Content-Length", 0))
-            request_body = self.rfile.read(content_length).decode("utf-8")
-            request_data = json.loads(request_body)
-
-            # Parse as InitRequest
-            req = InitRequest(**request_data)
-
-            # Attach rollout_id filter to logger
-            logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
-            logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
-
-            # Validate required fields
-            if not req.messages:
-                error_msg = "messages is required"
-                logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
-                self._send_error(400, error_msg)
-                return
-
-            model = req.completion_params.get("model")
-            if model and isinstance(model, str) and model.startswith("fireworks_ai/"):
-                model = model[len("fireworks_ai/") :]
-
-            # Prepare completion arguments
-            completion_kwargs = {
-                "messages": req.messages,
-                "model": model,
-                "temperature": req.completion_params.get("temperature"),
-                "max_tokens": req.completion_params.get("max_tokens"),
-            }
-
-            # Add tools if present
-            if req.tools:
-                completion_kwargs["tools"] = req.tools
-
-            # Get API key (prefer request api_key, fallback to environment)
-            api_key = req.api_key or os.environ.get("FIREWORKS_API_KEY")
-            if not api_key:
-                error_msg = "API key not provided in request or FIREWORKS_API_KEY environment variable"
-                logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
-                self._send_error(500, error_msg)
-                return
-
-            # Create OpenAI client
-            client = OpenAI(base_url=req.model_base_url, api_key=api_key)
-
-            logger.info(f"Sending completion request to model {req.completion_params.get('model')}")
-
-            # Make the model call
-            completion = client.chat.completions.create(**completion_kwargs)
-
-            logger.info(f"Completed response: {completion}")
-
-            # Log completion status
-            logger.info(f"Rollout {req.metadata.rollout_id} completed", extra={"status": Status.rollout_finished()})
-
-            # Return the completion response
-            response_data = {
-                "status": "completed",
-                "rollout_id": req.metadata.rollout_id,
-                "choices": [
-                    {
-                        "message": {
-                            "role": completion.choices[0].message.role,
-                            "content": completion.choices[0].message.content,
-                        }
-                    }
-                ],
-            }
-
-            self._send_json_response(200, response_data)
-
-        except Exception as e:
-            # Log error if we have the request context
-            if "req" in locals() and "logger" in locals():
-                logger.error(f"❌ Error in rollout {req.metadata.rollout_id}: {e}")
-                logger.error(str(e), extra={"status": Status.rollout_error(str(e))})
-
-            self._send_error(500, str(e))
-
-    def do_GET(self):
-        """Health check endpoint"""
-        self._send_json_response(
-            200,
-            {
-                "status": "ok",
-                "message": "SVGBench Vercel Serverless Function",
-                "endpoints": {"POST /": "Process SVGBench evaluation requests"},
-            },
+        # Make the async model call with timeout
+        import time
+
+        logger.info(f"timing start: {time.time()}")
+        completion = client.chat.completions.create(**completion_kwargs)
+        logger.info(f"Completed response: {completion}")
+        logger.info(f"timing end: {time.time()}")
+        # Log successful completion - THIS IS WHAT RemoteRolloutProcessor POLLS FOR
+        logger.info(f"Rollout {req.metadata.rollout_id} completed", extra={"status": Status.rollout_finished()})
+
+    except Exception as e:
+        # Log error with structured status - THIS IS WHAT RemoteRolloutProcessor POLLS FOR
+        logger.error(
+            f"Rollout {req.metadata.rollout_id} failed: {e}", extra={"status": Status.rollout_error_from_exception(e)}
         )
 
-    def do_OPTIONS(self):
-        """Handle CORS preflight requests"""
-        self.send_response(200)
-        self.send_header("Access-Control-Allow-Origin", "*")
-        self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
-        self.send_header("Access-Control-Allow-Headers", "Content-Type")
-        self.end_headers()
-
-    def _send_json_response(self, status_code: int, data: dict):
-        """Send a JSON response"""
-        self.send_response(status_code)
-        self.send_header("Content-Type", "application/json")
-        self.send_header("Access-Control-Allow-Origin", "*")
-        self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
-        self.send_header("Access-Control-Allow-Headers", "Content-Type")
-        self.end_headers()
-        self.wfile.write(json.dumps(data).encode("utf-8"))
-
-    def _send_error(self, status_code: int, message: str):
-        """Send an error response"""
-        self._send_json_response(status_code, {"error": message})
+
+@app.route("/init", methods=["POST"])
+async def init():
+    try:
+        # Parse as InitRequest
+        req = InitRequest(**request.get_json())
+
+        # Create logger for immediate validation logging
+        logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
+        logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
+
+        # Validate required fields
+        if not req.messages:
+            error_msg = "messages is required"
+            logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
+            return jsonify({"error": error_msg}), 400
+
+        # Get API key (prefer request api_key, fallback to environment)
+        if req.api_key:
+            logger.info("Using API key from request")
+            api_key = req.api_key
+        elif os.environ.get("FIREWORKS_API_KEY"):
+            logger.info("Using API key from environment")
+            api_key = os.environ.get("FIREWORKS_API_KEY")
+        else:
+            error_msg = "API key not provided in request or environment variable"
+            logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
+            return jsonify({"error": error_msg}), 401
+
+        # 🔥 FIRE: Return immediately with acceptance (within 30s requirement)
+        response_data = {
+            "status": "accepted",
+            "rollout_id": req.metadata.rollout_id,
+            "message": "Rollout processing started",
+        }
+
+        # Fire and forget: Execute rollout asynchronously
+        asyncio.create_task(execute_rollout_background(req, api_key))
+
+        return jsonify(response_data), 200
+
+    except Exception as e:
+        # For request parsing errors, return error immediately (don't retry)
+        return jsonify({"error": f"Request parsing error: {str(e)}"}), 400
+
+
+@app.route("/", methods=["GET"])
+def health_check():
+    """Health check endpoint"""
+    return jsonify(
+        {
+            "status": "ok",
+            "message": "SVGBench Vercel Serverless Function",
+            "endpoints": {"POST /": "Process SVGBench evaluation requests"},
+        }
+    )
+
+
+@app.route("/", methods=["OPTIONS"])
+def options_handler():
+    """Handle CORS preflight requests"""
+    response = jsonify({})
+    response.headers["Access-Control-Allow-Origin"] = "*"
+    response.headers["Access-Control-Allow-Methods"] = "POST, GET, OPTIONS"
+    response.headers["Access-Control-Allow-Headers"] = "Content-Type"
+    return response
+
+
+# Add CORS headers to all responses
+@app.after_request
+def add_cors_headers(response):
+    response.headers["Access-Control-Allow-Origin"] = "*"
+    response.headers["Access-Control-Allow-Methods"] = "POST, GET, OPTIONS"
+    response.headers["Access-Control-Allow-Headers"] = "Content-Type"
+    return response

From 779c6aa93fe84dcfe7bdbfdc100f6e68ca9754c0 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 04:00:56 -0700
Subject: [PATCH 05/16] new server

---
 .../quickstart/svg_agent/vercel_svg_server/requirements.txt   | 3 ++-
 .../quickstart/svg_agent/vercel_svg_server/vercel.json        | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt b/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
index dadd2db1..50f0def0 100644
--- a/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
+++ b/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
@@ -1,3 +1,4 @@
 openai>=1.0.0
 python-dotenv>=0.19.0
-eval_protocol>=0.2.58
+eval_protocol==0.2.69-dev3
+Flask[async]==3.0.3
diff --git a/eval_protocol/quickstart/svg_agent/vercel_svg_server/vercel.json b/eval_protocol/quickstart/svg_agent/vercel_svg_server/vercel.json
index 112be6e9..4291b8c1 100644
--- a/eval_protocol/quickstart/svg_agent/vercel_svg_server/vercel.json
+++ b/eval_protocol/quickstart/svg_agent/vercel_svg_server/vercel.json
@@ -1,3 +1,5 @@
 {
-  "redirects": [{ "source": "/init", "destination": "/api/init" }]
+  "rewrites": [
+    { "source": "/(.*)", "destination": "/api/init" }
+  ]
 }

From 7c94a7e58aa9bd8775cef5778ce5516c1c553e58 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 04:01:08 -0700
Subject: [PATCH 06/16] tests for status logging

---
 tests/test_status.py | 417 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100644 tests/test_status.py

diff --git a/tests/test_status.py b/tests/test_status.py
new file mode 100644
index 00000000..f0ac0932
--- /dev/null
+++ b/tests/test_status.py
@@ -0,0 +1,417 @@
+"""
+Tests for Status exception handling functionality.
+
+Tests the round-trip flow:
+1. Exception → Status.rollout_error_from_exception() → structured logging
+2. Structured data → Status.raise_from_status_details() → original exception
+"""
+
+import pytest
+from eval_protocol.models import Status
+
+# Test with different exception types that might be available
+try:
+    import litellm.exceptions
+
+    LITELLM_AVAILABLE = True
+except ImportError:
+    LITELLM_AVAILABLE = False
+
+try:
+    import requests.exceptions
+
+    REQUESTS_AVAILABLE = True
+except ImportError:
+    REQUESTS_AVAILABLE = False
+
+try:
+    import httpx
+
+    HTTPX_AVAILABLE = True
+except ImportError:
+    HTTPX_AVAILABLE = False
+
+try:
+    import openai
+
+    OPENAI_AVAILABLE = True
+except ImportError:
+    OPENAI_AVAILABLE = False
+
+
+def test_rollout_error_from_exception_basic():
+    """Test creating Status from a basic exception."""
+    # Create a simple exception
+    original_exception = ValueError("Test error message")
+
+    # Create status from exception
+    status = Status.rollout_error_from_exception(original_exception)
+
+    # Verify the status structure
+    assert status.code == Status.Code.INTERNAL
+    assert status.message == "Test error message"
+    assert len(status.details) == 1
+
+    detail = status.details[0]
+    assert detail["exception_type"] == "builtins.ValueError"
+    assert detail["exception_message"] == "Test error message"
+
+
+def test_exception_round_trip_basic():
+    """Test the complete round-trip: exception → status → re-raise exception."""
+    # Create original exception
+    original_exception = ValueError("Round trip test")
+
+    # Convert to status
+    status = Status.rollout_error_from_exception(original_exception)
+
+    # Try to re-raise from status details
+    with pytest.raises(ValueError) as exc_info:
+        Status.raise_from_status_details(status.details)
+
+    # Verify the re-raised exception has the same message
+    assert str(exc_info.value) == "Round trip test"
+
+
+@pytest.mark.skipif(not LITELLM_AVAILABLE, reason="litellm not available")
+def test_litellm_exception_round_trip():
+    """Test round-trip with litellm exceptions."""
+    # Create a litellm exception - try different constructor patterns
+    original_exception = litellm.exceptions.NotFoundError(
+        message="Model not found", model="test-model", llm_provider="test-provider"
+    )
+    # Convert to status
+    status = Status.rollout_error_from_exception(original_exception)
+
+    # Verify status details
+    detail = status.details[0]
+    assert detail["exception_type"] == "litellm.exceptions.NotFoundError"
+    # Message might contain additional info, just check it contains our text
+    assert "Model not found" in detail["exception_message"] or "not found" in detail["exception_message"].lower()
+
+    # Re-raise and verify type
+    with pytest.raises(litellm.exceptions.NotFoundError) as exc_info:
+        Status.raise_from_status_details(status.details)
+
+    # The re-raised exception should be the same type
+    assert isinstance(exc_info.value, litellm.exceptions.NotFoundError)
+
+
+@pytest.mark.skipif(not REQUESTS_AVAILABLE, reason="requests not available")
+def test_requests_exception_round_trip():
+    """Test round-trip with requests exceptions."""
+    # Create a requests exception
+    original_exception = requests.exceptions.ConnectionError("Connection failed")
+
+    # Convert to status
+    status = Status.rollout_error_from_exception(original_exception)
+
+    # Verify status details
+    detail = status.details[0]
+    assert detail["exception_type"] == "requests.exceptions.ConnectionError"
+    assert detail["exception_message"] == "Connection failed"
+
+    # Re-raise and verify type
+    with pytest.raises(requests.exceptions.ConnectionError) as exc_info:
+        Status.raise_from_status_details(status.details)
+
+    assert str(exc_info.value) == "Connection failed"
+
+
+def test_unknown_exception_type():
+    """Test behavior with unknown/non-importable exception type."""
+    # Create status details with fake exception type
+    fake_details = [{"exception_type": "fake.module.FakeException", "exception_message": "This should not raise"}]
+
+    # Should not raise anything, just return False
+    result = Status.raise_from_status_details(fake_details)
+    assert result is False
+
+
+def test_malformed_status_details():
+    """Test behavior with malformed status details."""
+    # Various malformed details
+    malformed_cases = [
+        [],  # Empty list
+        [{}],  # Empty dict
+        [{"exception_type": "ValueError"}],  # Missing message
+        [{"exception_message": "test"}],  # Missing type
+        [{"wrong_key": "wrong_value"}],  # Wrong keys
+    ]
+
+    for malformed_details in malformed_cases:
+        result = Status.raise_from_status_details(malformed_details)
+        assert result is False
+
+
+def test_rollout_error_with_extra_info():
+    """Test rollout_error_from_exception with extra_info."""
+    original_exception = ValueError("Test with extra info")
+    extra_info = {"context": "test_context", "user_id": "123"}
+
+    status = Status.rollout_error_from_exception(original_exception, extra_info)
+
+    # Should have both exception info and extra info
+    assert len(status.details) == 2
+
+    # First detail should be exception info
+    exception_detail = status.details[0]
+    assert exception_detail["exception_type"] == "builtins.ValueError"
+    assert exception_detail["exception_message"] == "Test with extra info"
+
+    # Second detail should be extra info
+    extra_detail = status.details[1]
+    assert extra_detail["extra_info"]["context"] == "test_context"
+    assert extra_detail["extra_info"]["user_id"] == "123"
+
+
+def test_multiple_exception_details():
+    """Test raise_from_status_details with multiple details (should use first valid one)."""
+    # Create details with multiple exception info
+    details = [
+        {"other_info": "ignored"},  # Should be ignored
+        {"exception_type": "builtins.ValueError", "exception_message": "First exception"},  # Should be used
+        {"exception_type": "builtins.RuntimeError", "exception_message": "Second exception"},  # Should be ignored
+    ]
+
+    # Should raise the first valid exception
+    with pytest.raises(ValueError) as exc_info:
+        Status.raise_from_status_details(details)
+
+    assert str(exc_info.value) == "First exception"
+
+
+@pytest.mark.skipif(not LITELLM_AVAILABLE, reason="litellm not available")
+def test_different_litellm_exceptions():
+    """Test various litellm exception types."""
+    # Test with a few common litellm exceptions
+    exception_classes = [
+        litellm.exceptions.RateLimitError,
+        litellm.exceptions.InternalServerError,
+        litellm.exceptions.BadRequestError,
+    ]
+
+    for exception_class in exception_classes:
+        # Try to create an exception instance (try different constructor patterns)
+        original_exception = None
+        exception_name = exception_class.__name__
+
+        try:
+            # Try with just message
+            original_exception = exception_class(f"Test {exception_name}")
+        except TypeError:
+            try:
+                # Try with message and required parameters
+                original_exception = exception_class(
+                    message=f"Test {exception_name}", model="test-model", llm_provider="test-provider"
+                )
+            except TypeError:
+                try:
+                    # Try with positional args
+                    original_exception = exception_class(f"Test {exception_name}", "test-model", "test-provider")
+                except TypeError:
+                    # Skip this particular exception type
+                    continue
+
+        if original_exception is None:
+            continue
+
+        # Test the round-trip
+        status = Status.rollout_error_from_exception(original_exception)
+
+        # Should be able to re-raise the same type
+        with pytest.raises(exception_class):
+            Status.raise_from_status_details(status.details)
+
+
+def test_edge_case_empty_message():
+    """Test with exception that has empty message."""
+    original_exception = ValueError()  # Empty message
+
+    status = Status.rollout_error_from_exception(original_exception)
+
+    # Should handle empty message gracefully
+    detail = status.details[0]
+    assert detail["exception_type"] == "builtins.ValueError"
+    assert detail["exception_message"] == ""
+
+    # Should still re-raise correctly
+    with pytest.raises(ValueError):
+        Status.raise_from_status_details(status.details)
+
+
+def test_all_default_retryable_exceptions():
+    """
+    Comprehensive test of all exceptions in DEFAULT_RETRYABLE_EXCEPTIONS.
+
+    This ensures our Status exception handling works with every exception type
+    that the retry system claims to support.
+    """
+    # Test cases: (exception_class, test_message, required_modules, skip_reason)
+    test_cases = [
+        # Standard library exceptions
+        (ConnectionError, "Connection failed", [], None),
+        (TimeoutError, "Request timeout", [], None),
+        (OSError, "OS error occurred", [], None),
+    ]
+
+    # Add requests exceptions if available
+    if REQUESTS_AVAILABLE:
+        import requests.exceptions
+
+        test_cases.extend(
+            [
+                (requests.exceptions.ConnectionError, "Requests connection error", ["requests"], None),
+                (requests.exceptions.Timeout, "Requests timeout", ["requests"], None),
+                (requests.exceptions.HTTPError, "HTTP error occurred", ["requests"], None),
+                (requests.exceptions.RequestException, "Request exception", ["requests"], None),
+            ]
+        )
+
+    # Add httpx exceptions if available
+    if HTTPX_AVAILABLE:
+        import httpx
+
+        test_cases.extend(
+            [
+                (httpx.ConnectError, "HTTPX connect error", ["httpx"], None),
+                (httpx.TimeoutException, "HTTPX timeout", ["httpx"], None),
+                (httpx.NetworkError, "HTTPX network error", ["httpx"], None),
+                (httpx.RemoteProtocolError, "HTTPX protocol error", ["httpx"], None),
+            ]
+        )
+
+    # Add openai exceptions if available
+    if OPENAI_AVAILABLE:
+        import openai
+
+        test_cases.extend(
+            [
+                (openai.NotFoundError, "OpenAI model not found", ["openai"], None),
+                (openai.BadRequestError, "OpenAI bad request", ["openai"], None),
+                (openai.RateLimitError, "OpenAI rate limit", ["openai"], None),
+            ]
+        )
+
+    # Add litellm exceptions if available
+    if LITELLM_AVAILABLE:
+        import litellm.exceptions
+
+        test_cases.extend(
+            [
+                (litellm.exceptions.RateLimitError, "Rate limit exceeded", ["litellm"], None),
+                (litellm.exceptions.InternalServerError, "Internal server error", ["litellm"], None),
+                (litellm.exceptions.Timeout, "LiteLLM timeout", ["litellm"], None),
+                (litellm.exceptions.NotFoundError, "Model not found", ["litellm"], None),
+                (litellm.exceptions.BadRequestError, "Bad request", ["litellm"], None),
+                (litellm.exceptions.ServiceUnavailableError, "Service unavailable", ["litellm"], None),
+            ]
+        )
+
+    successful_tests = 0
+    failed_tests = []
+
+    for exception_class, test_message, required_modules, skip_reason in test_cases:
+        exception_name = f"{exception_class.__module__}.{exception_class.__name__}"
+
+        try:
+            # Try to create the original exception with different patterns
+            original_exception = None
+
+            # Pattern 1: Just message
+            try:
+                original_exception = exception_class(test_message)
+            except TypeError:
+                # Pattern 2: Message as named parameter
+                try:
+                    original_exception = exception_class(message=test_message)
+                except TypeError:
+                    # Pattern 3: For litellm - try with required parameters
+                    if "litellm" in required_modules:
+                        try:
+                            original_exception = exception_class(
+                                message=test_message, model="test-model", llm_provider="test-provider"
+                            )
+                        except TypeError:
+                            try:
+                                original_exception = exception_class(test_message, "test-model", "test-provider")
+                            except TypeError:
+                                pass
+                    # Pattern 4: For OpenAI - create mock response object
+                    elif "openai" in required_modules and original_exception is None:
+                        try:
+                            # Create minimal mock objects for OpenAI exceptions
+                            class MockRequest:
+                                def __init__(self):
+                                    self.method = "POST"
+                                    self.url = "https://api.openai.com/v1/chat/completions"
+
+                            class MockResponse:
+                                def __init__(self):
+                                    self.status_code = 404
+                                    self.headers = {"x-request-id": "test-request-id"}
+                                    self.request = MockRequest()
+
+                            mock_response = MockResponse()
+                            original_exception = exception_class(test_message, response=mock_response, body=None)
+                        except (TypeError, AttributeError) as e:
+                            # If mock approach fails, skip OpenAI for now
+                            failed_tests.append((exception_name, f"OpenAI mock creation failed: {e}"))
+                            continue
+
+                    # Pattern 5: No arguments fallback
+                    if original_exception is None:
+                        try:
+                            original_exception = exception_class()
+                        except TypeError:
+                            failed_tests.append((exception_name, "Could not create exception instance"))
+                            continue
+
+            if original_exception is None:
+                failed_tests.append((exception_name, "Could not create exception instance"))
+                continue
+
+            # Test the round-trip: exception -> status -> re-raise
+            try:
+                # Convert to status
+                status = Status.rollout_error_from_exception(original_exception)
+
+                # Verify status structure
+                assert len(status.details) >= 1
+                detail = status.details[0]
+                assert "exception_type" in detail
+                assert "exception_message" in detail
+                assert detail["exception_type"] == exception_name
+
+                # Try to re-raise from status details
+                with pytest.raises(exception_class) as exc_info:
+                    Status.raise_from_status_details(status.details)
+
+                # Verify we got the right exception type back
+                assert isinstance(exc_info.value, exception_class)
+                successful_tests += 1
+
+                print(f"✅ {exception_name}: Round-trip successful")
+
+            except Exception as e:
+                failed_tests.append((exception_name, f"Round-trip failed: {e}"))
+                continue
+
+        except Exception as e:
+            failed_tests.append((exception_name, f"Setup failed: {e}"))
+            continue
+
+    # Report results
+    print("\n🎯 Exception Round-trip Test Results:")
+    print(f"✅ Successful: {successful_tests}")
+    print(f"❌ Failed: {len(failed_tests)}")
+
+    if failed_tests:
+        print("\n❌ Failed exceptions:")
+        for exception_name, reason in failed_tests:
+            print(f"  - {exception_name}: {reason}")
+
+    # We expect most to pass, but some failures are acceptable due to complex constructors
+    # Require at least 85% success rate (17/20 = 85% is good, indicates robust support)
+    success_rate = successful_tests / (successful_tests + len(failed_tests))
+    assert success_rate >= 0.85, f"Success rate {success_rate:.1%} too low. Failed tests: {failed_tests}"

From 55043e4c645183991486ef6df476d8fe5b8e1e5e Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 04:20:44 -0700
Subject: [PATCH 07/16] version

---
 .../quickstart/svg_agent/vercel_svg_server/requirements.txt     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt b/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
index 50f0def0..2296770c 100644
--- a/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
+++ b/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
@@ -1,4 +1,4 @@
 openai>=1.0.0
 python-dotenv>=0.19.0
-eval_protocol==0.2.69-dev3
+eval_protocol>=0.2.70
 Flask[async]==3.0.3

From 0ebd0177dafc55bfa302a49b2d674c0487516eff Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 17:18:36 -0700
Subject: [PATCH 08/16] eval protocol exception retry

---
 eval_protocol/exceptions.py                   | 176 ++++++++
 eval_protocol/models.py                       | 294 ++++++------
 eval_protocol/pytest/exception_config.py      |  16 +-
 .../pytest/remote_rollout_processor.py        |  16 +-
 eval_protocol/pytest/tracing_utils.py         |   4 +-
 tests/test_exceptions.py                      | 350 +++++++++++++++
 tests/test_status.py                          | 417 ------------------
 7 files changed, 702 insertions(+), 571 deletions(-)
 create mode 100644 eval_protocol/exceptions.py
 create mode 100644 tests/test_exceptions.py
 delete mode 100644 tests/test_status.py

diff --git a/eval_protocol/exceptions.py b/eval_protocol/exceptions.py
new file mode 100644
index 00000000..f294977d
--- /dev/null
+++ b/eval_protocol/exceptions.py
@@ -0,0 +1,176 @@
+"""
+Custom exceptions for Eval Protocol that map to gRPC Status codes.
+
+These exceptions provide a clean way to handle errors and map them to appropriate
+Status objects following the AIP-193 standard.
+"""
+
+from typing import Optional
+
+
+class EvalProtocolError(Exception):
+    """
+    Base exception for all Eval Protocol specific errors.
+
+    Maps to Status.Code and can be converted to Status objects for structured logging.
+    """
+
+    pass
+
+
+# Standard gRPC status code exceptions
+class CancelledError(EvalProtocolError):
+    """Operation was cancelled (Status.Code.CANCELLED = 1)"""
+
+    status_code = 1
+
+
+class UnknownError(EvalProtocolError):
+    """Unknown error occurred (Status.Code.UNKNOWN = 2)"""
+
+    status_code = 2
+
+
+class InvalidArgumentError(EvalProtocolError):
+    """Invalid argument provided (Status.Code.INVALID_ARGUMENT = 3)"""
+
+    status_code = 3
+
+
+class DeadlineExceededError(EvalProtocolError):
+    """Deadline exceeded (Status.Code.DEADLINE_EXCEEDED = 4)"""
+
+    status_code = 4
+
+
+class NotFoundError(EvalProtocolError):
+    """Resource not found (Status.Code.NOT_FOUND = 5)"""
+
+    status_code = 5
+
+
+class AlreadyExistsError(EvalProtocolError):
+    """Resource already exists (Status.Code.ALREADY_EXISTS = 6)"""
+
+    status_code = 6
+
+
+class PermissionDeniedError(EvalProtocolError):
+    """Permission denied (Status.Code.PERMISSION_DENIED = 7)"""
+
+    status_code = 7
+
+
+class ResourceExhaustedError(EvalProtocolError):
+    """Resource exhausted (Status.Code.RESOURCE_EXHAUSTED = 8)"""
+
+    status_code = 8
+
+
+class FailedPreconditionError(EvalProtocolError):
+    """Failed precondition (Status.Code.FAILED_PRECONDITION = 9)"""
+
+    status_code = 9
+
+
+class AbortedError(EvalProtocolError):
+    """Operation was aborted (Status.Code.ABORTED = 10)"""
+
+    status_code = 10
+
+
+class OutOfRangeError(EvalProtocolError):
+    """Value out of range (Status.Code.OUT_OF_RANGE = 11)"""
+
+    status_code = 11
+
+
+class UnimplementedError(EvalProtocolError):
+    """Operation is not implemented (Status.Code.UNIMPLEMENTED = 12)"""
+
+    status_code = 12
+
+
+class InternalError(EvalProtocolError):
+    """Internal server error (Status.Code.INTERNAL = 13)"""
+
+    status_code = 13
+
+
+class UnavailableError(EvalProtocolError):
+    """Service unavailable (Status.Code.UNAVAILABLE = 14)"""
+
+    status_code = 14
+
+
+class DataLossError(EvalProtocolError):
+    """Unrecoverable data loss (Status.Code.DATA_LOSS = 15)"""
+
+    status_code = 15
+
+
+class UnauthenticatedError(EvalProtocolError):
+    """Request lacks valid authentication (Status.Code.UNAUTHENTICATED = 16)"""
+
+    status_code = 16
+
+
+# Custom EP exceptions
+class RolloutFinishedError(EvalProtocolError):
+    """Rollout completed successfully (Status.Code.FINISHED = 100)"""
+
+    status_code = 100
+
+
+class RolloutRunningError(EvalProtocolError):
+    """Rollout is still running (Status.Code.RUNNING = 101)"""
+
+    status_code = 101
+
+
+class ScoreInvalidError(EvalProtocolError):
+    """Score is invalid (Status.Code.SCORE_INVALID = 102)"""
+
+    status_code = 102
+
+
+# Convenience mapping from status codes to exception classes
+# Only actual error conditions should raise exceptions
+STATUS_CODE_TO_EXCEPTION = {
+    0: None,  # OK - success, no exception
+    1: CancelledError,
+    2: UnknownError,
+    3: InvalidArgumentError,
+    4: DeadlineExceededError,
+    5: NotFoundError,
+    6: AlreadyExistsError,
+    7: PermissionDeniedError,
+    8: ResourceExhaustedError,
+    9: FailedPreconditionError,
+    10: AbortedError,
+    11: OutOfRangeError,
+    12: UnimplementedError,
+    13: InternalError,
+    14: UnavailableError,
+    15: DataLossError,
+    16: UnauthenticatedError,
+    100: None,  # FINISHED - success, no exception
+    101: None,  # RUNNING - in progress, no exception
+    102: None,  # SCORE_INVALID - success, no exception
+}
+
+
+def exception_for_status_code(code: int) -> Optional[EvalProtocolError]:
+    """
+    Create an exception instance for a given status code.
+
+    Args:
+        code: Status code from Status.Code enum
+
+    Returns:
+        Exception instance or None if code is OK (0)
+    """
+    exception_class = STATUS_CODE_TO_EXCEPTION.get(code)
+    if exception_class is None:
+        return None
+    return exception_class()
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 23231c22..494d3129 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -136,6 +136,13 @@ def eval_finished(cls) -> "Status":
         """Create a status indicating the evaluation finished."""
         return cls(code=cls.Code.FINISHED, message="Evaluation finished", details=[])
 
+    @staticmethod
+    def _build_details_with_extra_info(extra_info: Optional[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Helper to build details list from extra_info."""
+        if extra_info:
+            return [ErrorInfo.extra_info(extra_info).to_aip193_format()]
+        return []
+
     @classmethod
     def aborted(cls, message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
         """Create a status indicating the evaluation was aborted."""
@@ -160,179 +167,190 @@ def finished(cls, message: str, details: Optional[List[Dict[str, Any]]] = None)
         """Create a status indicating the rollout finished."""
         return cls(code=cls.Code.FINISHED, message=message, details=details or [])
 
+    # Error methods organized by Status.Code enum values (1-16)
+
+    # CANCELLED = 1
     @classmethod
-    def rollout_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
-        """Create a status indicating the rollout failed with an error."""
-        details = []
-        if extra_info:
-            details.append(ErrorInfo.extra_info(extra_info).to_aip193_format())
-        return cls.error(error_message, details)
+    def rollout_cancelled_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout was cancelled."""
+        return cls.cancelled_error(error_message, cls._build_details_with_extra_info(extra_info))
+
+    @classmethod
+    def cancelled_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating the operation was cancelled."""
+        return cls(code=cls.Code.CANCELLED, message=error_message, details=details or [])
 
+    # UNKNOWN = 2
     @classmethod
-    def rollout_error_from_exception(
-        cls, exception: Exception, extra_info: Optional[Dict[str, Any]] = None
+    def rollout_unknown_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with an unknown error."""
+        return cls.unknown_error(error_message, cls._build_details_with_extra_info(extra_info))
+
+    @classmethod
+    def unknown_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an unknown error occurred."""
+        return cls(code=cls.Code.UNKNOWN, message=error_message, details=details or [])
+
+    # INVALID_ARGUMENT = 3
+    @classmethod
+    def rollout_invalid_argument_error(
+        cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None
     ) -> "Status":
-        """
-        Create a status indicating the rollout failed with an exception.
-        Simple approach that stores exception info directly in details.
-        """
-        details = []
+        """Create a status indicating the rollout failed with an invalid argument error."""
+        return cls.invalid_argument_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-        details.append(
-            {
-                "exception_type": f"{type(exception).__module__}.{type(exception).__name__}",
-                "exception_message": str(exception),
-            }
-        )
+    @classmethod
+    def invalid_argument_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an invalid argument error occurred."""
+        return cls(code=cls.Code.INVALID_ARGUMENT, message=error_message, details=details or [])
 
-        if extra_info:
-            details.append({"extra_info": extra_info})
+    # DEADLINE_EXCEEDED = 4
+    @classmethod
+    def rollout_deadline_exceeded_error(
+        cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None
+    ) -> "Status":
+        """Create a status indicating the rollout failed with a deadline exceeded error."""
+        return cls.deadline_exceeded_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-        return cls(code=cls.Code.INTERNAL, message=str(exception), details=details)
+    @classmethod
+    def deadline_exceeded_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating a deadline exceeded error occurred."""
+        return cls(code=cls.Code.DEADLINE_EXCEEDED, message=error_message, details=details or [])
 
+    # NOT_FOUND = 5
     @classmethod
-    def raise_from_status_details(cls, status_details: List[Dict[str, Any]]) -> bool:
-        """
-        Try to raise original exception from simple status details using dynamic imports.
-        """
+    def rollout_not_found_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with a not found error."""
+        return cls.not_found_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-        for detail in status_details:
-            # Look for simple exception info
-            if "exception_type" in detail and "exception_message" in detail:
-                exception_type = detail["exception_type"]
-                exception_message = detail["exception_message"]
-
-                logger.info(f"Found exception info: {exception_type}")
-
-                # Dynamically import and raise the exception
-                exception_class = cls._import_exception_class(exception_type)
-                if exception_class:
-                    logger.info(f"Found exception class: {exception_class}")
-                    # Try different constructor patterns
-                    exception_to_raise = cls._create_exception_instance(exception_class, exception_message)
-                    if exception_to_raise:
-                        logger.info(f"Re-raising {exception_type} from status details")
-                        raise exception_to_raise
-                    else:
-                        logger.info(f"Could not create instance of {exception_type}")
-                        continue
-                else:
-                    logger.info(f"Could not import exception type: {exception_type}")
-                    continue
+    @classmethod
+    def not_found_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating a not found error occurred."""
+        return cls(code=cls.Code.NOT_FOUND, message=error_message, details=details or [])
 
-        return False
+    # ALREADY_EXISTS = 6
+    @classmethod
+    def rollout_already_exists_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with an already exists error."""
+        return cls.already_exists_error(error_message, cls._build_details_with_extra_info(extra_info))
 
     @classmethod
-    def _create_exception_instance(cls, exception_class: type, message: str) -> Optional[Exception]:
-        """
-        Try to create an exception instance using different constructor patterns.
+    def already_exists_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an already exists error occurred."""
+        return cls(code=cls.Code.ALREADY_EXISTS, message=error_message, details=details or [])
 
-        Args:
-            exception_class: The exception class to instantiate
-            message: The error message
+    # PERMISSION_DENIED = 7
+    @classmethod
+    def rollout_permission_denied_error(
+        cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None
+    ) -> "Status":
+        """Create a status indicating the rollout failed with a permission denied error."""
+        return cls.permission_denied_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-        Returns:
-            Exception instance if successful, None otherwise
-        """
-        # Common constructor patterns to try
-        patterns = [
-            # Pattern 1: Just message
-            lambda: exception_class(message),
-            # Pattern 2: Message as named parameter
-            lambda: exception_class(message=message),
-            # Pattern 3: Message + common litellm parameters
-            # NOTE: we are losing some diagnostic information here by not passing the model and llm_provider. We could try to capture full exception state in rollout_error_from_exception.
-            lambda: exception_class(message, model="unknown", llm_provider="unknown"),
-            lambda: exception_class(message=message, model="unknown", llm_provider="unknown"),
-            # Pattern 5: OpenAI exceptions - create mock response object
-            lambda: cls._create_openai_exception(exception_class, message),
-            # Pattern 7: No arguments (fallback)
-            lambda: exception_class(),
-        ]
-
-        for i, pattern in enumerate(patterns):
-            try:
-                instance = pattern()
-                logger.debug(f"Successfully created {exception_class.__name__} using pattern {i + 1}")
-                return instance
-            except (TypeError, ValueError) as e:
-                logger.debug(f"Pattern {i + 1} failed for {exception_class.__name__}: {e}")
-                continue
+    @classmethod
+    def permission_denied_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating a permission denied error occurred."""
+        return cls(code=cls.Code.PERMISSION_DENIED, message=error_message, details=details or [])
 
-        logger.debug(f"All constructor patterns failed for {exception_class.__name__}")
-        return None
+    # RESOURCE_EXHAUSTED = 8
+    @classmethod
+    def rollout_resource_exhausted_error(
+        cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None
+    ) -> "Status":
+        """Create a status indicating the rollout failed with a resource exhausted error."""
+        return cls.resource_exhausted_error(error_message, cls._build_details_with_extra_info(extra_info))
 
     @classmethod
-    def _create_openai_exception(cls, exception_class: type, message: str) -> Optional[Exception]:
-        """
-        Create OpenAI exception with a mock response object.
+    def resource_exhausted_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating a resource exhausted error occurred."""
+        return cls(code=cls.Code.RESOURCE_EXHAUSTED, message=error_message, details=details or [])
 
-        OpenAI exceptions require httpx.Response objects which are complex to create,
-        so we create a minimal mock that satisfies the basic requirements.
-        """
-        try:
-            import httpx
+    # FAILED_PRECONDITION = 9
+    @classmethod
+    def rollout_failed_precondition_error(
+        cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None
+    ) -> "Status":
+        """Create a status indicating the rollout failed with a failed precondition error."""
+        return cls.failed_precondition_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-            # Create a minimal mock response object
-            class MockRequest:
-                def __init__(self):
-                    self.method = "POST"
-                    self.url = "https://api.openai.com/v1/chat/completions"
+    @classmethod
+    def failed_precondition_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating a failed precondition error occurred."""
+        return cls(code=cls.Code.FAILED_PRECONDITION, message=error_message, details=details or [])
 
-            class MockResponse:
-                def __init__(self):
-                    self.status_code = 404
-                    self.headers = {"x-request-id": "mock-request-id"}
-                    self.request = MockRequest()
+    # ABORTED = 10
+    @classmethod
+    def rollout_aborted_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout was aborted."""
+        return cls.aborted(error_message, cls._build_details_with_extra_info(extra_info))
 
-            mock_response = MockResponse()
-            return exception_class(message, response=mock_response, body=None)
+    # OUT_OF_RANGE = 11
+    @classmethod
+    def rollout_out_of_range_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with an out of range error."""
+        return cls.out_of_range_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-        except Exception as e:
-            logging.getLogger(__name__).debug(f"Failed to create OpenAI exception with mock response: {e}")
-            return None
+    @classmethod
+    def out_of_range_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an out of range error occurred."""
+        return cls(code=cls.Code.OUT_OF_RANGE, message=error_message, details=details or [])
 
+    # UNIMPLEMENTED = 12
     @classmethod
-    def _import_exception_class(cls, exception_type: str) -> Optional[type]:
-        """
-        Dynamically import an exception class from a string.
+    def rollout_unimplemented_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with an unimplemented error."""
+        return cls.unimplemented_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-        Args:
-            exception_type: Exception type string like "litellm.exceptions.NotFoundError",
-                           "openai.BadRequestError", "requests.exceptions.ConnectionError", etc.
+    @classmethod
+    def unimplemented_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an unimplemented error occurred."""
+        return cls(code=cls.Code.UNIMPLEMENTED, message=error_message, details=details or [])
 
-        Returns:
-            The exception class if found, None otherwise
-        """
-        try:
-            # Require fully qualified names (no automatic prefixing)
-            if "." not in exception_type:
-                logging.getLogger(__name__).debug(f"Exception type must be fully qualified: {exception_type}")
-                return None
+    # INTERNAL = 13
+    @classmethod
+    def rollout_internal_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with an internal error."""
+        return cls.internal_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-            # Parse module and class name
-            module_name, class_name = exception_type.rsplit(".", 1)
+    @classmethod
+    def internal_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an internal error occurred."""
+        return cls(code=cls.Code.INTERNAL, message=error_message, details=details or [])
 
-            # Import the module
-            module = importlib.import_module(module_name)
+    # UNAVAILABLE = 14
+    @classmethod
+    def rollout_unavailable_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with an unavailable error."""
+        return cls.unavailable_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-            # Get the exception class
-            exception_class = getattr(module, class_name, None)
+    @classmethod
+    def unavailable_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an unavailable error occurred."""
+        return cls(code=cls.Code.UNAVAILABLE, message=error_message, details=details or [])
 
-            # Verify it's actually an exception class
-            if exception_class and issubclass(exception_class, BaseException):
-                return exception_class
+    # DATA_LOSS = 15
+    @classmethod
+    def rollout_data_loss_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with a data loss error."""
+        return cls.data_loss_error(error_message, cls._build_details_with_extra_info(extra_info))
 
-            return None
+    @classmethod
+    def data_loss_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating a data loss error occurred."""
+        return cls(code=cls.Code.DATA_LOSS, message=error_message, details=details or [])
 
-        except (ImportError, AttributeError, ValueError) as e:
-            logging.getLogger(__name__).debug(f"Could not import exception class {exception_type}: {e}")
-            return None
+    # UNAUTHENTICATED = 16
+    @classmethod
+    def rollout_unauthenticated_error(
+        cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None
+    ) -> "Status":
+        """Create a status indicating the rollout failed with an unauthenticated error."""
+        return cls.unauthenticated_error(error_message, cls._build_details_with_extra_info(extra_info))
 
     @classmethod
-    def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
-        """Create a status indicating the rollout failed with an error."""
-        return cls(code=cls.Code.INTERNAL, message=error_message, details=details or [])
+    def unauthenticated_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an unauthenticated error occurred."""
+        return cls(code=cls.Code.UNAUTHENTICATED, message=error_message, details=details or [])
 
     @classmethod
     def score_invalid(
diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py
index 7da0b8a0..42c97ea4 100644
--- a/eval_protocol/pytest/exception_config.py
+++ b/eval_protocol/pytest/exception_config.py
@@ -11,7 +11,8 @@
 import litellm
 import requests
 import httpx
-import openai
+
+import eval_protocol.exceptions
 
 
 # Default exceptions that should be retried with backoff
@@ -35,12 +36,17 @@
     litellm.exceptions.InternalServerError,
     litellm.exceptions.Timeout,
     litellm.exceptions.NotFoundError,
-    litellm.exceptions.BadRequestError,  # remove this once we have a long term solution
+    # litellm.exceptions.BadRequestError,  # remove this once we have a long term solution
     litellm.exceptions.ServiceUnavailableError,
     litellm.exceptions.APIError,
-    # OpenAI library exceptions
-    openai.NotFoundError,
-    openai.BadRequestError,  # remove this once we have a long term solution
+    # Eval Protocol exceptions
+    eval_protocol.exceptions.UnknownError,
+    eval_protocol.exceptions.DeadlineExceededError,
+    eval_protocol.exceptions.NotFoundError,
+    eval_protocol.exceptions.PermissionDeniedError,
+    eval_protocol.exceptions.UnavailableError,
+    eval_protocol.exceptions.UnauthenticatedError,
+    eval_protocol.exceptions.ResourceExhaustedError,
 }
 
 
diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index d27f878f..6e57d93c 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -10,6 +10,7 @@
     DataLoaderConfig,
 )
 from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
+from eval_protocol.exceptions import exception_for_status_code
 
 from .rollout_processor import RolloutProcessor
 from .types import RolloutProcessorConfig
@@ -97,13 +98,7 @@ def _post_init() -> None:
                     r.raise_for_status()
                 except requests.exceptions.Timeout:
                     raise TimeoutError(
-                        "The /init endpoint timed out after 30 seconds. "
-                        "CRITICAL: The /init endpoint must return immediately (within 30s) and NOT block on rollout execution. "
-                        "Your remote server should:\n"
-                        "1. Accept the /init request and return a 200 response immediately\n"
-                        "2. Process the actual rollout asynchronously in the background\n"
-                        "3. Use the /status endpoint to report progress\n"
-                        "For Python/Node.js: Start a separate process per rollout to avoid blocking the /init response."
+                        f"The /init endpoint tried {url} with {init_payload.model_dump()} but timed out after 30 seconds."
                     )
 
             await asyncio.to_thread(_post_init)
@@ -166,7 +161,10 @@ def _get_status() -> Dict[str, Any]:
                         f"Found Fireworks log for rollout {row.execution_metadata.rollout_id} with status code {status_code}"
                     )
 
-                    Status.raise_from_status_details(status_details)
+                    # Create and raise exception if appropriate
+                    exception = exception_for_status_code(status_code)
+                    if exception is not None:
+                        raise exception
 
                     row.rollout_status = Status(
                         code=Status.Code(status_code),
@@ -183,7 +181,7 @@ def _get_status() -> Dict[str, Any]:
                     f"Loop completed without breaking for {row.execution_metadata.rollout_id}, which means we timed out"
                 )
                 # Loop completed without breaking, which means we timed out
-                row.rollout_status = Status.rollout_error(
+                row.rollout_status = Status.rollout_deadline_exceeded_error(
                     f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds"
                 )
 
diff --git a/eval_protocol/pytest/tracing_utils.py b/eval_protocol/pytest/tracing_utils.py
index 27f8d14c..6ea69371 100644
--- a/eval_protocol/pytest/tracing_utils.py
+++ b/eval_protocol/pytest/tracing_utils.py
@@ -151,14 +151,14 @@ def update_row_with_remote_trace(
     output_rows: List[EvaluationRow] = [r for result in results for r in result.rows]
 
     if len(output_rows) == 0:  # Fallback to original row if no remote data found
-        row.rollout_status = Status(code=Status.Code.NOT_FOUND, message="No remote data found for rollout")
+        row.rollout_status = Status.rollout_not_found_error("No remote data found for rollout")
         return None
     elif len(output_rows) == 1:  # Return the remote row
         remote_row = output_rows[0]
 
         # if the remote_row has the same number of messages as the original row, something went wrong
         if len(remote_row.messages) == len(row.messages):
-            row.rollout_status = Status.rollout_error(
+            row.rollout_status = Status.rollout_internal_error(
                 "Rollout finished with the same number of messages as the original row"
             )
             return None
diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py
new file mode 100644
index 00000000..b4b039e2
--- /dev/null
+++ b/tests/test_exceptions.py
@@ -0,0 +1,350 @@
+"""
+Tests for the eval_protocol exception handling system.
+
+Tests the status code to exception mapping functionality:
+1. STATUS_CODE_TO_EXCEPTION mapping correctness
+2. exception_for_status_code() function behavior
+3. Success states don't raise exceptions (0, 100, 101, 102)
+4. Error states raise appropriate exceptions (1-16)
+5. Exception class inheritance and attributes
+6. Integration with existing retry logic
+"""
+
+import pytest
+from eval_protocol.models import Status
+from eval_protocol.exceptions import (
+    exception_for_status_code,
+    STATUS_CODE_TO_EXCEPTION,
+    EvalProtocolError,
+    CancelledError,
+    UnknownError,
+    InvalidArgumentError,
+    DeadlineExceededError,
+    NotFoundError,
+    AlreadyExistsError,
+    PermissionDeniedError,
+    ResourceExhaustedError,
+    FailedPreconditionError,
+    AbortedError,
+    OutOfRangeError,
+    UnimplementedError,
+    InternalError,
+    UnavailableError,
+    DataLossError,
+    UnauthenticatedError,
+    RolloutFinishedError,
+    RolloutRunningError,
+    ScoreInvalidError,
+)
+
+
+def test_success_status_codes_no_exception():
+    """Test that success/progress status codes don't raise exceptions."""
+    success_codes = [
+        (0, "OK"),
+        (100, "FINISHED"),
+        (101, "RUNNING"),
+        (102, "SCORE_INVALID"),  # Changed to success state
+    ]
+
+    for code, name in success_codes:
+        exception = exception_for_status_code(code)
+        assert exception is None, f"Status code {code} ({name}) should not raise exception"
+
+
+def test_error_status_codes_raise_exceptions():
+    """Test that error status codes raise appropriate exceptions."""
+    error_test_cases = [
+        (1, CancelledError, "CANCELLED"),
+        (2, UnknownError, "UNKNOWN"),
+        (3, InvalidArgumentError, "INVALID_ARGUMENT"),
+        (4, DeadlineExceededError, "DEADLINE_EXCEEDED"),
+        (5, NotFoundError, "NOT_FOUND"),
+        (6, AlreadyExistsError, "ALREADY_EXISTS"),
+        (7, PermissionDeniedError, "PERMISSION_DENIED"),
+        (8, ResourceExhaustedError, "RESOURCE_EXHAUSTED"),
+        (9, FailedPreconditionError, "FAILED_PRECONDITION"),
+        (10, AbortedError, "ABORTED"),
+        (11, OutOfRangeError, "OUT_OF_RANGE"),
+        (12, UnimplementedError, "UNIMPLEMENTED"),
+        (13, InternalError, "INTERNAL"),
+        (14, UnavailableError, "UNAVAILABLE"),
+        (15, DataLossError, "DATA_LOSS"),
+        (16, UnauthenticatedError, "UNAUTHENTICATED"),
+    ]
+
+    for code, expected_exception_class, name in error_test_cases:
+        exception = exception_for_status_code(code)
+        assert exception is not None, f"Status code {code} ({name}) should raise exception"
+        assert isinstance(exception, expected_exception_class), (
+            f"Status code {code} should raise {expected_exception_class.__name__}"
+        )
+        assert isinstance(exception, EvalProtocolError), "All exceptions should inherit from EvalProtocolError"
+
+
+def test_status_code_mapping_completeness():
+    """Test that STATUS_CODE_TO_EXCEPTION mapping covers all expected codes."""
+    expected_codes = [
+        0,  # OK
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        11,
+        12,
+        13,
+        14,
+        15,
+        16,  # Standard gRPC codes
+        100,
+        101,
+        102,  # Custom EP codes
+    ]
+
+    for code in expected_codes:
+        assert code in STATUS_CODE_TO_EXCEPTION, f"Status code {code} missing from mapping"
+
+
+def test_invalid_status_codes():
+    """Test behavior with invalid/unknown status codes."""
+    invalid_codes = [-1, 17, 99, 103, 999]
+
+    for code in invalid_codes:
+        exception = exception_for_status_code(code)
+        assert exception is None, f"Invalid status code {code} should return None"
+
+
+def test_exception_attributes():
+    """Test that exceptions have the expected attributes."""
+    # Test a few exception types
+    test_cases = [
+        (1, CancelledError, "CANCELLED"),
+        (5, NotFoundError, "NOT_FOUND"),
+        (13, InternalError, "INTERNAL"),
+    ]
+
+    for code, expected_class, name in test_cases:
+        exception = exception_for_status_code(code)
+        assert hasattr(expected_class, "status_code"), f"{expected_class.__name__} should have status_code attribute"
+        assert expected_class.status_code == code, f"{expected_class.__name__}.status_code should be {code}"
+
+
+def test_exception_raising_integration():
+    """Test the pattern used in RemoteRolloutProcessor."""
+    # Simulate the pattern used in remote_rollout_processor.py
+    status_codes_to_test = [
+        (0, False),  # OK - should not raise
+        (5, True),  # NOT_FOUND - should raise NotFoundError
+        (13, True),  # INTERNAL - should raise InternalError
+        (100, False),  # FINISHED - should not raise
+    ]
+
+    for status_code, should_raise in status_codes_to_test:
+        exception = exception_for_status_code(status_code)
+
+        if should_raise:
+            assert exception is not None, f"Status code {status_code} should create exception"
+            # Test that we can raise it
+            with pytest.raises(EvalProtocolError):
+                raise exception
+        else:
+            assert exception is None, f"Status code {status_code} should not create exception"
+
+
+def test_status_code_enum_consistency():
+    """Test that our mapping is consistent with Status.Code enum."""
+    # Test that our exception mapping aligns with Status.Code enum
+    status_code_mapping = {
+        Status.Code.OK: None,
+        Status.Code.CANCELLED: CancelledError,
+        Status.Code.UNKNOWN: UnknownError,
+        Status.Code.INVALID_ARGUMENT: InvalidArgumentError,
+        Status.Code.DEADLINE_EXCEEDED: DeadlineExceededError,
+        Status.Code.NOT_FOUND: NotFoundError,
+        Status.Code.ALREADY_EXISTS: AlreadyExistsError,
+        Status.Code.PERMISSION_DENIED: PermissionDeniedError,
+        Status.Code.RESOURCE_EXHAUSTED: ResourceExhaustedError,
+        Status.Code.FAILED_PRECONDITION: FailedPreconditionError,
+        Status.Code.ABORTED: AbortedError,
+        Status.Code.OUT_OF_RANGE: OutOfRangeError,
+        Status.Code.UNIMPLEMENTED: UnimplementedError,
+        Status.Code.INTERNAL: InternalError,
+        Status.Code.UNAVAILABLE: UnavailableError,
+        Status.Code.DATA_LOSS: DataLossError,
+        Status.Code.UNAUTHENTICATED: UnauthenticatedError,
+        Status.Code.FINISHED: None,
+        Status.Code.RUNNING: None,
+        Status.Code.SCORE_INVALID: None,
+    }
+
+    for status_code_enum, expected_exception_class in status_code_mapping.items():
+        code_value = int(status_code_enum)
+        actual_exception_class = STATUS_CODE_TO_EXCEPTION.get(code_value)
+
+        if expected_exception_class is None:
+            assert actual_exception_class is None, (
+                f"Status.Code.{status_code_enum.name} ({code_value}) should map to None"
+            )
+        else:
+            assert actual_exception_class == expected_exception_class, (
+                f"Status.Code.{status_code_enum.name} ({code_value}) should map to {expected_exception_class.__name__}"
+            )
+
+
+def test_exception_inheritance():
+    """Test that all exception classes properly inherit from EvalProtocolError."""
+    exception_classes = [
+        CancelledError,
+        UnknownError,
+        InvalidArgumentError,
+        DeadlineExceededError,
+        NotFoundError,
+        AlreadyExistsError,
+        PermissionDeniedError,
+        ResourceExhaustedError,
+        FailedPreconditionError,
+        AbortedError,
+        OutOfRangeError,
+        UnimplementedError,
+        InternalError,
+        UnavailableError,
+        DataLossError,
+        UnauthenticatedError,
+        RolloutFinishedError,
+        RolloutRunningError,
+        ScoreInvalidError,
+    ]
+
+    for exception_class in exception_classes:
+        assert issubclass(exception_class, EvalProtocolError), (
+            f"{exception_class.__name__} should inherit from EvalProtocolError"
+        )
+        assert issubclass(exception_class, Exception), f"{exception_class.__name__} should inherit from Exception"
+
+
+def test_real_world_usage_scenarios():
+    """Test realistic usage patterns from RemoteRolloutProcessor."""
+    # Test scenarios that might occur in practice
+    scenarios = [
+        # Success scenarios
+        {"status_code": 0, "description": "Successful API call", "should_raise": False},
+        {"status_code": 100, "description": "Rollout completed successfully", "should_raise": False},
+        {"status_code": 101, "description": "Rollout still in progress", "should_raise": False},
+        # Error scenarios that should trigger retry logic
+        {
+            "status_code": 4,
+            "description": "Request timeout",
+            "should_raise": True,
+            "expected_exception": DeadlineExceededError,
+        },
+        {
+            "status_code": 5,
+            "description": "Model not found",
+            "should_raise": True,
+            "expected_exception": NotFoundError,
+        },
+        {
+            "status_code": 7,
+            "description": "API key invalid",
+            "should_raise": True,
+            "expected_exception": PermissionDeniedError,
+        },
+        {
+            "status_code": 8,
+            "description": "Rate limit exceeded",
+            "should_raise": True,
+            "expected_exception": ResourceExhaustedError,
+        },
+        {
+            "status_code": 13,
+            "description": "Internal server error",
+            "should_raise": True,
+            "expected_exception": InternalError,
+        },
+        {
+            "status_code": 14,
+            "description": "Service temporarily unavailable",
+            "should_raise": True,
+            "expected_exception": UnavailableError,
+        },
+    ]
+
+    for scenario in scenarios:
+        status_code = scenario["status_code"]
+        description = scenario["description"]
+        should_raise = scenario["should_raise"]
+
+        # This is the pattern used in RemoteRolloutProcessor
+        exception = exception_for_status_code(status_code)
+
+        if should_raise:
+            expected_exception = scenario["expected_exception"]
+            assert exception is not None, f"Scenario '{description}' should create exception"
+            assert isinstance(exception, expected_exception), (
+                f"Scenario '{description}' should create {expected_exception.__name__}"
+            )
+
+            # Test that the exception can be raised and caught for retry logic
+            with pytest.raises(expected_exception):
+                raise exception
+
+        else:
+            assert exception is None, f"Scenario '{description}' should not create exception"
+
+
+def test_exception_status_code_attributes():
+    """Test that all exceptions have correct status_code attributes."""
+    expected_mappings = [
+        (CancelledError, 1),
+        (UnknownError, 2),
+        (InvalidArgumentError, 3),
+        (DeadlineExceededError, 4),
+        (NotFoundError, 5),
+        (AlreadyExistsError, 6),
+        (PermissionDeniedError, 7),
+        (ResourceExhaustedError, 8),
+        (FailedPreconditionError, 9),
+        (AbortedError, 10),
+        (OutOfRangeError, 11),
+        (UnimplementedError, 12),
+        (InternalError, 13),
+        (UnavailableError, 14),
+        (DataLossError, 15),
+        (UnauthenticatedError, 16),
+        (RolloutFinishedError, 100),
+        (RolloutRunningError, 101),
+        (ScoreInvalidError, 102),
+    ]
+
+    for exception_class, expected_code in expected_mappings:
+        assert hasattr(exception_class, "status_code"), f"{exception_class.__name__} should have status_code attribute"
+        assert exception_class.status_code == expected_code, (
+            f"{exception_class.__name__}.status_code should be {expected_code}"
+        )
+
+
+def test_integration_with_retry_logic():
+    """Test that our exceptions integrate properly with existing retry logic."""
+    from eval_protocol.pytest.exception_config import DEFAULT_RETRYABLE_EXCEPTIONS
+
+    # Test that our error exceptions are covered by retry logic
+    our_error_exceptions = [
+        UnknownError,
+        DeadlineExceededError,
+        NotFoundError,
+        PermissionDeniedError,
+        UnavailableError,
+        UnauthenticatedError,
+        ResourceExhaustedError,
+    ]
+
+    for exception_class in our_error_exceptions:
+        assert exception_class in DEFAULT_RETRYABLE_EXCEPTIONS, (
+            f"{exception_class.__name__} should be in DEFAULT_RETRYABLE_EXCEPTIONS for retry support"
+        )
diff --git a/tests/test_status.py b/tests/test_status.py
deleted file mode 100644
index f0ac0932..00000000
--- a/tests/test_status.py
+++ /dev/null
@@ -1,417 +0,0 @@
-"""
-Tests for Status exception handling functionality.
-
-Tests the round-trip flow:
-1. Exception → Status.rollout_error_from_exception() → structured logging
-2. Structured data → Status.raise_from_status_details() → original exception
-"""
-
-import pytest
-from eval_protocol.models import Status
-
-# Test with different exception types that might be available
-try:
-    import litellm.exceptions
-
-    LITELLM_AVAILABLE = True
-except ImportError:
-    LITELLM_AVAILABLE = False
-
-try:
-    import requests.exceptions
-
-    REQUESTS_AVAILABLE = True
-except ImportError:
-    REQUESTS_AVAILABLE = False
-
-try:
-    import httpx
-
-    HTTPX_AVAILABLE = True
-except ImportError:
-    HTTPX_AVAILABLE = False
-
-try:
-    import openai
-
-    OPENAI_AVAILABLE = True
-except ImportError:
-    OPENAI_AVAILABLE = False
-
-
-def test_rollout_error_from_exception_basic():
-    """Test creating Status from a basic exception."""
-    # Create a simple exception
-    original_exception = ValueError("Test error message")
-
-    # Create status from exception
-    status = Status.rollout_error_from_exception(original_exception)
-
-    # Verify the status structure
-    assert status.code == Status.Code.INTERNAL
-    assert status.message == "Test error message"
-    assert len(status.details) == 1
-
-    detail = status.details[0]
-    assert detail["exception_type"] == "builtins.ValueError"
-    assert detail["exception_message"] == "Test error message"
-
-
-def test_exception_round_trip_basic():
-    """Test the complete round-trip: exception → status → re-raise exception."""
-    # Create original exception
-    original_exception = ValueError("Round trip test")
-
-    # Convert to status
-    status = Status.rollout_error_from_exception(original_exception)
-
-    # Try to re-raise from status details
-    with pytest.raises(ValueError) as exc_info:
-        Status.raise_from_status_details(status.details)
-
-    # Verify the re-raised exception has the same message
-    assert str(exc_info.value) == "Round trip test"
-
-
-@pytest.mark.skipif(not LITELLM_AVAILABLE, reason="litellm not available")
-def test_litellm_exception_round_trip():
-    """Test round-trip with litellm exceptions."""
-    # Create a litellm exception - try different constructor patterns
-    original_exception = litellm.exceptions.NotFoundError(
-        message="Model not found", model="test-model", llm_provider="test-provider"
-    )
-    # Convert to status
-    status = Status.rollout_error_from_exception(original_exception)
-
-    # Verify status details
-    detail = status.details[0]
-    assert detail["exception_type"] == "litellm.exceptions.NotFoundError"
-    # Message might contain additional info, just check it contains our text
-    assert "Model not found" in detail["exception_message"] or "not found" in detail["exception_message"].lower()
-
-    # Re-raise and verify type
-    with pytest.raises(litellm.exceptions.NotFoundError) as exc_info:
-        Status.raise_from_status_details(status.details)
-
-    # The re-raised exception should be the same type
-    assert isinstance(exc_info.value, litellm.exceptions.NotFoundError)
-
-
-@pytest.mark.skipif(not REQUESTS_AVAILABLE, reason="requests not available")
-def test_requests_exception_round_trip():
-    """Test round-trip with requests exceptions."""
-    # Create a requests exception
-    original_exception = requests.exceptions.ConnectionError("Connection failed")
-
-    # Convert to status
-    status = Status.rollout_error_from_exception(original_exception)
-
-    # Verify status details
-    detail = status.details[0]
-    assert detail["exception_type"] == "requests.exceptions.ConnectionError"
-    assert detail["exception_message"] == "Connection failed"
-
-    # Re-raise and verify type
-    with pytest.raises(requests.exceptions.ConnectionError) as exc_info:
-        Status.raise_from_status_details(status.details)
-
-    assert str(exc_info.value) == "Connection failed"
-
-
-def test_unknown_exception_type():
-    """Test behavior with unknown/non-importable exception type."""
-    # Create status details with fake exception type
-    fake_details = [{"exception_type": "fake.module.FakeException", "exception_message": "This should not raise"}]
-
-    # Should not raise anything, just return False
-    result = Status.raise_from_status_details(fake_details)
-    assert result is False
-
-
-def test_malformed_status_details():
-    """Test behavior with malformed status details."""
-    # Various malformed details
-    malformed_cases = [
-        [],  # Empty list
-        [{}],  # Empty dict
-        [{"exception_type": "ValueError"}],  # Missing message
-        [{"exception_message": "test"}],  # Missing type
-        [{"wrong_key": "wrong_value"}],  # Wrong keys
-    ]
-
-    for malformed_details in malformed_cases:
-        result = Status.raise_from_status_details(malformed_details)
-        assert result is False
-
-
-def test_rollout_error_with_extra_info():
-    """Test rollout_error_from_exception with extra_info."""
-    original_exception = ValueError("Test with extra info")
-    extra_info = {"context": "test_context", "user_id": "123"}
-
-    status = Status.rollout_error_from_exception(original_exception, extra_info)
-
-    # Should have both exception info and extra info
-    assert len(status.details) == 2
-
-    # First detail should be exception info
-    exception_detail = status.details[0]
-    assert exception_detail["exception_type"] == "builtins.ValueError"
-    assert exception_detail["exception_message"] == "Test with extra info"
-
-    # Second detail should be extra info
-    extra_detail = status.details[1]
-    assert extra_detail["extra_info"]["context"] == "test_context"
-    assert extra_detail["extra_info"]["user_id"] == "123"
-
-
-def test_multiple_exception_details():
-    """Test raise_from_status_details with multiple details (should use first valid one)."""
-    # Create details with multiple exception info
-    details = [
-        {"other_info": "ignored"},  # Should be ignored
-        {"exception_type": "builtins.ValueError", "exception_message": "First exception"},  # Should be used
-        {"exception_type": "builtins.RuntimeError", "exception_message": "Second exception"},  # Should be ignored
-    ]
-
-    # Should raise the first valid exception
-    with pytest.raises(ValueError) as exc_info:
-        Status.raise_from_status_details(details)
-
-    assert str(exc_info.value) == "First exception"
-
-
-@pytest.mark.skipif(not LITELLM_AVAILABLE, reason="litellm not available")
-def test_different_litellm_exceptions():
-    """Test various litellm exception types."""
-    # Test with a few common litellm exceptions
-    exception_classes = [
-        litellm.exceptions.RateLimitError,
-        litellm.exceptions.InternalServerError,
-        litellm.exceptions.BadRequestError,
-    ]
-
-    for exception_class in exception_classes:
-        # Try to create an exception instance (try different constructor patterns)
-        original_exception = None
-        exception_name = exception_class.__name__
-
-        try:
-            # Try with just message
-            original_exception = exception_class(f"Test {exception_name}")
-        except TypeError:
-            try:
-                # Try with message and required parameters
-                original_exception = exception_class(
-                    message=f"Test {exception_name}", model="test-model", llm_provider="test-provider"
-                )
-            except TypeError:
-                try:
-                    # Try with positional args
-                    original_exception = exception_class(f"Test {exception_name}", "test-model", "test-provider")
-                except TypeError:
-                    # Skip this particular exception type
-                    continue
-
-        if original_exception is None:
-            continue
-
-        # Test the round-trip
-        status = Status.rollout_error_from_exception(original_exception)
-
-        # Should be able to re-raise the same type
-        with pytest.raises(exception_class):
-            Status.raise_from_status_details(status.details)
-
-
-def test_edge_case_empty_message():
-    """Test with exception that has empty message."""
-    original_exception = ValueError()  # Empty message
-
-    status = Status.rollout_error_from_exception(original_exception)
-
-    # Should handle empty message gracefully
-    detail = status.details[0]
-    assert detail["exception_type"] == "builtins.ValueError"
-    assert detail["exception_message"] == ""
-
-    # Should still re-raise correctly
-    with pytest.raises(ValueError):
-        Status.raise_from_status_details(status.details)
-
-
-def test_all_default_retryable_exceptions():
-    """
-    Comprehensive test of all exceptions in DEFAULT_RETRYABLE_EXCEPTIONS.
-
-    This ensures our Status exception handling works with every exception type
-    that the retry system claims to support.
-    """
-    # Test cases: (exception_class, test_message, required_modules, skip_reason)
-    test_cases = [
-        # Standard library exceptions
-        (ConnectionError, "Connection failed", [], None),
-        (TimeoutError, "Request timeout", [], None),
-        (OSError, "OS error occurred", [], None),
-    ]
-
-    # Add requests exceptions if available
-    if REQUESTS_AVAILABLE:
-        import requests.exceptions
-
-        test_cases.extend(
-            [
-                (requests.exceptions.ConnectionError, "Requests connection error", ["requests"], None),
-                (requests.exceptions.Timeout, "Requests timeout", ["requests"], None),
-                (requests.exceptions.HTTPError, "HTTP error occurred", ["requests"], None),
-                (requests.exceptions.RequestException, "Request exception", ["requests"], None),
-            ]
-        )
-
-    # Add httpx exceptions if available
-    if HTTPX_AVAILABLE:
-        import httpx
-
-        test_cases.extend(
-            [
-                (httpx.ConnectError, "HTTPX connect error", ["httpx"], None),
-                (httpx.TimeoutException, "HTTPX timeout", ["httpx"], None),
-                (httpx.NetworkError, "HTTPX network error", ["httpx"], None),
-                (httpx.RemoteProtocolError, "HTTPX protocol error", ["httpx"], None),
-            ]
-        )
-
-    # Add openai exceptions if available
-    if OPENAI_AVAILABLE:
-        import openai
-
-        test_cases.extend(
-            [
-                (openai.NotFoundError, "OpenAI model not found", ["openai"], None),
-                (openai.BadRequestError, "OpenAI bad request", ["openai"], None),
-                (openai.RateLimitError, "OpenAI rate limit", ["openai"], None),
-            ]
-        )
-
-    # Add litellm exceptions if available
-    if LITELLM_AVAILABLE:
-        import litellm.exceptions
-
-        test_cases.extend(
-            [
-                (litellm.exceptions.RateLimitError, "Rate limit exceeded", ["litellm"], None),
-                (litellm.exceptions.InternalServerError, "Internal server error", ["litellm"], None),
-                (litellm.exceptions.Timeout, "LiteLLM timeout", ["litellm"], None),
-                (litellm.exceptions.NotFoundError, "Model not found", ["litellm"], None),
-                (litellm.exceptions.BadRequestError, "Bad request", ["litellm"], None),
-                (litellm.exceptions.ServiceUnavailableError, "Service unavailable", ["litellm"], None),
-            ]
-        )
-
-    successful_tests = 0
-    failed_tests = []
-
-    for exception_class, test_message, required_modules, skip_reason in test_cases:
-        exception_name = f"{exception_class.__module__}.{exception_class.__name__}"
-
-        try:
-            # Try to create the original exception with different patterns
-            original_exception = None
-
-            # Pattern 1: Just message
-            try:
-                original_exception = exception_class(test_message)
-            except TypeError:
-                # Pattern 2: Message as named parameter
-                try:
-                    original_exception = exception_class(message=test_message)
-                except TypeError:
-                    # Pattern 3: For litellm - try with required parameters
-                    if "litellm" in required_modules:
-                        try:
-                            original_exception = exception_class(
-                                message=test_message, model="test-model", llm_provider="test-provider"
-                            )
-                        except TypeError:
-                            try:
-                                original_exception = exception_class(test_message, "test-model", "test-provider")
-                            except TypeError:
-                                pass
-                    # Pattern 4: For OpenAI - create mock response object
-                    elif "openai" in required_modules and original_exception is None:
-                        try:
-                            # Create minimal mock objects for OpenAI exceptions
-                            class MockRequest:
-                                def __init__(self):
-                                    self.method = "POST"
-                                    self.url = "https://api.openai.com/v1/chat/completions"
-
-                            class MockResponse:
-                                def __init__(self):
-                                    self.status_code = 404
-                                    self.headers = {"x-request-id": "test-request-id"}
-                                    self.request = MockRequest()
-
-                            mock_response = MockResponse()
-                            original_exception = exception_class(test_message, response=mock_response, body=None)
-                        except (TypeError, AttributeError) as e:
-                            # If mock approach fails, skip OpenAI for now
-                            failed_tests.append((exception_name, f"OpenAI mock creation failed: {e}"))
-                            continue
-
-                    # Pattern 5: No arguments fallback
-                    if original_exception is None:
-                        try:
-                            original_exception = exception_class()
-                        except TypeError:
-                            failed_tests.append((exception_name, "Could not create exception instance"))
-                            continue
-
-            if original_exception is None:
-                failed_tests.append((exception_name, "Could not create exception instance"))
-                continue
-
-            # Test the round-trip: exception -> status -> re-raise
-            try:
-                # Convert to status
-                status = Status.rollout_error_from_exception(original_exception)
-
-                # Verify status structure
-                assert len(status.details) >= 1
-                detail = status.details[0]
-                assert "exception_type" in detail
-                assert "exception_message" in detail
-                assert detail["exception_type"] == exception_name
-
-                # Try to re-raise from status details
-                with pytest.raises(exception_class) as exc_info:
-                    Status.raise_from_status_details(status.details)
-
-                # Verify we got the right exception type back
-                assert isinstance(exc_info.value, exception_class)
-                successful_tests += 1
-
-                print(f"✅ {exception_name}: Round-trip successful")
-
-            except Exception as e:
-                failed_tests.append((exception_name, f"Round-trip failed: {e}"))
-                continue
-
-        except Exception as e:
-            failed_tests.append((exception_name, f"Setup failed: {e}"))
-            continue
-
-    # Report results
-    print("\n🎯 Exception Round-trip Test Results:")
-    print(f"✅ Successful: {successful_tests}")
-    print(f"❌ Failed: {len(failed_tests)}")
-
-    if failed_tests:
-        print("\n❌ Failed exceptions:")
-        for exception_name, reason in failed_tests:
-            print(f"  - {exception_name}: {reason}")
-
-    # We expect most to pass, but some failures are acceptable due to complex constructors
-    # Require at least 85% success rate (17/20 = 85% is good, indicates robust support)
-    success_rate = successful_tests / (successful_tests + len(failed_tests))
-    assert success_rate >= 0.85, f"Success rate {success_rate:.1%} too low. Failed tests: {failed_tests}"

From 4c855e76a696143f7d7b40b40d336f3f7a76ffcf Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 17:44:43 -0700
Subject: [PATCH 09/16] backward compat

---
 eval_protocol/models.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 494d3129..ecb9f5dc 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -312,6 +312,12 @@ def rollout_internal_error(cls, error_message: str, extra_info: Optional[Dict[st
         """Create a status indicating the rollout failed with an internal error."""
         return cls.internal_error(error_message, cls._build_details_with_extra_info(extra_info))
 
+    # For backwards compatibility
+    @classmethod
+    def rollout_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
+        """Create a status indicating the rollout failed with an error."""
+        return cls.internal_error(error_message, cls._build_details_with_extra_info(extra_info))
+
     @classmethod
     def internal_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
         """Create a status indicating an internal error occurred."""

From d719a6c6aa251db3719134eab90542dd01548c01 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 17:46:15 -0700
Subject: [PATCH 10/16] updated server

---
 .../svg_agent/vercel_svg_server/api/init.py   | 103 +++++++++++-------
 .../vercel_svg_server/requirements.txt        |   2 +-
 2 files changed, 65 insertions(+), 40 deletions(-)

diff --git a/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py b/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py
index 502eb96a..ffd8b9ea 100644
--- a/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py
+++ b/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py
@@ -12,6 +12,7 @@
 import asyncio
 from flask import Flask, request, jsonify
 from openai import OpenAI
+import openai
 from dotenv import load_dotenv
 
 from eval_protocol import Status, InitRequest, FireworksTracingHttpHandler, RolloutIdFilter
@@ -49,56 +50,80 @@ def filter(self, record: logging.LogRecord) -> bool:
 app = Flask(__name__)
 
 
-async def execute_rollout_background(req, api_key):
+async def execute_rollout_background(req: InitRequest, api_key: str):
     """Execute the OpenAI completion in background and log results"""
     # Attach rollout_id filter to logger
     logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
     logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
 
-    try:
-        model = req.completion_params.get("model")
-        # Uncomment if you need to strip fireworks_ai/ prefix
-        # if model and isinstance(model, str) and model.startswith("fireworks_ai/"):
-        #     model = model[len("fireworks_ai/"):]
-
-        # Prepare completion arguments
-        completion_kwargs = {
-            "messages": req.messages,
-            # "messages": [{"role": "user", "content": "Hello, how are you?"}],
-            "model": model,
-            "temperature": req.completion_params.get("temperature"),
-            "max_tokens": req.completion_params.get("max_tokens"),
-        }
+    model = req.completion_params.get("model")
+    # Uncomment if you need to strip fireworks_ai/ prefix
+    # if model and isinstance(model, str) and model.startswith("fireworks_ai/"):
+    #     model = model[len("fireworks_ai/"):]
+
+    # Prepare completion arguments
+    completion_kwargs = {
+        "messages": req.messages,
+        # "messages": [{"role": "user", "content": "Hello, how are you?"}],
+        "model": model,
+        "temperature": req.completion_params.get("temperature"),
+        "max_tokens": req.completion_params.get("max_tokens"),
+    }
+
+    # Add tools if present
+    if req.tools:
+        completion_kwargs["tools"] = req.tools
+
+    logger.info(
+        f"DEBUG: {req.model_base_url}, COMPLETION_KWARGS: {completion_kwargs}, API_KEY: {api_key}, MODEL: {model}"
+    )
 
-        # Add tools if present
-        if req.tools:
-            completion_kwargs["tools"] = req.tools
+    # Create AsyncOpenAI client
+    # client = AsyncOpenAI(base_url=req.model_base_url, api_key=api_key)
+    client = OpenAI(base_url=req.model_base_url, api_key=api_key)
 
-        logger.info(
-            f"DEBUG: {req.model_base_url}, COMPLETION_KWARGS: {completion_kwargs}, API_KEY: {api_key}, MODEL: {model}"
-        )
+    logger.info(f"Sending completion request to model {model}")
 
-        # Create AsyncOpenAI client
-        # client = AsyncOpenAI(base_url=req.model_base_url, api_key=api_key)
-        client = OpenAI(base_url=req.model_base_url, api_key=api_key)
+    # Make the async model call with timeout
+    import time
 
-        logger.info(f"Sending completion request to model {model}")
+    logger.info(f"timing start: {time.time()}")
 
-        # Make the async model call with timeout
-        import time
-
-        logger.info(f"timing start: {time.time()}")
+    try:
         completion = client.chat.completions.create(**completion_kwargs)
-        logger.info(f"Completed response: {completion}")
-        logger.info(f"timing end: {time.time()}")
-        # Log successful completion - THIS IS WHAT RemoteRolloutProcessor POLLS FOR
-        logger.info(f"Rollout {req.metadata.rollout_id} completed", extra={"status": Status.rollout_finished()})
-
+    except (
+        openai.AuthenticationError,
+        openai.PermissionDeniedError,
+    ) as e:
+        # These errors should be logged and will be retried by RemoteRolloutProcessor
+        logger.error(
+            f"Rollout {req.metadata.rollout_id} failed: {e}",
+            extra={"status": Status.rollout_permission_denied_error(str(e))},
+        )
+        return
+    except openai.NotFoundError as e:
+        logger.error(
+            f"Rollout {req.metadata.rollout_id} failed: {e}", extra={"status": Status.rollout_not_found_error(str(e))}
+        )
+        return
+    except openai.RateLimitError as e:
+        logger.error(
+            f"Rollout {req.metadata.rollout_id} failed: {e}",
+            extra={"status": Status.rollout_resource_exhausted_error(str(e))},
+        )
+        return
     except Exception as e:
-        # Log error with structured status - THIS IS WHAT RemoteRolloutProcessor POLLS FOR
+        # Non-OpenAI errors (shouldn't normally happen but catch anyway)
         logger.error(
-            f"Rollout {req.metadata.rollout_id} failed: {e}", extra={"status": Status.rollout_error_from_exception(e)}
+            f"Rollout {req.metadata.rollout_id} failed with unexpected error: {e}",
+            extra={"status": Status.rollout_internal_error(str(e))},
         )
+        return
+
+    logger.info(f"Completed response: {completion}")
+    logger.info(f"timing end: {time.time()}")
+    # Log successful completion - THIS IS WHAT RemoteRolloutProcessor POLLS FOR
+    logger.info(f"Rollout {req.metadata.rollout_id} completed", extra={"status": Status.rollout_finished()})
 
 
 @app.route("/init", methods=["POST"])
@@ -114,7 +139,7 @@ async def init():
         # Validate required fields
         if not req.messages:
             error_msg = "messages is required"
-            logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
+            logger.error(error_msg, extra={"status": Status.rollout_internal_error(error_msg)})
             return jsonify({"error": error_msg}), 400
 
         # Get API key (prefer request api_key, fallback to environment)
@@ -126,7 +151,7 @@ async def init():
             api_key = os.environ.get("FIREWORKS_API_KEY")
         else:
             error_msg = "API key not provided in request or environment variable"
-            logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
+            logger.error(error_msg, extra={"status": Status.rollout_internal_error(error_msg)})
             return jsonify({"error": error_msg}), 401
 
         # 🔥 FIRE: Return immediately with acceptance (within 30s requirement)
@@ -137,7 +162,7 @@ async def init():
         }
 
         # Fire and forget: Execute rollout asynchronously
-        asyncio.create_task(execute_rollout_background(req, api_key))
+        asyncio.create_task(execute_rollout_background(req, api_key or ""))
 
         return jsonify(response_data), 200
 
diff --git a/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt b/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
index 2296770c..f4ce92fc 100644
--- a/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
+++ b/eval_protocol/quickstart/svg_agent/vercel_svg_server/requirements.txt
@@ -1,4 +1,4 @@
 openai>=1.0.0
 python-dotenv>=0.19.0
-eval_protocol>=0.2.70
+eval_protocol>=0.2.71
 Flask[async]==3.0.3

From aeafa1321fc75f1eaa2b044d5df64c4f173c5aa2 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 17:48:38 -0700
Subject: [PATCH 11/16] nits

---
 eval_protocol/models.py                  | 3 ---
 eval_protocol/pytest/exception_config.py | 1 -
 2 files changed, 4 deletions(-)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index ecb9f5dc..fdbd4b1a 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -21,9 +21,6 @@
 from eval_protocol.types import TerminationReason
 
 
-logger = logging.getLogger(__name__)
-
-
 class ErrorInfo(BaseModel):
     """
     AIP-193 ErrorInfo model for structured error details.
diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py
index 42c97ea4..5e1e1797 100644
--- a/eval_protocol/pytest/exception_config.py
+++ b/eval_protocol/pytest/exception_config.py
@@ -36,7 +36,6 @@
     litellm.exceptions.InternalServerError,
     litellm.exceptions.Timeout,
     litellm.exceptions.NotFoundError,
-    # litellm.exceptions.BadRequestError,  # remove this once we have a long term solution
     litellm.exceptions.ServiceUnavailableError,
     litellm.exceptions.APIError,
     # Eval Protocol exceptions

From d9666fd9deb12642b92342f42901522971461012 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 17:53:57 -0700
Subject: [PATCH 12/16] fix test

---
 eval_protocol/exceptions.py                   |  5 +++--
 .../pytest/remote_rollout_processor.py        |  4 ++--
 tests/test_exceptions.py                      | 21 +++++++++++++++++++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/exceptions.py b/eval_protocol/exceptions.py
index f294977d..3b92c865 100644
--- a/eval_protocol/exceptions.py
+++ b/eval_protocol/exceptions.py
@@ -160,12 +160,13 @@ class ScoreInvalidError(EvalProtocolError):
 }
 
 
-def exception_for_status_code(code: int) -> Optional[EvalProtocolError]:
+def exception_for_status_code(code: int, message: str = "") -> Optional[EvalProtocolError]:
     """
     Create an exception instance for a given status code.
 
     Args:
         code: Status code from Status.Code enum
+        message: Optional error message to include in the exception
 
     Returns:
         Exception instance or None if code is OK (0)
@@ -173,4 +174,4 @@ def exception_for_status_code(code: int) -> Optional[EvalProtocolError]:
     exception_class = STATUS_CODE_TO_EXCEPTION.get(code)
     if exception_class is None:
         return None
-    return exception_class()
+    return exception_class(message) if message else exception_class()
diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index 6e57d93c..b8af095d 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -161,8 +161,8 @@ def _get_status() -> Dict[str, Any]:
                         f"Found Fireworks log for rollout {row.execution_metadata.rollout_id} with status code {status_code}"
                     )
 
-                    # Create and raise exception if appropriate
-                    exception = exception_for_status_code(status_code)
+                    # Create and raise exception if appropriate, preserving original message
+                    exception = exception_for_status_code(status_code, status_message)
                     if exception is not None:
                         raise exception
 
diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py
index b4b039e2..a1fcdeb6 100644
--- a/tests/test_exceptions.py
+++ b/tests/test_exceptions.py
@@ -348,3 +348,24 @@ def test_integration_with_retry_logic():
         assert exception_class in DEFAULT_RETRYABLE_EXCEPTIONS, (
             f"{exception_class.__name__} should be in DEFAULT_RETRYABLE_EXCEPTIONS for retry support"
         )
+
+
+def test_exception_message_preservation():
+    """Test that error messages are properly preserved in exceptions."""
+    test_cases = [
+        (13, "test error", InternalError),
+        (5, "Model xyz not found", NotFoundError),
+        (7, "Invalid API key", PermissionDeniedError),
+    ]
+
+    for status_code, message, expected_exception_class in test_cases:
+        # Test with message
+        exception = exception_for_status_code(status_code, message)
+        assert exception is not None
+        assert isinstance(exception, expected_exception_class)
+        assert str(exception) == message, f"Exception should preserve message '{message}'"
+
+        # Test without message (should still work)
+        exception_no_msg = exception_for_status_code(status_code)
+        assert exception_no_msg is not None
+        assert isinstance(exception_no_msg, expected_exception_class)

From d569a64ab95e76a56e9afbe01cb8d1db0bc17079 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 29 Oct 2025 18:02:54 -0700
Subject: [PATCH 13/16] backward compat and fix test again

---
 eval_protocol/models.py                             |  9 +++++++--
 eval_protocol/pytest/evaluation_test_utils.py       |  2 +-
 .../test_remote_fireworks_propagate_status.py       | 13 +------------
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index fdbd4b1a..7180ed72 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -309,6 +309,11 @@ def rollout_internal_error(cls, error_message: str, extra_info: Optional[Dict[st
         """Create a status indicating the rollout failed with an internal error."""
         return cls.internal_error(error_message, cls._build_details_with_extra_info(extra_info))
 
+    @classmethod
+    def internal_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an internal error occurred."""
+        return cls(code=cls.Code.INTERNAL, message=error_message, details=details or [])
+
     # For backwards compatibility
     @classmethod
     def rollout_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]] = None) -> "Status":
@@ -316,8 +321,8 @@ def rollout_error(cls, error_message: str, extra_info: Optional[Dict[str, Any]]
         return cls.internal_error(error_message, cls._build_details_with_extra_info(extra_info))
 
     @classmethod
-    def internal_error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
-        """Create a status indicating an internal error occurred."""
+    def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status":
+        """Create a status indicating an error occurred."""
         return cls(code=cls.Code.INTERNAL, message=error_message, details=details or [])
 
     # UNAVAILABLE = 14
diff --git a/eval_protocol/pytest/evaluation_test_utils.py b/eval_protocol/pytest/evaluation_test_utils.py
index c582d4be..26b0d799 100644
--- a/eval_protocol/pytest/evaluation_test_utils.py
+++ b/eval_protocol/pytest/evaluation_test_utils.py
@@ -398,7 +398,7 @@ async def execute_row_with_backoff(task: asyncio.Task[EvaluationRow], row: Evalu
                 else:
                     # Non-retryable exception - fail immediately
                     logging.error(f"❌ Rollout failed (non-retryable error encountered): {repr(e)}")
-                    row.rollout_status = Status.rollout_error(repr(e))
+                    row.rollout_status = Status.rollout_error(str(e))
                     return row
 
         async def execute_row_with_backoff_and_log(
diff --git a/tests/remote_server/test_remote_fireworks_propagate_status.py b/tests/remote_server/test_remote_fireworks_propagate_status.py
index e415ed61..7c05172f 100644
--- a/tests/remote_server/test_remote_fireworks_propagate_status.py
+++ b/tests/remote_server/test_remote_fireworks_propagate_status.py
@@ -1,15 +1,4 @@
-# MANUAL SERVER STARTUP REQUIRED:
-#
-# For Python server testing, start:
-# python -m tests.remote_server.remote_server (runs on http://127.0.0.1:3000)
-#
-# For TypeScript server testing, start:
-# cd tests/remote_server/typescript-server
-# npm install
-# npm start
-#
-# The TypeScript server should be running on http://127.0.0.1:3000
-# You only need to start one of the servers!
+# AUTO SERVER STARTUP: Server is automatically started and stopped by the test
 
 import subprocess
 import socket

From a71074ec111c9321e5cb2e8366dbb56504f2fc3a Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Thu, 30 Oct 2025 03:43:20 -0700
Subject: [PATCH 14/16] update remote rollout processor

---
 eval_protocol/pytest/remote_rollout_processor.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index b8af095d..f0bbe360 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -94,7 +94,7 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
             def _post_init() -> None:
                 url = f"{remote_base_url}/init"
                 try:
-                    r = requests.post(url, json=init_payload.model_dump(), timeout=30)
+                    r = requests.post(url, json=init_payload.model_dump(), timeout=300)
                     r.raise_for_status()
                 except requests.exceptions.Timeout:
                     raise TimeoutError(
@@ -133,9 +133,9 @@ def _get_status() -> Dict[str, Any]:
                     # For all other exceptions, raise them
                     raise
 
-                # Search Fireworks tracing logs for completion
-                completed_logs = self._tracing_adapter.search_logs(
-                    tags=[f"rollout_id:{row.execution_metadata.rollout_id}"]
+                # Search Fireworks tracing logs for completion (run in thread to avoid blocking event loop)
+                completed_logs = await asyncio.to_thread(
+                    self._tracing_adapter.search_logs, tags=[f"rollout_id:{row.execution_metadata.rollout_id}"]
                 )
                 # Filter for logs that actually have status information
                 status_logs = []

From 72e3506e544eff82f22237b374c22c2e395f2751 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sat, 1 Nov 2025 01:59:25 -0700
Subject: [PATCH 15/16] fix: increase timeout for remote rollout processor

---
 eval_protocol/pytest/remote_rollout_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index f0bbe360..dd179e34 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -98,7 +98,7 @@ def _post_init() -> None:
                     r.raise_for_status()
                 except requests.exceptions.Timeout:
                     raise TimeoutError(
-                        f"The /init endpoint tried {url} with {init_payload.model_dump()} but timed out after 30 seconds."
+                        f"The /init endpoint tried {url} with {init_payload.model_dump()} but timed out after 300 seconds."
                     )
 
             await asyncio.to_thread(_post_init)

From d63e10985c9379057753a826985c8b1784343c32 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sat, 1 Nov 2025 02:00:44 -0700
Subject: [PATCH 16/16] add back bad request

---
 eval_protocol/pytest/exception_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py
index 5e1e1797..e4bb1b7c 100644
--- a/eval_protocol/pytest/exception_config.py
+++ b/eval_protocol/pytest/exception_config.py
@@ -38,6 +38,7 @@
     litellm.exceptions.NotFoundError,
     litellm.exceptions.ServiceUnavailableError,
     litellm.exceptions.APIError,
+    litellm.exceptions.BadRequestError,
     # Eval Protocol exceptions
     eval_protocol.exceptions.UnknownError,
     eval_protocol.exceptions.DeadlineExceededError,