Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions eval_protocol/pytest/exception_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
return _default_retryable_exceptions

# Lazy imports (these are expensive)
import aiohttp
import httpx
import litellm
import requests
Expand All @@ -32,6 +33,9 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
ConnectionError, # type: ignore[assignment]
TimeoutError, # type: ignore[assignment]
OSError, # type: ignore[assignment] # Covers network-related OS errors
# aiohttp library exceptions
aiohttp.ClientConnectionError,
aiohttp.ServerDisconnectedError,
# Requests library exceptions
requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
Expand Down
16 changes: 16 additions & 0 deletions eval_protocol/pytest/remote_rollout_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,27 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
try:
session = self._get_or_create_session()
async with session.post(init_url, json=init_payload.model_dump(), timeout=timeout_init) as resp:
if resp.status >= 500:
body = await resp.text()
raise ConnectionError(f"Remote /init returned server error (HTTP {resp.status}): {body}")
if resp.status >= 400:
body = await resp.text()
raise RuntimeError(f"Remote /init failed (HTTP {resp.status}): {body}")
resp.raise_for_status()
await resp.read() # Drain the response body and release the connection back to the pool
except asyncio.CancelledError:
# Distinguish intentional cancellation (Ctrl+C, test teardown) from
# aiohttp-internal cancellation caused by a poisoned DNS resolver
# after a server disconnect. Task.cancelling() returns the number
# of pending cancel() calls; > 0 means someone explicitly cancelled
# this task.
current = asyncio.current_task()
if current is not None and current.cancelling() > 0: # pyright: ignore[reportAttributeAccessIssue]
Comment thread
xzrderek marked this conversation as resolved.
Outdated
raise # Intentional cancellation — propagate immediately
Comment thread
xzrderek marked this conversation as resolved.
Outdated
# Network-level failure; discard the session so retries get a
# fresh connection pool.
self._session = None
raise ConnectionError("Remote server connection lost (request cancelled)")
except asyncio.TimeoutError:
raise TimeoutError(
f"The /init endpoint tried {init_url} with {init_payload.model_dump()} but timed out after 300 seconds."
Expand Down
Loading