Add response quality validation that can trigger rollout retries

morgendave · morgendave · commit cb3206c5719c · 2025-11-18T15:54:24.000-08:00
diff --git a/eval_protocol/exceptions.py b/eval_protocol/exceptions.py
@@ -134,6 +134,12 @@ class ScoreInvalidError(EvalProtocolError):
     status_code = 102
 
 
+class ResponseQualityError(EvalProtocolError):
+    """Response quality check failed (Status.Code.RESPONSE_QUALITY_ERROR = 103)"""
+
+    status_code = 103
+
+
 # Convenience mapping from status codes to exception classes
 # Only actual error conditions should raise exceptions
 STATUS_CODE_TO_EXCEPTION = {
@@ -157,6 +163,7 @@ class ScoreInvalidError(EvalProtocolError):
     100: None,  # FINISHED - success, no exception
     101: None,  # RUNNING - in progress, no exception
     102: None,  # SCORE_INVALID - success, no exception
+    103: ResponseQualityError,  # RESPONSE_QUALITY_ERROR - quality check failed
 }
 
 
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
@@ -117,6 +117,7 @@ class Code(int, Enum):
         FINISHED = 100
         RUNNING = 101
         SCORE_INVALID = 102
+        RESPONSE_QUALITY_ERROR = 103
 
     @classmethod
     def rollout_running(cls) -> "Status":
@@ -367,6 +368,13 @@ def score_invalid(
         """Create a status indicating the score is invalid."""
         return cls(code=cls.Code.SCORE_INVALID, message=message, details=details or [])
 
+    @classmethod
+    def response_quality_error(
+        cls, message: str = "Response quality check failed", details: Optional[List[Dict[str, Any]]] = None
+    ) -> "Status":
+        """Create a status indicating the response quality check failed."""
+        return cls(code=cls.Code.RESPONSE_QUALITY_ERROR, message=message, details=details or [])
+
     def is_running(self) -> bool:
         """Check if the status indicates the rollout is running."""
         return self.code == self.Code.RUNNING
diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
@@ -8,6 +8,7 @@
 from .evaluation_test import evaluation_test
 from .exception_config import ExceptionHandlerConfig, BackoffConfig, get_default_exception_handler_config
 from .rollout_processor import RolloutProcessor
+from .rollout_result_post_processor import RolloutResultPostProcessor, NoOpRolloutResultPostProcessor
 from .types import RolloutProcessorConfig
 
 # Conditional import for optional dependencies
@@ -42,6 +43,8 @@
     "ExceptionHandlerConfig",
     "BackoffConfig",
     "get_default_exception_handler_config",
+    "RolloutResultPostProcessor",
+    "NoOpRolloutResultPostProcessor",
 ]
 
 # Only add to __all__ if available
diff --git a/eval_protocol/pytest/evaluation_test_utils.py b/eval_protocol/pytest/evaluation_test_utils.py
@@ -28,6 +28,7 @@
     ServerMode,
 )
 from eval_protocol.pytest.exception_config import get_default_exception_handler_config
+from eval_protocol.exceptions import ResponseQualityError
 
 import logging
 import json
@@ -363,7 +364,21 @@ async def execute_row_with_backoff_retry(row: EvaluationRow) -> EvaluationRow:
             """Execute rollout for a single row with backoff retry."""
             retry_config = replace(config, kwargs={**(config.kwargs or {}), "start_server": False})
             retry_tasks = rollout_processor([row], retry_config)
-            return await retry_tasks[0]
+            result = await retry_tasks[0]
+            
+            # Apply post-processing quality checks if configured
+            # This must be inside the retry function so ResponseQualityError can trigger retries
+            if config.post_processor is not None:
+                try:
+                    config.post_processor.process(result)
+                except ResponseQualityError as quality_error:
+                    # Re-raise ResponseQualityError to trigger retry logic
+                    raise quality_error
+                except Exception as post_process_error:
+                    # Wrap unexpected post-processor errors in ResponseQualityError
+                    raise ResponseQualityError(f"Post-processor failed: {post_process_error}") from post_process_error
+            
+            return result
 
         async def execute_row_with_backoff(task: asyncio.Task[EvaluationRow], row: EvaluationRow) -> EvaluationRow:
             """Execute a single row task with backoff retry."""
@@ -372,6 +387,15 @@ async def execute_row_with_backoff(task: asyncio.Task[EvaluationRow], row: Evalu
                 # Try original task first
                 result = await task  # pyright: ignore[reportUnknownVariableType]
 
+                # Apply post-processing quality checks if configured
+                if config.post_processor is not None:
+                    try:
+                        config.post_processor.process(result)
+                    except ResponseQualityError as quality_error:
+                        raise quality_error
+                    except Exception as post_process_error:
+                        raise ResponseQualityError(f"Post-processor failed: {post_process_error}") from post_process_error
+
                 _set_rollout_status_to_finished(result)
 
                 return result  # pyright: ignore[reportUnknownVariableType]
@@ -384,9 +408,9 @@ async def execute_row_with_backoff(task: asyncio.Task[EvaluationRow], row: Evalu
 
                 if is_retryable and not should_giveup:
                     # Use shared backoff function for retryable exceptions
+                    # Note: post-processing is handled inside execute_row_with_backoff_retry
                     try:
                         result = await execute_row_with_backoff_retry(row)
-
                         _set_rollout_status_to_finished(result)
 
                         return result
diff --git a/eval_protocol/pytest/exception_config.py b/eval_protocol/pytest/exception_config.py
@@ -4,7 +4,7 @@
 
 import os
 from dataclasses import dataclass, field
-from typing import Callable, Set, Type, Union
+from typing import Callable, Dict, Set, Type, Union
 
 import backoff
 
@@ -47,6 +47,7 @@
     eval_protocol.exceptions.UnavailableError,
     eval_protocol.exceptions.UnauthenticatedError,
     eval_protocol.exceptions.ResourceExhaustedError,
+    eval_protocol.exceptions.ResponseQualityError,
 }
 
 
@@ -78,39 +79,112 @@ class BackoffConfig:
     # Optional custom giveup function - if provided, overrides the default exception handling logic
     giveup_func: Callable[[Exception], bool] = lambda e: False
 
-    def get_backoff_decorator(self, exceptions: Set[Type[Exception]]):
-        """Get the appropriate backoff decorator based on configuration."""
+    def get_backoff_decorator(self, exceptions: Set[Type[Exception]], exception_backoff_overrides: Dict[Type[Exception], "BackoffConfig"] | None = None):
+        """Get the appropriate backoff decorator based on configuration.
+        
+        Args:
+            exceptions: Set of exception types to retry
+            exception_backoff_overrides: Optional mapping of exception types to custom backoff configs.
+                If an exception type has an override, that config will be used instead of this one.
+        """
         if not exceptions:
             # If no exceptions specified, return a no-op decorator
             def no_op_decorator(func):
                 return func
 
             return no_op_decorator
 
-        if self.strategy == "expo":
+        # If no overrides, use simple decorator for all exceptions
+        if not exception_backoff_overrides:
+            return self._create_single_decorator(exceptions, self)
+        
+        # Group exceptions by their backoff config to avoid double backoff
+        # Each exception type gets exactly one decorator based on its config
+        # Use a tuple of config attributes as the key since BackoffConfig is not hashable
+        config_to_exceptions: Dict[tuple, tuple[Set[Type[Exception]], "BackoffConfig"]] = {}
+        
+        for exc_type in exceptions:
+            if exc_type in exception_backoff_overrides:
+                override_config = exception_backoff_overrides[exc_type]
+            else:
+                override_config = self
+            
+            # Create a hashable key from config attributes
+            # Note: jitter and giveup_func are callables; they are keyed by id(), so only configs sharing the same callable objects are grouped together
+            config_key = (
+                override_config.strategy,
+                override_config.base_delay,
+                override_config.max_delay,
+                override_config.max_tries,
+                override_config.factor,
+                id(override_config.jitter) if override_config.jitter is not None else None,
+                id(override_config.giveup_func) if override_config.giveup_func is not None else None,
+                override_config.raise_on_giveup,
+            )
+            
+            if config_key not in config_to_exceptions:
+                config_to_exceptions[config_key] = (set(), override_config)
+            exc_set, _ = config_to_exceptions[config_key]
+            exc_set.add(exc_type)
+        
+        # If all exceptions use the same config, use a single decorator
+        if len(config_to_exceptions) == 1:
+            exc_set, config = next(iter(config_to_exceptions.values()))
+            return self._create_single_decorator(exc_set, config)
+        
+        # Create separate decorators for each config group
+        # Each exception type gets exactly one decorator, preventing double backoff
+        decorators_by_config: list[tuple[Set[Type[Exception]], Callable]] = []
+        
+        for exc_set, config in config_to_exceptions.values():
+            decorator = self._create_single_decorator(exc_set, config)
+            if decorator:
+                decorators_by_config.append((exc_set, decorator))
+        
+        # Create a combined decorator that applies all decorators
+        # Each decorator only catches exceptions in its exception set, so no double backoff
+        def combined_decorator(func):
+            decorated_func = func
+            
+            # Apply each decorator in order (inner to outer)
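+            # Each decorator retries only the exception types in its own set (and, as with any except clause, their subclasses)
+            # NOTE(review): the wrappers nest, so a retry by an outer decorator re-enters the inner one with a fresh try count; confirm total attempts stay acceptable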
+            for exc_set, decorator in decorators_by_config:
+                decorated_func = decorator(decorated_func)
+            
+            return decorated_func
+        
+        return combined_decorator
+    
+    def _create_single_decorator(self, exc_set: Set[Type[Exception]], config: "BackoffConfig"):
+        """Create a single backoff decorator for a set of exceptions."""
+        if not exc_set:
+            return None
+        
+        if config.strategy == "expo":
             return backoff.on_exception(
                 backoff.expo,
-                tuple(exceptions),
-                max_tries=self.max_tries,
-                base=self.base_delay,
-                max_value=self.max_delay,
-                factor=self.factor,
-                jitter=self.jitter,
-                giveup=self.giveup_func,
-                raise_on_giveup=self.raise_on_giveup,
+                tuple(exc_set),
+                max_tries=config.max_tries,
+                base=config.base_delay,
+                max_value=config.max_delay,
+                factor=config.factor,
+                jitter=config.jitter,
+                giveup=config.giveup_func,
+                raise_on_giveup=config.raise_on_giveup,
             )
-        elif self.strategy == "constant":
+        elif config.strategy == "constant":
             return backoff.on_exception(
                 backoff.constant,
-                tuple(exceptions),
-                max_tries=self.max_tries,
-                interval=self.base_delay,
-                jitter=self.jitter,
-                giveup=self.giveup_func,
-                raise_on_giveup=self.raise_on_giveup,
+                tuple(exc_set),
+                max_tries=config.max_tries,
+                interval=config.base_delay,
+                jitter=config.jitter,
+                giveup=config.giveup_func,
+                raise_on_giveup=config.raise_on_giveup,
             )
         else:
-            raise ValueError(f"Unknown backoff strategy: {self.strategy}")
+            raise ValueError(f"Unknown backoff strategy: {config.strategy}")
 
 
 @dataclass
@@ -123,6 +197,10 @@ class ExceptionHandlerConfig:
     # Backoff configuration
     backoff_config: BackoffConfig = field(default_factory=BackoffConfig)
 
+    # Per-exception backoff overrides - allows custom backoff config for specific exception types
+    # For example, ResponseQualityError can use no backoff (base_delay=0, max_delay=0)
+    exception_backoff_overrides: Dict[Type[Exception], BackoffConfig] = field(default_factory=dict)
+
     def __post_init__(self):
         """Automatically apply environment variable overrides after initialization."""
         # Override backoff settings from environment variables
@@ -133,10 +211,23 @@ def __post_init__(self):
         if "EP_FAIL_ON_MAX_RETRY" in os.environ:
             fail_on_max_retry = os.environ["EP_FAIL_ON_MAX_RETRY"].lower()
             self.backoff_config.raise_on_giveup = fail_on_max_retry != "false"
+        
+        # Set default no-backoff config for ResponseQualityError if not already set
+        if eval_protocol.exceptions.ResponseQualityError not in self.exception_backoff_overrides:
+            # Default: no backoff for ResponseQualityError (immediate retry)
+            self.exception_backoff_overrides[eval_protocol.exceptions.ResponseQualityError] = BackoffConfig(
+                strategy="constant",
+                base_delay=0.0,
+                max_delay=0.0,
+                max_tries=self.backoff_config.max_tries,
+            )
 
     def get_backoff_decorator(self):
         """Get the backoff decorator configured for this exception handler."""
-        return self.backoff_config.get_backoff_decorator(self.retryable_exceptions)
+        return self.backoff_config.get_backoff_decorator(
+            self.retryable_exceptions,
+            self.exception_backoff_overrides if self.exception_backoff_overrides else None
+        )
 
 
 def get_default_exception_handler_config() -> ExceptionHandlerConfig:
diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py
@@ -11,6 +11,7 @@
 
 from ..models import CompletionParams, EvaluationRow, Message
 from .exception_config import ExceptionHandlerConfig
+from .rollout_result_post_processor import RolloutResultPostProcessor
 
 ModelParam = str  # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
 DatasetPathParam = str
@@ -75,3 +76,4 @@ class RolloutProcessorConfig:
         default_factory=dict
     )  # any additional kwargs to pass to the rollout processor
     exception_handler_config: ExceptionHandlerConfig | None = None  # configuration for exception handling with backoff
+    post_processor: RolloutResultPostProcessor | None = None  # optional post-processor for quality checks
diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py
diff --git a/tests/test_retry_mechanism.py b/tests/test_retry_mechanism.py