Add SCORE_INVALID status to Status model and update related components

Dylan Huang · Dylan Huang · commit 2a17b4788b5a · 2025-09-22T14:32:10.000-07:00
- Introduced SCORE_INVALID status code in the Status model.
- Added Status.score_invalid() classmethod to create a status indicating an invalid score, and Status.is_score_invalid() to check for it.
- Updated evaluation postprocessing to set eval_metadata.status to SCORE_INVALID when evaluation_result.is_score_valid is False (skipped when eval_metadata is None).
- Enhanced StatusIndicator component to display SCORE_INVALID status.
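- Updated TypeScript types to include SCORE_INVALID in status codes.
- Changed the llm_judge_langfuse quickstart settings: limit 711 -> 10, sample_size 50 -> 10, max_concurrent_evaluations 2 -> 64.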
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
@@ -114,6 +114,7 @@ class Code(int, Enum):
         # Custom codes for EP (using higher numbers to avoid conflicts)
         FINISHED = 100
         RUNNING = 101
+        SCORE_INVALID = 102
 
     @classmethod
     def rollout_running(cls) -> "Status":
@@ -167,6 +168,13 @@ def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = Non
         """Create a status indicating the rollout failed with an error."""
         return cls(code=cls.Code.INTERNAL, message=error_message, details=details or [])
 
+    @classmethod
+    def score_invalid(
+        cls, message: str = "Score is invalid", details: Optional[List[Dict[str, Any]]] = None
+    ) -> "Status":
+        """Create a status indicating the score is invalid."""
+        return cls(code=cls.Code.SCORE_INVALID, message=message, details=details or [])
+
     def is_running(self) -> bool:
         """Check if the status indicates the rollout is running."""
         return self.code == self.Code.RUNNING
@@ -183,6 +191,10 @@ def is_stopped(self) -> bool:
         """Check if the status indicates the rollout was stopped."""
         return self.code == self.Code.CANCELLED
 
+    def is_score_invalid(self) -> bool:
+        """Check if the status indicates the score is invalid."""
+        return self.code == self.Code.SCORE_INVALID
+
     def get_termination_reason(self) -> Optional[TerminationReason]:
         """Extract termination reason from details if present."""
         for detail in self.details:
diff --git a/eval_protocol/pytest/evaluation_test_postprocess.py b/eval_protocol/pytest/evaluation_test_postprocess.py
@@ -7,7 +7,7 @@
 import sys
 import time
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
-from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold
+from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold, Status
 from eval_protocol.pytest.handle_persist_flow import handle_persist_flow
 from eval_protocol.pytest.types import EvaluationTestMode
 from eval_protocol.pytest.utils import AggregationMethod, aggregate, extract_effort_tag, sanitize_filename
@@ -80,6 +80,9 @@ def postprocess(
                     result.evaluation_result.agg_score = agg_score
                 if result.evaluation_result.standard_error is None:
                     result.evaluation_result.standard_error = standard_error
+                if result.evaluation_result.is_score_valid is False:
+                    if result.eval_metadata is not None:
+                        result.eval_metadata.status = Status.score_invalid()
             result.execution_metadata.experiment_duration_seconds = experiment_duration_seconds
             active_logger.log(result)
 
diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -22,8 +22,8 @@
 adapter = create_langfuse_adapter()
 input_rows = adapter.get_evaluation_rows(
     to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
-    limit=711,
-    sample_size=50,
+    limit=10,
+    sample_size=10,
     sleep_between_gets=3.0,
     max_retries=5,
 )
@@ -50,7 +50,7 @@
     input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
-    max_concurrent_evaluations=2,
+    max_concurrent_evaluations=64,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)
diff --git a/vite-app/src/components/StatusIndicator.tsx b/vite-app/src/components/StatusIndicator.tsx
@@ -58,6 +58,12 @@ const StatusIndicator: React.FC<StatusIndicatorProps> = ({
           textColor: "text-yellow-700",
           text: "stopped",
         };
+      case "SCORE_INVALID":
+        return {
+          dotColor: "bg-red-500",
+          textColor: "text-red-700",
+          text: "score invalid",
+        };
       default:
         return {
           dotColor: "bg-gray-500",
diff --git a/vite-app/src/types/eval-protocol.ts b/vite-app/src/types/eval-protocol.ts
@@ -157,6 +157,7 @@ export const StatusCodeSchema = z
     "UNAUTHENTICATED",
     "FINISHED",
     "RUNNING",
+    "SCORE_INVALID",
   ])
   .describe("Common gRPC status codes as defined in google.rpc.Code");
 
@@ -181,6 +182,7 @@ export const STATUS_CODE_MAP: Record<number, StatusCode> = {
   16: "UNAUTHENTICATED",
   100: "FINISHED",
   101: "RUNNING",
+  102: "SCORE_INVALID",
 } as const;
 
 // Helper function to get status code name from integer