Skip to content

Commit 2a17b47

Browse files
author
Dylan Huang
committed
Add SCORE_INVALID status to Status model and update related components
- Introduced SCORE_INVALID status code in the Status model.
- Added score_invalid method to create a status indicating an invalid score.
- Updated evaluation postprocessing to set status when score is invalid.
- Enhanced StatusIndicator component to display SCORE_INVALID status.
- Updated TypeScript types to include SCORE_INVALID in status codes.
1 parent cbb505c commit 2a17b47

File tree

5 files changed

+27
-4
lines changed

5 files changed

+27
-4
lines changed

eval_protocol/models.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,7 @@ class Code(int, Enum):
114114
# Custom codes for EP (using higher numbers to avoid conflicts)
115115
FINISHED = 100
116116
RUNNING = 101
117+
SCORE_INVALID = 102
117118

118119
@classmethod
119120
def rollout_running(cls) -> "Status":
@@ -167,6 +168,13 @@ def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = Non
167168
"""Create a status indicating the rollout failed with an error."""
168169
return cls(code=cls.Code.INTERNAL, message=error_message, details=details or [])
169170

171+
@classmethod
172+
def score_invalid(
173+
cls, message: str = "Score is invalid", details: Optional[List[Dict[str, Any]]] = None
174+
) -> "Status":
175+
"""Create a status indicating the score is invalid."""
176+
return cls(code=cls.Code.SCORE_INVALID, message=message, details=details or [])
177+
170178
def is_running(self) -> bool:
171179
"""Check if the status indicates the rollout is running."""
172180
return self.code == self.Code.RUNNING
@@ -183,6 +191,10 @@ def is_stopped(self) -> bool:
183191
"""Check if the status indicates the rollout was stopped."""
184192
return self.code == self.Code.CANCELLED
185193

194+
def is_score_invalid(self) -> bool:
195+
"""Check if the status indicates the score is invalid."""
196+
return self.code == self.Code.SCORE_INVALID
197+
186198
def get_termination_reason(self) -> Optional[TerminationReason]:
187199
"""Extract termination reason from details if present."""
188200
for detail in self.details:

eval_protocol/pytest/evaluation_test_postprocess.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
import sys
88
import time
99
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
10-
from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold
10+
from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold, Status
1111
from eval_protocol.pytest.handle_persist_flow import handle_persist_flow
1212
from eval_protocol.pytest.types import EvaluationTestMode
1313
from eval_protocol.pytest.utils import AggregationMethod, aggregate, extract_effort_tag, sanitize_filename
@@ -80,6 +80,9 @@ def postprocess(
8080
result.evaluation_result.agg_score = agg_score
8181
if result.evaluation_result.standard_error is None:
8282
result.evaluation_result.standard_error = standard_error
83+
if result.evaluation_result.is_score_valid is False:
84+
if result.eval_metadata is not None:
85+
result.eval_metadata.status = Status.score_invalid()
8386
result.execution_metadata.experiment_duration_seconds = experiment_duration_seconds
8487
active_logger.log(result)
8588

eval_protocol/quickstart/llm_judge_langfuse.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@
2222
adapter = create_langfuse_adapter()
2323
input_rows = adapter.get_evaluation_rows(
2424
to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
25-
limit=711,
26-
sample_size=50,
25+
limit=10,
26+
sample_size=10,
2727
sleep_between_gets=3.0,
2828
max_retries=5,
2929
)
@@ -50,7 +50,7 @@
5050
input_rows=[input_rows],
5151
rollout_processor=SingleTurnRolloutProcessor(),
5252
preprocess_fn=multi_turn_assistant_to_ground_truth,
53-
max_concurrent_evaluations=2,
53+
max_concurrent_evaluations=64,
5454
)
5555
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
5656
return await aha_judge(row)

vite-app/src/components/StatusIndicator.tsx

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,12 @@ const StatusIndicator: React.FC<StatusIndicatorProps> = ({
5858
textColor: "text-yellow-700",
5959
text: "stopped",
6060
};
61+
case "SCORE_INVALID":
62+
return {
63+
dotColor: "bg-red-500",
64+
textColor: "text-red-700",
65+
text: "score invalid",
66+
};
6167
default:
6268
return {
6369
dotColor: "bg-gray-500",

vite-app/src/types/eval-protocol.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ export const StatusCodeSchema = z
157157
"UNAUTHENTICATED",
158158
"FINISHED",
159159
"RUNNING",
160+
"SCORE_INVALID",
160161
])
161162
.describe("Common gRPC status codes as defined in google.rpc.Code");
162163

@@ -181,6 +182,7 @@ export const STATUS_CODE_MAP: Record<number, StatusCode> = {
181182
16: "UNAUTHENTICATED",
182183
100: "FINISHED",
183184
101: "RUNNING",
185+
102: "SCORE_INVALID",
184186
} as const;
185187

186188
// Helper function to get status code name from integer

0 commit comments

Comments
 (0)