Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ class Code(int, Enum):
# Custom codes for EP (using higher numbers to avoid conflicts)
FINISHED = 100
RUNNING = 101
SCORE_INVALID = 102

@classmethod
def rollout_running(cls) -> "Status":
Expand Down Expand Up @@ -167,6 +168,13 @@ def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = Non
"""Create a status indicating the rollout failed with an error."""
return cls(code=cls.Code.INTERNAL, message=error_message, details=details or [])

@classmethod
def score_invalid(
cls, message: str = "Score is invalid", details: Optional[List[Dict[str, Any]]] = None
) -> "Status":
"""Create a status indicating the score is invalid.

Args:
message: Text placed in the status message; defaults to "Score is invalid".
details: Optional list of detail dicts. A falsy value (``None`` or an
empty list) is passed on as a fresh empty list.

Returns:
An instance built via ``cls(...)`` with ``code=cls.Code.SCORE_INVALID``.
"""
return cls(code=cls.Code.SCORE_INVALID, message=message, details=details or [])

def is_running(self) -> bool:
"""Check if the status indicates the rollout is running.

Returns:
True when ``self.code`` equals ``self.Code.RUNNING``, otherwise False.
"""
return self.code == self.Code.RUNNING
Expand All @@ -183,6 +191,10 @@ def is_stopped(self) -> bool:
"""Check if the status indicates the rollout was stopped."""
return self.code == self.Code.CANCELLED

def is_score_invalid(self) -> bool:
"""Check if the status indicates the score is invalid.

Returns:
True when ``self.code`` equals ``self.Code.SCORE_INVALID``, otherwise False.
"""
return self.code == self.Code.SCORE_INVALID

def get_termination_reason(self) -> Optional[TerminationReason]:
"""Extract termination reason from details if present."""
for detail in self.details:
Expand Down
5 changes: 4 additions & 1 deletion eval_protocol/pytest/evaluation_test_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
import time
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold
from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold, Status
from eval_protocol.pytest.handle_persist_flow import handle_persist_flow
from eval_protocol.pytest.types import EvaluationTestMode
from eval_protocol.pytest.utils import AggregationMethod, aggregate, extract_effort_tag, sanitize_filename
Expand Down Expand Up @@ -80,6 +80,9 @@ def postprocess(
result.evaluation_result.agg_score = agg_score
if result.evaluation_result.standard_error is None:
result.evaluation_result.standard_error = standard_error
if result.evaluation_result.is_score_valid is False:
if result.eval_metadata is not None:
result.eval_metadata.status = Status.score_invalid()
result.execution_metadata.experiment_duration_seconds = experiment_duration_seconds
active_logger.log(result)

Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion vite-app/dist/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>EP | Log Viewer</title>
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
<script type="module" crossorigin src="/assets/index-BOtcXdzP.js"></script>
<script type="module" crossorigin src="/assets/index-C8woq7EO.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-CSKGq1w7.css">
</head>
<body>
Expand Down
6 changes: 6 additions & 0 deletions vite-app/src/components/StatusIndicator.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ const StatusIndicator: React.FC<StatusIndicatorProps> = ({
textColor: "text-yellow-700",
text: "stopped",
};
case "SCORE_INVALID":
return {
dotColor: "bg-red-500",
textColor: "text-red-700",
text: "score invalid",
};
default:
return {
dotColor: "bg-gray-500",
Expand Down
2 changes: 2 additions & 0 deletions vite-app/src/types/eval-protocol.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ export const StatusCodeSchema = z
"UNAUTHENTICATED",
"FINISHED",
"RUNNING",
"SCORE_INVALID",
])
.describe("Common gRPC status codes as defined in google.rpc.Code");

Expand All @@ -181,6 +182,7 @@ export const STATUS_CODE_MAP: Record<number, StatusCode> = {
16: "UNAUTHENTICATED",
100: "FINISHED",
101: "RUNNING",
102: "SCORE_INVALID",
} as const;

// Helper function to get status code name from integer
Expand Down
Loading