-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathtypes.py
More file actions
79 lines (66 loc) · 3.08 KB
/
types.py
File metadata and controls
79 lines (66 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Parameter types
"""
import asyncio
from collections.abc import Awaitable
from dataclasses import dataclass, field
from typing import Any, Callable, Literal

from eval_protocol.dataset_logger import default_logger
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger

from ..models import CompletionParams, EvaluationRow, Message
from .exception_config import ExceptionHandlerConfig
from .rollout_result_post_processor import RolloutResultPostProcessor
ModelParam = str # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
DatasetPathParam = str
InputMessagesParam = list[Message]
EvaluationInputParam = dict[str, Any] # pyright: ignore[reportExplicitAny]
RolloutProcessorInputParam = dict[str, Any] # pyright: ignore[reportExplicitAny]
Dataset = list[EvaluationRow]
EvaluationTestMode = Literal["pointwise", "groupwise", "all"]
"""
"pointwise": (default) applies test function to each row (rollout result).
"groupwise": applies test function to a group of rollout results from the same original row (for use cases such as dpo/grpo).
"all": applies test function to the whole dataset.
"""
ServerMode = Literal["per_run", "shared"]
"""
"per_run": start a new MCP server for each eval run / training step, only reuse the same server only for retries within that run.
"shared": start a single MCP server the first time it's needed, then reuse that same server across multiple eval runs / training steps.
"""
"""
Test function types
"""
# Type variable for the decorated function
from collections.abc import Awaitable
# TestFunction can be either:
# 1. an async/sync function that accepts EvaluationRow and returns EvaluationRow
# 2. an async/sync function that accepts list[EvaluationRow] and returns list[EvaluationRow]
TestFunction = (
Callable[[], EvaluationRow]
| Callable[[], Awaitable[EvaluationRow]]
| Callable[[], Dataset]
| Callable[[], Awaitable[Dataset]]
| Callable[[EvaluationRow], EvaluationRow]
| Callable[[EvaluationRow], Awaitable[EvaluationRow]]
| Callable[[list[EvaluationRow]], list[EvaluationRow]]
| Callable[[list[EvaluationRow]], Awaitable[list[EvaluationRow]]]
| Callable[[Dataset], Dataset]
| Callable[[Dataset], Awaitable[Dataset]]
)
"""
Rollout processor types
"""
@dataclass
class RolloutProcessorConfig:
    """Configuration handed to a rollout processor.

    Bundles the inference parameters, MCP server wiring, shared concurrency
    control, logging, and optional exception-handling / post-processing hooks
    used while executing rollouts.
    """

    completion_params: CompletionParams # input parameters for inference
    # Path to the MCP configuration file used to launch/connect to servers.
    mcp_config_path: str
    semaphore: asyncio.Semaphore # shared semaphore for unified concurrency control
    server_script_path: str | None = (
        None # TODO: change from server_script_path to mcp_config_path for agent rollout processor
    )
    steps: int = 30 # max number of rollout steps
    logger: DatasetLogger = default_logger # logger to use during rollout for mid-rollout logs
    kwargs: dict[str, Any] = field( # pyright: ignore[reportExplicitAny]
        default_factory=dict
    ) # any additional kwargs to pass to the rollout processor
    exception_handler_config: ExceptionHandlerConfig | None = None # configuration for exception handling with backoff
    post_processor: RolloutResultPostProcessor | None = None # optional post-processor for quality checks