-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathtypes.py
More file actions
79 lines (66 loc) · 3.08 KB
/
types.py
File metadata and controls
79 lines (66 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Parameter types
"""
import asyncio
from collections.abc import Awaitable
from dataclasses import dataclass, field
from typing import Any, Callable, Literal

from eval_protocol.dataset_logger import default_logger
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger

from ..models import CompletionParams, EvaluationRow, Message
from .exception_config import ExceptionHandlerConfig
from .rollout_result_post_processor import RolloutResultPostProcessor
ModelParam = str # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
DatasetPathParam = str
InputMessagesParam = list[Message]
EvaluationInputParam = dict[str, Any] # pyright: ignore[reportExplicitAny]
RolloutProcessorInputParam = dict[str, Any] # pyright: ignore[reportExplicitAny]
Dataset = list[EvaluationRow]
EvaluationTestMode = Literal["pointwise", "groupwise", "all"]
"""
"pointwise": (default) applies test function to each row (rollout result).
"groupwise": applies test function to a group of rollout results from the same original row (for use cases such as dpo/grpo).
"all": applies test function to the whole dataset.
"""
ServerMode = Literal["per_run", "shared"]
"""
"per_run": start a new MCP server for each eval run / training step, only reuse the same server only for retries within that run.
"shared": start a single MCP server the first time it's needed, then reuse that same server across multiple eval runs / training steps.
"""
"""
Test function types
"""
# Type variable for the decorated function
from collections.abc import Awaitable
# TestFunction can be either:
# 1. an async/sync function that accepts EvaluationRow and returns EvaluationRow
# 2. an async/sync function that accepts list[EvaluationRow] and returns list[EvaluationRow]
TestFunction = (
Callable[[], EvaluationRow]
| Callable[[], Awaitable[EvaluationRow]]
| Callable[[], Dataset]
| Callable[[], Awaitable[Dataset]]
| Callable[[EvaluationRow], EvaluationRow]
| Callable[[EvaluationRow], Awaitable[EvaluationRow]]
| Callable[[list[EvaluationRow]], list[EvaluationRow]]
| Callable[[list[EvaluationRow]], Awaitable[list[EvaluationRow]]]
| Callable[[Dataset], Dataset]
| Callable[[Dataset], Awaitable[Dataset]]
)
"""
Rollout processor types
"""
@dataclass
class RolloutProcessorConfig:
    """Configuration handed to a rollout processor.

    Bundles the inference parameters, MCP server wiring, shared concurrency
    control, logging, and optional exception-handling / post-processing hooks
    used while executing rollouts.
    """

    completion_params: CompletionParams # input parameters for inference
    # Path to the MCP configuration file used to launch/connect to servers.
    mcp_config_path: str
    semaphore: asyncio.Semaphore # shared semaphore for unified concurrency control
    server_script_path: str | None = (
        None # TODO: change from server_script_path to mcp_config_path for agent rollout processor
    )
    steps: int = 30 # max number of rollout steps
    logger: DatasetLogger = default_logger # logger to use during rollout for mid-rollout logs
    kwargs: dict[str, Any] = field( # pyright: ignore[reportExplicitAny]
        default_factory=dict
    ) # any additional kwargs to pass to the rollout processor
    exception_handler_config: ExceptionHandlerConfig | None = None # configuration for exception handling with backoff
    post_processor: RolloutResultPostProcessor | None = None # optional post-processor for quality checks