Skip to content

Commit 3b38996

Browse files
authored
Merge branch 'main' into shrey/OpenEnvRolloutProcessor
2 parents 9766c5d + f10c29f commit 3b38996

30 files changed

+4032
-25
lines changed

docs/intro.png

-117 KB
Loading

eval_protocol/benchmarks/test_glm_streaming_compliance.py

Lines changed: 3551 additions & 0 deletions
Large diffs are not rendered by default.

eval_protocol/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def parse_args(args=None):
402402
rft_parser.add_argument("--lora-rank", type=int, default=16)
403403
rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps")
404404
rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps")
405-
rft_parser.add_argument("--accelerator-count", type=int, default=1)
405+
rft_parser.add_argument("--accelerator-count", type=int)
406406
rft_parser.add_argument("--region", help="Fireworks region enum value")
407407
rft_parser.add_argument("--display-name", help="RFT job display name")
408408
rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id")

eval_protocol/integrations/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22

33
from .openeval import adapt
44
from .trl import create_trl_adapter
5+
from .openai_rft import build_python_grader_from_evaluation_test
56

6-
__all__ = [
7-
"adapt",
8-
"create_trl_adapter",
9-
]
7+
__all__ = ["adapt", "create_trl_adapter", "build_python_grader_from_evaluation_test"]
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""
2+
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.
3+
4+
Currently provides:
5+
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
6+
an OpenAI Python grader spec ({"type": "python", "source": ...}).
7+
"""
8+
9+
import ast
10+
import inspect
11+
import textwrap
12+
13+
14+
def build_python_grader_from_evaluation_test(test_fn) -> dict:
15+
"""
16+
Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.
17+
18+
Assumptions:
19+
- `test_fn` is either:
20+
* the core evaluation function, or
21+
* an @evaluation_test-decorated function that carries `_origin_func`.
22+
Its effective signature looks like:
23+
24+
def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow
25+
26+
- The function treats `row` as an `EvaluationRow` and only relies on attributes
27+
we provide in the duck-typed stand-in:
28+
* row.ground_truth
29+
* row.messages
30+
* row.item (raw item dict)
31+
* row.sample (raw sample dict)
32+
33+
- We map OpenAI's (sample, item) into that duck-typed `EvaluationRow` as follows:
34+
* item["reference_answer"] -> row.ground_truth
35+
* item["messages"] (if present) -> row.messages (normalized to Message-like objects)
36+
* sample["output_text"] -> appended as the last assistant message in row.messages
37+
* the original dicts are also available via row.item / row.sample
38+
39+
- The function returns either:
40+
* a numeric score, or
41+
* an object/dict with a `score` field, or
42+
* an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.
43+
"""
44+
45+
# If the user passed an @evaluation_test wrapper, try to recover the original function
46+
origin = getattr(test_fn, "_origin_func", test_fn)
47+
48+
# Get the source of the original function
49+
src = inspect.getsource(origin)
50+
src = textwrap.dedent(src)
51+
52+
# Parse into AST so we can safely strip decorators and type annotations
53+
tree = ast.parse(src)
54+
55+
class _StripAnnotationsAndDecorators(ast.NodeTransformer):
56+
def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
57+
# Drop all decorators (e.g., @evaluation_test)
58+
node.decorator_list = []
59+
# Remove return type annotation
60+
node.returns = None
61+
self.generic_visit(node)
62+
return node
63+
64+
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
65+
node.decorator_list = []
66+
node.returns = None
67+
self.generic_visit(node)
68+
return node
69+
70+
def visit_arg(self, node: ast.arg) -> ast.AST:
71+
# Remove all parameter annotations (e.g., row: EvaluationRow)
72+
node.annotation = None
73+
return node
74+
75+
transformer = _StripAnnotationsAndDecorators()
76+
tree = transformer.visit(tree)
77+
ast.fix_missing_locations(tree)
78+
79+
# Find the first function definition and rename it to _ep_eval
80+
func_node: ast.AST | None = None
81+
for node in tree.body:
82+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
83+
func_node = node
84+
break
85+
86+
if func_node is None:
87+
raise ValueError("Expected a function definition in test_fn source.")
88+
89+
func_node.name = "_ep_eval"
90+
91+
# Turn the modified AST back into source
92+
src = ast.unparse(tree)
93+
94+
# Helper code that will live *inside* the grader source
95+
helper = """
96+
from typing import Any, Dict
97+
from types import SimpleNamespace
98+
99+
100+
class EvaluationRow(SimpleNamespace):
101+
\"\"\"Minimal duck-typed stand-in for an evaluation row.
102+
103+
Extend this with whatever attributes your eval logic uses.
104+
\"\"\"
105+
pass
106+
107+
108+
class EvaluateResult(SimpleNamespace):
109+
\"\"\"Simple stand-in for Eval Protocol's EvaluateResult.
110+
111+
This lets evaluation-style functions that construct EvaluateResult(score=...)
112+
run inside the Python grader sandbox without importing eval_protocol.
113+
\"\"\"
114+
115+
def __init__(self, score: float, **kwargs: Any) -> None:
116+
super().__init__(score=score, **kwargs)
117+
118+
119+
class Message(SimpleNamespace):
120+
\"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
121+
pass
122+
123+
124+
def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
125+
# Start from any item-provided messages (EP-style), defaulting to [].
126+
raw_messages = item.get("messages") or []
127+
normalized_messages = []
128+
for m in raw_messages:
129+
if isinstance(m, dict):
130+
normalized_messages.append(
131+
Message(
132+
role=m.get("role"),
133+
content=m.get("content"),
134+
)
135+
)
136+
else:
137+
# Already Message-like; rely on duck typing (must have role/content)
138+
normalized_messages.append(m)
139+
140+
reference = item.get("reference_answer")
141+
prediction = sample.get("output_text")
142+
143+
# EP-style: ensure the model prediction is present as the last assistant message
144+
if prediction is not None:
145+
normalized_messages = list(normalized_messages) # shallow copy
146+
normalized_messages.append(Message(role="assistant", content=prediction))
147+
148+
return EvaluationRow(
149+
ground_truth=reference,
150+
messages=normalized_messages,
151+
item=item,
152+
sample=sample,
153+
)
154+
155+
156+
def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
157+
row = _build_row(sample, item)
158+
result = _ep_eval(row=row)
159+
160+
# Try to normalize different result shapes into a float score
161+
try:
162+
from collections.abc import Mapping
163+
164+
if isinstance(result, (int, float)):
165+
return float(result)
166+
167+
# EvaluateResult-like object with .score
168+
if hasattr(result, "score"):
169+
return float(result.score)
170+
171+
# EvaluationRow-like object with .evaluation_result.score
172+
eval_res = getattr(result, "evaluation_result", None)
173+
if eval_res is not None:
174+
if isinstance(eval_res, Mapping):
175+
if "score" in eval_res:
176+
return float(eval_res["score"])
177+
elif hasattr(eval_res, "score"):
178+
return float(eval_res.score)
179+
180+
# Dict-like with score
181+
if isinstance(result, Mapping) and "score" in result:
182+
return float(result["score"])
183+
except Exception:
184+
pass
185+
186+
return 0.0
187+
"""
188+
189+
full_source = src + "\n\n" + textwrap.dedent(helper)
190+
return {"type": "python", "source": full_source}

eval_protocol/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -782,6 +782,14 @@ class ExecutionMetadata(BaseModel):
782782
extra: Optional[Dict[str, Any]] = Field(
783783
default=None,
784784
description="Arbitrary execution metadata for integrations (step rewards, token IDs, debug info, etc.).",
785+
finish_reason: Optional[str] = Field(
786+
default=None,
787+
description="finish_reason reported by the completion response for this row.",
788+
)
789+
790+
tool_call_count: Optional[int] = Field(
791+
default=None,
792+
description="Number of tool calls returned in the assistant message for this row.",
785793
)
786794

787795

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
import json
23
import logging
34
import os
45
import time
@@ -98,8 +99,24 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
9899
assert isinstance(response, ModelResponse), "Response should be ModelResponse"
99100
assert isinstance(response.choices[0], Choices), "Response choice should be a Choices"
100101

101-
assistant_content = response.choices[0].message.content or ""
102-
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
102+
assistant_message = response.choices[0].message
103+
finish_reason = getattr(response.choices[0], "finish_reason", None)
104+
105+
# Extract content
106+
assistant_content = assistant_message.content or ""
107+
108+
# Extract reasoning content (if present)
109+
reasoning_content = getattr(assistant_message, "reasoning_content", None)
110+
if reasoning_content is None:
111+
reasoning_content = getattr(assistant_message, "reasoning", None)
112+
if reasoning_content is not None and not isinstance(reasoning_content, str):
113+
try:
114+
reasoning_content = json.dumps(reasoning_content)
115+
except Exception:
116+
reasoning_content = str(reasoning_content)
117+
118+
# Extract tool calls
119+
tool_calls = assistant_message.tool_calls if assistant_message.tool_calls else None
103120

104121
converted_tool_calls = None
105122
if tool_calls:
@@ -136,9 +153,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
136153
Message(
137154
role="assistant",
138155
content=assistant_content,
156+
reasoning_content=reasoning_content,
139157
tool_calls=converted_tool_calls,
140158
)
141159
]
160+
161+
row.execution_metadata.finish_reason = str(finish_reason) if finish_reason is not None else None
162+
row.execution_metadata.tool_call_count = (
163+
len(converted_tool_calls) if converted_tool_calls is not None else 0
164+
)
142165
row.execution_metadata.usage = (
143166
CompletionUsage( # Note: LiteLLM sets usage dynamically via setattr(), not as a typed field
144167
prompt_tokens=response.usage.prompt_tokens, # pyright: ignore[reportAttributeAccessIssue]

eval_protocol/pytest/evaluation_test.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,13 @@ async def _collect_result(config, lst):
620620

621621
experiment_duration_seconds = time.perf_counter() - experiment_start_time
622622

623-
# for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them
623+
if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
624+
raise AssertionError(
625+
"Some EvaluationRow instances are missing evaluation_result. "
626+
"Your @evaluation_test function must set `row.evaluation_result`"
627+
)
628+
629+
# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
624630
# rollout_id is used to differentiate the result from different completion_params
625631
if mode == "groupwise":
626632
results_by_group = [
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.
3+
4+
We:
5+
- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
6+
- Wrap it in an @evaluation_test for normal eval usage
7+
- Convert the grading function into a Python grader spec with
8+
`build_python_grader_from_evaluation_test`
9+
"""
10+
11+
from typing import Any
12+
13+
from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
14+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
15+
from eval_protocol.pytest import evaluation_test
16+
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
17+
18+
19+
# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
20+
DEMO_ROWS = [
21+
EvaluationRow(
22+
messages=[
23+
Message(role="user", content="fuzzy wuzzy had no hair"),
24+
Message(role="assistant", content="fuzzy wuzzy was a bear"),
25+
],
26+
ground_truth="fuzzy wuzzy had no hair",
27+
)
28+
]
29+
30+
31+
@evaluation_test(
32+
input_rows=[DEMO_ROWS],
33+
rollout_processor=NoOpRolloutProcessor(),
34+
aggregation_method="mean",
35+
mode="pointwise",
36+
)
37+
def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
38+
"""
39+
Example @evaluation_test that scores a row using rapidfuzz.WRatio and
40+
attaches an EvaluateResult.
41+
"""
42+
# For EP evals, we compare the EvaluationRow's ground_truth to the last assistant message.
43+
reference = row.ground_truth
44+
45+
assistant_msgs = [m for m in row.messages if m.role == "assistant"]
46+
last_assistant_content = assistant_msgs[-1].content if assistant_msgs else ""
47+
prediction = last_assistant_content if isinstance(last_assistant_content, str) else ""
48+
49+
from rapidfuzz import fuzz, utils
50+
51+
score = float(
52+
fuzz.WRatio(
53+
str(prediction),
54+
str(reference),
55+
processor=utils.default_process,
56+
)
57+
/ 100.0
58+
)
59+
row.evaluation_result = EvaluateResult(score=score)
60+
return row
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""Smoke-test a generated Python grader against OpenAI's grader endpoints.

Builds a Python grader spec from the rapidfuzz example eval, validates it via
the fine_tuning/alpha/graders/validate endpoint, then runs it once with a
dummy item/sample. Requires OPENAI_API_KEY in the environment.
"""

import os

import requests

from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
from examples.openai_rft.example_rapidfuzz import rapidfuzz_eval


# Fails fast with KeyError if the API key is not configured.
api_key = os.environ["OPENAI_API_KEY"]
headers = {"Authorization": f"Bearer {api_key}"}

grader = build_python_grader_from_evaluation_test(rapidfuzz_eval)  # {"type": "python", "source": "..."}

# validate the grader spec server-side before attempting to run it
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
    json={"grader": grader},
    headers=headers,
)
print("validate response:", resp.text)

# run the grader once with a dummy item/sample
# NOTE(review): item/sample keys mirror what openai_rft._build_row reads
# (reference_answer, output_text derived from model_sample) — confirm against
# the OpenAI graders API payload schema.
payload = {
    "grader": grader,
    "item": {"reference_answer": "fuzzy wuzzy had no hair"},
    "model_sample": "fuzzy wuzzy was a bear",
}
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/run",
    json=payload,
    headers=headers,
)
print("run response:", resp.text)

0 commit comments

Comments
 (0)