Skip to content

Commit 2d46f12

Browse files
authored
adding in the openai integration (#338)
* adding in the openai integration * move to not be in package * update path * add to export * remove unneeded
1 parent ca8b2e8 commit 2d46f12

File tree

5 files changed

+350
-4
lines changed

5 files changed

+350
-4
lines changed

eval_protocol/integrations/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22

33
from .openeval import adapt
44
from .trl import create_trl_adapter
5+
from .openai_rft import build_python_grader_from_evaluation_test
56

6-
__all__ = [
7-
"adapt",
8-
"create_trl_adapter",
9-
]
7+
__all__ = ["adapt", "create_trl_adapter", "build_python_grader_from_evaluation_test"]
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""
2+
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.
3+
4+
Currently provides:
5+
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
6+
an OpenAI Python grader spec ({"type": "python", "source": ...}).
7+
"""
8+
9+
import ast
10+
import inspect
11+
import textwrap
12+
13+
14+
def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.

    The function's source is recovered with ``inspect.getsource``, decorators and
    type annotations are stripped (so the grader sandbox needs no ``eval_protocol``
    imports), the function is renamed to ``_ep_eval``, and a duck-typed runtime
    shim (EvaluationRow / EvaluateResult / Message plus ``grade``) is appended.

    Assumptions:
    - `test_fn` is either:
        * the core evaluation function, or
        * an @evaluation_test-decorated function that carries `_origin_func`.
      Its effective signature looks like:

          def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow

    - The function treats `row` as an `EvaluationRow` and only relies on attributes
      we provide in the duck-typed stand-in:
        * row.ground_truth
        * row.messages
        * row.item (raw item dict)
        * row.sample (raw sample dict)

    - We map OpenAI's (sample, item) into that duck-typed `EvaluationRow` as follows:
        * item["reference_answer"] -> row.ground_truth
        * item["messages"] (if present) -> row.messages (normalized to Message-like objects)
        * sample["output_text"] -> appended as the last assistant message in row.messages
        * the original dicts are also available via row.item / row.sample

    - The function returns either:
        * a numeric score, or
        * an object/dict with a `score` field, or
        * an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.

    Raises:
        ValueError: if the source contains no function definition, or if the
            evaluation function is ``async`` — the generated ``grade`` calls it
            synchronously, so a coroutine result would silently score 0.0.
    """
    # If the user passed an @evaluation_test wrapper, try to recover the original function
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Get the (dedented) source of the original function
    src = textwrap.dedent(inspect.getsource(origin))

    # Parse into AST so we can safely strip decorators and type annotations
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        """Drop decorators and annotations so the emitted source has no
        dependency on names (EvaluationRow, evaluation_test, ...) that do
        not exist inside OpenAI's Python grader sandbox."""

        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
            # Drop all decorators (e.g., @evaluation_test) and the return annotation
            node.decorator_list = []
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
            node.decorator_list = []
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # Remove all parameter annotations (e.g., row: EvaluationRow)
            node.annotation = None
            return node

    tree = _StripAnnotationsAndDecorators().visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first function definition and rename it to _ep_eval
    func_node: ast.FunctionDef | ast.AsyncFunctionDef | None = None
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func_node = node
            break

    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")
    if isinstance(func_node, ast.AsyncFunctionDef):
        # grade() below invokes _ep_eval synchronously; an async eval would
        # return a coroutine, fail every normalization branch, and silently
        # score 0.0 for every sample — fail loudly instead.
        raise ValueError("Async evaluation functions are not supported by the Python grader.")

    func_node.name = "_ep_eval"

    # Turn the modified AST back into source
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source
    helper = """
from typing import Any, Dict
from types import SimpleNamespace


class EvaluationRow(SimpleNamespace):
    \"\"\"Minimal duck-typed stand-in for an evaluation row.

    Extend this with whatever attributes your eval logic uses.
    \"\"\"
    pass


class EvaluateResult(SimpleNamespace):
    \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.

    This lets evaluation-style functions that construct EvaluateResult(score=...)
    run inside the Python grader sandbox without importing eval_protocol.
    \"\"\"

    def __init__(self, score: float, **kwargs: Any) -> None:
        super().__init__(score=score, **kwargs)


class Message(SimpleNamespace):
    \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
    pass


def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
    # Start from any item-provided messages (EP-style), defaulting to [].
    raw_messages = item.get("messages") or []
    normalized_messages = []
    for m in raw_messages:
        if isinstance(m, dict):
            normalized_messages.append(
                Message(
                    role=m.get("role"),
                    content=m.get("content"),
                )
            )
        else:
            # Already Message-like; rely on duck typing (must have role/content)
            normalized_messages.append(m)

    reference = item.get("reference_answer")
    prediction = sample.get("output_text")

    # EP-style: ensure the model prediction is present as the last assistant message
    if prediction is not None:
        normalized_messages = list(normalized_messages)  # shallow copy
        normalized_messages.append(Message(role="assistant", content=prediction))

    return EvaluationRow(
        ground_truth=reference,
        messages=normalized_messages,
        item=item,
        sample=sample,
    )


def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
    row = _build_row(sample, item)
    result = _ep_eval(row=row)

    # Try to normalize different result shapes into a float score
    try:
        from collections.abc import Mapping

        if isinstance(result, (int, float)):
            return float(result)

        # EvaluateResult-like object with .score
        if hasattr(result, "score"):
            return float(result.score)

        # EvaluationRow-like object with .evaluation_result.score
        eval_res = getattr(result, "evaluation_result", None)
        if eval_res is not None:
            if isinstance(eval_res, Mapping):
                if "score" in eval_res:
                    return float(eval_res["score"])
            elif hasattr(eval_res, "score"):
                return float(eval_res.score)

        # Dict-like with score
        if isinstance(result, Mapping) and "score" in result:
            return float(result["score"])
    except Exception:
        pass

    return 0.0
"""

    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.
3+
4+
We:
5+
- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
6+
- Wrap it in an @evaluation_test for normal eval usage
7+
- Convert the grading function into a Python grader spec with
8+
`build_python_grader_from_evaluation_test`
9+
"""
10+
11+
from typing import Any
12+
13+
from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
14+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
15+
from eval_protocol.pytest import evaluation_test
16+
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
17+
18+
19+
# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
# One row only: the assistant reply differs slightly from ground_truth, so
# the fuzzy-match score is high but below 1.0.
DEMO_ROWS = [
    EvaluationRow(
        messages=[
            Message(role="user", content="fuzzy wuzzy had no hair"),
            Message(role="assistant", content="fuzzy wuzzy was a bear"),
        ],
        ground_truth="fuzzy wuzzy had no hair",
    )
]
29+
30+
31+
@evaluation_test(
    input_rows=[DEMO_ROWS],
    rollout_processor=NoOpRolloutProcessor(),
    aggregation_method="mean",
    mode="pointwise",
)
def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
    """
    Example @evaluation_test that scores a row with rapidfuzz.WRatio.

    Compares the row's ground_truth to the content of the most recent
    assistant message, stores the similarity (normalized to [0, 1]) on
    row.evaluation_result, and returns the mutated row.
    """
    from rapidfuzz import fuzz, utils

    expected = row.ground_truth

    # Walk the transcript backwards to find the last assistant turn.
    candidate = ""
    for message in reversed(row.messages):
        if message.role == "assistant":
            candidate = message.content
            break
    if not isinstance(candidate, str):
        # Non-string content (e.g. missing/structured) is treated as empty.
        candidate = ""

    # WRatio yields 0-100; scale into [0, 1] for EvaluateResult.
    similarity = fuzz.WRatio(
        str(candidate),
        str(expected),
        processor=utils.default_process,
    )
    row.evaluation_result = EvaluateResult(score=float(similarity) / 100.0)
    return row
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""Smoke-test the generated Python grader against OpenAI's alpha grader endpoints.

Requires OPENAI_API_KEY in the environment (raises KeyError if unset).
Performs network I/O and prints the raw HTTP response bodies.
"""
import os
import requests

from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
from examples.openai_rft.example_rapidfuzz import rapidfuzz_eval


# Bearer-token auth header reused by both requests below.
api_key = os.environ["OPENAI_API_KEY"]
headers = {"Authorization": f"Bearer {api_key}"}

grader = build_python_grader_from_evaluation_test(rapidfuzz_eval)  # {"type": "python", "source": "..."}

# validate the grader (server-side checks on the grader spec)
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
    json={"grader": grader},
    headers=headers,
)
print("validate response:", resp.text)

# run the grader once with a dummy item/sample
payload = {
    "grader": grader,
    "item": {"reference_answer": "fuzzy wuzzy had no hair"},
    "model_sample": "fuzzy wuzzy was a bear",
}
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/run",
    json=payload,
    headers=headers,
)
print("run response:", resp.text)
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import types
2+
from typing import Any, Dict, Callable
3+
4+
from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
5+
from eval_protocol.models import EvaluationRow
6+
7+
8+
def _exec_and_get_grade(source: str) -> Callable[[Dict[str, Any], Dict[str, Any]], float]:
9+
"""Execute generated grader source and return the grade(sample, item) function."""
10+
ns: Dict[str, Any] = {}
11+
exec(source, ns, ns)
12+
grade_obj = ns.get("grade")
13+
assert isinstance(grade_obj, types.FunctionType)
14+
return grade_obj
15+
16+
17+
def test_build_python_grader_from_plain_eval_function():
    """Plain eval-style function should be converted into a working grade(sample, item)."""

    # Simulate an eval-style function with annotations
    def my_eval(row: EvaluationRow, **kwargs: Any) -> float:
        # Score 1.0 when ground_truth matches sample["output_text"], else 0.0.
        expected = getattr(row, "ground_truth", None)
        raw_sample = getattr(row, "sample", None) or {}
        return 1.0 if expected == raw_sample.get("output_text") else 0.0

    spec = build_python_grader_from_evaluation_test(my_eval)
    assert spec["type"] == "python"
    generated = spec["source"]

    # Structural sanity: annotations, original name, and decorators are gone.
    assert '"EvaluationRow"' not in generated
    assert "def _ep_eval" in generated
    assert "def my_eval" not in generated
    assert "@evaluation_test" not in generated

    grade = _exec_and_get_grade(generated)

    value = grade({"output_text": "42"}, {"reference_answer": "42"})
    assert isinstance(value, float)
    assert value == 1.0
45+
46+
47+
def test_build_python_grader_from_wrapped_evaluation_test():
    """When the function is wrapped and carries _origin_func, we should use the origin."""

    def original_eval(row, **kwargs: Any) -> float:
        return 0.5

    def wrapper(*args: Any, **kwargs: Any) -> float:
        return original_eval(*args, **kwargs)

    # Simulate @evaluation_test attaching _origin_func
    wrapper._origin_func = original_eval  # type: ignore[attr-defined]

    spec = build_python_grader_from_evaluation_test(wrapper)
    assert spec["type"] == "python"

    grade = _exec_and_get_grade(spec["source"])
    result = grade({"output_text": "anything"}, {"reference_answer": "anything"})
    assert isinstance(result, float)
    assert result == 0.5

0 commit comments

Comments
 (0)