Skip to content

Commit 2dccccf

Browse files
committed
adding in the openai integration
1 parent f409213 commit 2dccccf

File tree

5 files changed

+413
-0
lines changed

5 files changed

+413
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
OpenAI EP Evaluation Adapter
2+
==============================================
3+
4+
To see an end-to-end example of:
5+
- taking an `@evaluation_test` (`rapidfuzz_eval`),
6+
- converting it into a `{"type": "python", "source": ...}` grader spec with
7+
`build_python_grader_from_evaluation_test`, and
8+
- validating and running it against the OpenAI `/graders/*` HTTP APIs,
9+
run:
10+
11+
```bash
12+
pytest eval_protocol/integrations/openai_rft/example_rapidfuzz.py -vs # To show that this works as an EP evaluation_test
13+
14+
python eval_protocol/integrations/openai_rft/test_openai_grader.py
15+
```
16+
17+
You can expect an output like:
18+
19+
```text
20+
(.venv) (base) derekxu@Mac-4147 python-sdk % python eval_protocol/integrations/openai_rft/test_openai_grader.py
21+
validate response: {
22+
"grader": {
23+
"type": "python",
24+
"source": "def _ep_eval(row, **kwargs):\n \"\"\"\n Example @evaluation_test that scores a row using rapidfuzz.WRatio and\n attaches an EvaluateResult.\n \"\"\"\n reference = row.ground_truth\n assistant_msgs = [m for m in row.messages if m.role == 'assistant']\n last_assistant_content = assistant_msgs[-1].content if assistant_msgs else ''\n prediction = last_assistant_content if isinstance(last_assistant_content, str) else ''\n from rapidfuzz import fuzz, utils\n score = float(fuzz.WRatio(str(prediction), str(reference), processor=utils.default_process) / 100.0)\n row.evaluation_result = EvaluateResult(score=score)\n return row\n\n\nfrom typing import Any, Dict\nfrom types import SimpleNamespace\n\n\nclass EvaluationRow(SimpleNamespace):\n \"\"\"Minimal duck-typed stand-in for an evaluation row.\n\n Extend this with whatever attributes your eval logic uses.\n \"\"\"\n pass\n\n\nclass EvaluateResult(SimpleNamespace):\n \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.\n\n This lets evaluation-style functions that construct EvaluateResult(score=...)\n run inside the Python grader sandbox without importing eval_protocol.\n \"\"\"\n\n def __init__(self, score: float, **kwargs: Any) -> None:\n super().__init__(score=score, **kwargs)\n\n\nclass Message(SimpleNamespace):\n \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"\n pass\n\n\ndef _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:\n # Start from any item-provided messages (EP-style), defaulting to [].\n raw_messages = item.get(\"messages\") or []\n normalized_messages = []\n for m in raw_messages:\n if isinstance(m, dict):\n normalized_messages.append(\n Message(\n role=m.get(\"role\"),\n content=m.get(\"content\"),\n )\n )\n else:\n # Already Message-like; rely on duck typing (must have role/content)\n normalized_messages.append(m)\n\n reference = item.get(\"reference_answer\")\n prediction = sample.get(\"output_text\")\n\n # EP-style: ensure the model 
prediction is present as the last assistant message\n if prediction is not None:\n normalized_messages = list(normalized_messages) # shallow copy\n normalized_messages.append(Message(role=\"assistant\", content=prediction))\n\n return EvaluationRow(\n ground_truth=reference,\n messages=normalized_messages,\n item=item,\n sample=sample,\n )\n\n\ndef grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:\n row = _build_row(sample, item)\n result = _ep_eval(row=row)\n\n # Try to normalize different result shapes into a float score\n try:\n from collections.abc import Mapping\n\n if isinstance(result, (int, float)):\n return float(result)\n\n # EvaluateResult-like object with .score\n if hasattr(result, \"score\"):\n return float(result.score)\n\n # EvaluationRow-like object with .evaluation_result.score\n eval_res = getattr(result, \"evaluation_result\", None)\n if eval_res is not None:\n if isinstance(eval_res, Mapping):\n if \"score\" in eval_res:\n return float(eval_res[\"score\"])\n elif hasattr(eval_res, \"score\"):\n return float(eval_res.score)\n\n # Dict-like with score\n if isinstance(result, Mapping) and \"score\" in result:\n return float(result[\"score\"])\n except Exception:\n pass\n\n return 0.0\n",
25+
"name": "grader-VasDqHrerHW5"
26+
}
27+
}
28+
run response: {
29+
"reward": 0.7555555555555555,
30+
"metadata": {
31+
"name": "grader-Bbe0lDBJVP9C",
32+
"type": "python",
33+
"errors": {
34+
"formula_parse_error": false,
35+
"sample_parse_error": false,
36+
"sample_parse_error_details": null,
37+
"truncated_observation_error": false,
38+
"unresponsive_reward_error": false,
39+
"invalid_variable_error": false,
40+
"invalid_variable_error_details": null,
41+
"other_error": false,
42+
"python_grader_server_error": false,
43+
"python_grader_server_error_type": null,
44+
"python_grader_runtime_error": false,
45+
"python_grader_runtime_error_details": null,
46+
"model_grader_server_error": false,
47+
"model_grader_refusal_error": false,
48+
"model_grader_refusal_error_details": null,
49+
"model_grader_parse_error": false,
50+
"model_grader_parse_error_details": null,
51+
"model_grader_exceeded_max_tokens_error": false,
52+
"model_grader_server_error_details": null,
53+
"endpoint_grader_internal_error": false,
54+
"endpoint_grader_internal_error_details": null,
55+
"endpoint_grader_server_error": false,
56+
"endpoint_grader_server_error_details": null,
57+
"endpoint_grader_safety_check_error": false
58+
},
59+
"execution_time": 4.79397988319397,
60+
"scores": {},
61+
"token_usage": null,
62+
"sampled_model_name": null
63+
},
64+
"sub_rewards": {},
65+
"model_grader_token_usage_per_model": {}
66+
}
67+
(.venv) (base) derekxu@Mac-4147 python-sdk %
68+
```
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.
3+
4+
Currently provides:
5+
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
6+
an OpenAI Python grader spec ({"type": "python", "source": ...}).
7+
"""
8+
9+
import ast
10+
import inspect
11+
import textwrap
12+
13+
14+
def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.

    The returned dict has the shape ``{"type": "python", "source": <str>}``.  The
    ``source`` string bundles the evaluation function (decorators and type
    annotations stripped, renamed to ``_ep_eval``) together with duck-typed helper
    shims and a top-level ``grade(sample, item) -> float`` entry point, which is
    what OpenAI's Python grader sandbox invokes.

    Assumptions:
    - `test_fn` is the *core* evaluation function (not the @evaluation_test wrapper),
      or an @evaluation_test-decorated function that carries _origin_func.
      It should have a signature like:

          def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow

    - The function only relies on attributes that the helper `EvaluationRow`
      stand-in provides (extend that class as needed).

    - We map OpenAI's (sample, item) to a duck-typed `row`:
      - item["reference_answer"] -> row.ground_truth
      - sample["output_text"] -> appended as an assistant message
      - raw dicts available as row.item / row.sample

    - The function returns either:
      - a numeric score, or
      - an object/dict with a `score` field, or
      - an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.

    Raises:
        ValueError: if no function definition is found in the recovered source,
            or if the evaluation function is ``async`` — the sandbox ``grade()``
            entry point is synchronous, so calling an async ``_ep_eval`` would
            yield a never-awaited coroutine and silently score 0.0.
    """

    # If the user passed an @evaluation_test wrapper, try to recover the original function
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Get the source of the original function; dedent so that functions defined
    # inside classes or other functions still parse at module level.
    src = inspect.getsource(origin)
    src = textwrap.dedent(src)

    # Parse into AST so we can safely strip decorators and type annotations
    # (the grader sandbox cannot import eval_protocol's types).
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
            # Drop all decorators (e.g., @evaluation_test)
            node.decorator_list = []
            # Remove return type annotation
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
            node.decorator_list = []
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # Remove all parameter annotations (e.g., row: EvaluationRow)
            node.annotation = None
            return node

        def visit_AnnAssign(self, node: ast.AnnAssign) -> ast.AST:
            # Rewrite annotated assignments (e.g., `x: Foo = ...`) as plain
            # assignments so the emitted source never mentions types that do
            # not exist in the sandbox.  Bare annotations (`x: Foo` with no
            # value) are kept as-is: local annotations are never evaluated at
            # runtime, so they are harmless.
            self.generic_visit(node)
            if node.value is not None:
                return ast.Assign(targets=[node.target], value=node.value)
            return node

    transformer = _StripAnnotationsAndDecorators()
    tree = transformer.visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first function definition and rename it to _ep_eval
    func_node: ast.AST | None = None
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func_node = node
            break

    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")

    if isinstance(func_node, ast.AsyncFunctionDef):
        # Fail loudly at conversion time rather than letting the sandbox call
        # an async function and silently grade every sample as 0.0.
        raise ValueError(
            "Async evaluation functions are not supported: the Python grader "
            "sandbox's grade() entry point is synchronous. Please provide a "
            "synchronous evaluation function."
        )

    func_node.name = "_ep_eval"

    # Turn the modified AST back into source
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source
    helper = """
from typing import Any, Dict
from types import SimpleNamespace


class EvaluationRow(SimpleNamespace):
    \"\"\"Minimal duck-typed stand-in for an evaluation row.

    Extend this with whatever attributes your eval logic uses.
    \"\"\"
    pass


class EvaluateResult(SimpleNamespace):
    \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.

    This lets evaluation-style functions that construct EvaluateResult(score=...)
    run inside the Python grader sandbox without importing eval_protocol.
    \"\"\"

    def __init__(self, score: float, **kwargs: Any) -> None:
        super().__init__(score=score, **kwargs)


class Message(SimpleNamespace):
    \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
    pass


def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
    # Start from any item-provided messages (EP-style), defaulting to [].
    raw_messages = item.get("messages") or []
    normalized_messages = []
    for m in raw_messages:
        if isinstance(m, dict):
            normalized_messages.append(
                Message(
                    role=m.get("role"),
                    content=m.get("content"),
                )
            )
        else:
            # Already Message-like; rely on duck typing (must have role/content)
            normalized_messages.append(m)

    reference = item.get("reference_answer")
    prediction = sample.get("output_text")

    # EP-style: ensure the model prediction is present as the last assistant message
    if prediction is not None:
        normalized_messages = list(normalized_messages)  # shallow copy
        normalized_messages.append(Message(role="assistant", content=prediction))

    return EvaluationRow(
        ground_truth=reference,
        messages=normalized_messages,
        item=item,
        sample=sample,
    )


def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
    row = _build_row(sample, item)
    result = _ep_eval(row=row)

    # Try to normalize different result shapes into a float score
    try:
        from collections.abc import Mapping

        if isinstance(result, (int, float)):
            return float(result)

        # EvaluateResult-like object with .score
        if hasattr(result, "score"):
            return float(result.score)

        # EvaluationRow-like object with .evaluation_result.score
        eval_res = getattr(result, "evaluation_result", None)
        if eval_res is not None:
            if isinstance(eval_res, Mapping):
                if "score" in eval_res:
                    return float(eval_res["score"])
            elif hasattr(eval_res, "score"):
                return float(eval_res.score)

        # Dict-like with score
        if isinstance(result, Mapping) and "score" in result:
            return float(result["score"])
    except Exception:
        pass

    return 0.0
"""

    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
2+
Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.
3+
4+
We:
5+
- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
6+
- Wrap it in an @evaluation_test for normal eval usage
7+
- Convert the grading function into a Python grader spec with
8+
`build_python_grader_from_evaluation_test`
9+
"""
10+
11+
from typing import Any
12+
13+
from eval_protocol.integrations.openai_rft.adapter import build_python_grader_from_evaluation_test
14+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
15+
from eval_protocol.pytest import evaluation_test
16+
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
17+
18+
19+
# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
# A single row; rapidfuzz_eval below compares the last assistant message
# against ground_truth.
DEMO_ROWS = [
    EvaluationRow(
        messages=[
            Message(role="user", content="fuzzy wuzzy had no hair"),
            Message(role="assistant", content="fuzzy wuzzy was a bear"),
        ],
        ground_truth="fuzzy wuzzy had no hair",
    )
]
29+
30+
31+
@evaluation_test(
    input_rows=[DEMO_ROWS],
    rollout_processor=NoOpRolloutProcessor(),
    aggregation_method="mean",
    mode="pointwise",
)
def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
    """
    Example @evaluation_test that fuzzy-matches the row's most recent assistant
    message against its ground_truth using rapidfuzz.WRatio (normalized to
    [0, 1]) and attaches the score as an EvaluateResult.
    """
    from rapidfuzz import fuzz, utils

    expected = row.ground_truth

    # Walk the messages from the end to find the latest assistant turn; a
    # missing turn or non-string content degrades to the empty string.
    predicted = ""
    for message in reversed(row.messages):
        if message.role == "assistant":
            if isinstance(message.content, str):
                predicted = message.content
            break

    similarity = fuzz.WRatio(
        str(predicted),
        str(expected),
        processor=utils.default_process,
    )
    row.evaluation_result = EvaluateResult(score=float(similarity / 100.0))
    return row
61+
62+
63+
# Grader spec ({"type": "python", "source": ...}) derived from the evaluation
# above; suitable for POSTing to the OpenAI graders validate/run endpoints.
RAPIDFUZZ_PYTHON_GRADER_SPEC: dict = build_python_grader_from_evaluation_test(rapidfuzz_eval)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import requests
3+
4+
from eval_protocol.integrations.openai_rft.adapter import build_python_grader_from_evaluation_test
5+
from eval_protocol.integrations.openai_rft.example_rapidfuzz import rapidfuzz_eval
6+
7+
8+
api_key = os.environ["OPENAI_API_KEY"]
9+
headers = {"Authorization": f"Bearer {api_key}"}
10+
11+
grader = build_python_grader_from_evaluation_test(rapidfuzz_eval) # {"type": "python", "source": "..."}
12+
13+
# validate the grader
14+
resp = requests.post(
15+
"https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
16+
json={"grader": grader},
17+
headers=headers,
18+
)
19+
print("validate response:", resp.text)
20+
21+
# run the grader once with a dummy item/sample
22+
payload = {
23+
"grader": grader,
24+
"item": {"reference_answer": "fuzzy wuzzy had no hair"},
25+
"model_sample": "fuzzy wuzzy was a bear",
26+
}
27+
resp = requests.post(
28+
"https://api.openai.com/v1/fine_tuning/alpha/graders/run",
29+
json=payload,
30+
headers=headers,
31+
)
32+
print("run response:", resp.text)

0 commit comments

Comments
 (0)