Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions eval_protocol/integrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

from .openeval import adapt
from .trl import create_trl_adapter
from .openai_rft import build_python_grader_from_evaluation_test

__all__ = [
"adapt",
"create_trl_adapter",
]
__all__ = ["adapt", "create_trl_adapter", "build_python_grader_from_evaluation_test"]
190 changes: 190 additions & 0 deletions eval_protocol/integrations/openai_rft.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
"""
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.

Currently provides:
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
an OpenAI Python grader spec ({"type": "python", "source": ...}).
"""

import ast
import inspect
import textwrap


def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.

    The returned dict has the shape ``{"type": "python", "source": <str>}`` where
    ``source`` contains:
      * the user's evaluation function, stripped of decorators and type
        annotations and renamed to ``_ep_eval``, and
      * helper shims (duck-typed EvaluationRow / EvaluateResult / Message
        stand-ins) plus the ``grade(sample, item) -> float`` entry point that
        OpenAI's grader sandbox calls.

    Assumptions:
    - ``test_fn`` is either the core evaluation function or an
      @evaluation_test-decorated wrapper that carries ``_origin_func``. Its
      effective signature looks like ``def my_eval(row, **kwargs)``.
    - The function only relies on the duck-typed row attributes we provide:
      ``row.ground_truth``, ``row.messages``, ``row.item``, ``row.sample``.
    - OpenAI's (sample, item) pair is mapped as:
      ``item["reference_answer"]`` -> ``row.ground_truth``;
      ``item["messages"]`` (if present) -> ``row.messages`` (normalized to
      Message-like objects); ``sample["output_text"]`` -> appended as the last
      assistant message; the raw dicts stay reachable via ``row.item`` /
      ``row.sample``.
    - The function returns a numeric score, an object/dict with a ``score``
      field, or an EvaluationRow-like object with ``.evaluation_result.score``.

    Raises:
        ValueError: if no function definition is found in the recovered source.
        OSError, TypeError: propagated from ``inspect.getsource`` when the
            source of ``test_fn`` is unavailable (e.g. defined in a REPL).
    """
    # If the user passed an @evaluation_test wrapper, recover the original function.
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Grab and normalize the source so it parses standalone.
    src = textwrap.dedent(inspect.getsource(origin))

    # Parse into an AST so we can safely strip decorators and type annotations
    # (they may reference eval_protocol names unavailable in the grader sandbox).
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        """Drop decorators, return annotations, and parameter annotations."""

        def _scrub(self, node):
            # Shared cleanup for sync and async function definitions.
            node.decorator_list = []
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
            return self._scrub(node)

        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
            return self._scrub(node)

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # e.g. ``row: EvaluationRow`` -> ``row``
            node.annotation = None
            return node

    tree = _StripAnnotationsAndDecorators().visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first function definition and rename it to _ep_eval so the
    # helper's grade() can call it under a fixed name.
    func_node = next(
        (n for n in tree.body if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))),
        None,
    )
    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")
    func_node.name = "_ep_eval"

    # Turn the modified AST back into source (comments are dropped by design).
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source.
    helper = '''
from collections.abc import Mapping
from types import SimpleNamespace
from typing import Any, Dict


class EvaluationRow(SimpleNamespace):
    """Minimal duck-typed stand-in for an evaluation row.

    Extend this with whatever attributes your eval logic uses.
    """


class EvaluateResult(SimpleNamespace):
    """Simple stand-in for Eval Protocol's EvaluateResult.

    This lets evaluation-style functions that construct EvaluateResult(score=...)
    run inside the Python grader sandbox without importing eval_protocol.
    """

    def __init__(self, score: float, **kwargs: Any) -> None:
        super().__init__(score=score, **kwargs)


class Message(SimpleNamespace):
    """Duck-typed stand-in for eval_protocol.models.Message (role/content)."""


def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
    """Map OpenAI's (sample, item) pair onto a duck-typed EvaluationRow."""
    # Start from any item-provided messages (EP-style), defaulting to [].
    raw_messages = item.get("messages") or []
    normalized_messages = []
    for m in raw_messages:
        if isinstance(m, dict):
            normalized_messages.append(
                Message(role=m.get("role"), content=m.get("content"))
            )
        else:
            # Already Message-like; rely on duck typing (must have role/content).
            normalized_messages.append(m)

    reference = item.get("reference_answer")
    prediction = sample.get("output_text")

    # EP-style: ensure the model prediction is present as the last assistant message.
    if prediction is not None:
        normalized_messages = list(normalized_messages)  # shallow copy
        normalized_messages.append(Message(role="assistant", content=prediction))

    return EvaluationRow(
        ground_truth=reference,
        messages=normalized_messages,
        item=item,
        sample=sample,
    )


def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
    """Entry point called by OpenAI's grader sandbox; never raises."""
    # Keep row construction and the user eval call *inside* the try so a
    # crashing evaluation degrades to a 0.0 score instead of failing the
    # whole grading job.
    try:
        row = _build_row(sample, item)
        result = _ep_eval(row=row)

        # Normalize the different supported result shapes into a float score.
        if isinstance(result, (int, float)):
            return float(result)

        # EvaluateResult-like object with .score
        if hasattr(result, "score"):
            return float(result.score)

        # EvaluationRow-like object with .evaluation_result.score
        eval_res = getattr(result, "evaluation_result", None)
        if eval_res is not None:
            if isinstance(eval_res, Mapping):
                if "score" in eval_res:
                    return float(eval_res["score"])
            elif hasattr(eval_res, "score"):
                return float(eval_res.score)

        # Dict-like with score
        if isinstance(result, Mapping) and "score" in result:
            return float(result["score"])
    except Exception:
        # Deliberate best-effort: any failure (row building, user eval,
        # score normalization) yields the minimum score.
        pass

    return 0.0
'''

    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}
60 changes: 60 additions & 0 deletions examples/openai_rft/example_rapidfuzz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.

We:
- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
- Wrap it in an @evaluation_test for normal eval usage
- Convert the grading function into a Python grader spec with
`build_python_grader_from_evaluation_test`
"""

from typing import Any

from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor


# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
# NOTE(review): ground_truth equals the *user* message while the assistant reply
# differs — presumably intentional so the fuzzy match scores below 1.0; confirm.
DEMO_ROWS = [
    EvaluationRow(
        messages=[
            Message(role="user", content="fuzzy wuzzy had no hair"),
            Message(role="assistant", content="fuzzy wuzzy was a bear"),
        ],
        ground_truth="fuzzy wuzzy had no hair",
    )
]


@evaluation_test(
    input_rows=[DEMO_ROWS],
    rollout_processor=NoOpRolloutProcessor(),
    aggregation_method="mean",
    mode="pointwise",
)
def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
    """
    Example @evaluation_test that scores a row using rapidfuzz.WRatio and
    attaches an EvaluateResult.
    """
    # EP convention: the row's ground_truth is compared against the content of
    # the most recent assistant message (empty string when there is none or
    # when its content is not a plain string).
    reference = row.ground_truth

    prediction = ""
    for message in row.messages:
        if message.role == "assistant":
            prediction = message.content if isinstance(message.content, str) else ""

    from rapidfuzz import fuzz, utils

    similarity = fuzz.WRatio(
        str(prediction),
        str(reference),
        processor=utils.default_process,
    )
    # WRatio is 0-100; normalize to the 0.0-1.0 range EP scores use.
    row.evaluation_result = EvaluateResult(score=float(similarity / 100.0))
    return row
32 changes: 32 additions & 0 deletions examples/openai_rft/test_openai_grader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
import requests

from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
from examples.openai_rft.example_rapidfuzz import rapidfuzz_eval


# Requires OPENAI_API_KEY in the environment; fails fast with KeyError if unset.
api_key = os.environ["OPENAI_API_KEY"]
headers = {"Authorization": f"Bearer {api_key}"}

# Build the {"type": "python", "source": "..."} grader spec from the EP eval.
grader = build_python_grader_from_evaluation_test(rapidfuzz_eval)  # {"type": "python", "source": "..."}

# Validate the grader server-side before attempting to run it.
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
    json={"grader": grader},
    headers=headers,
    timeout=60,  # avoid hanging indefinitely if the API stalls
)
print("validate response:", resp.text)

# Run the grader once with a dummy item/sample pair.
payload = {
    "grader": grader,
    "item": {"reference_answer": "fuzzy wuzzy had no hair"},
    "model_sample": "fuzzy wuzzy was a bear",
}
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/run",
    json=payload,
    headers=headers,
    timeout=60,  # same guard for the run endpoint
)
print("run response:", resp.text)
66 changes: 66 additions & 0 deletions tests/test_openai_rft_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import types
from typing import Any, Dict, Callable

from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
from eval_protocol.models import EvaluationRow


def _exec_and_get_grade(source: str) -> Callable[[Dict[str, Any], Dict[str, Any]], float]:
"""Execute generated grader source and return the grade(sample, item) function."""
ns: Dict[str, Any] = {}
exec(source, ns, ns)
grade_obj = ns.get("grade")
assert isinstance(grade_obj, types.FunctionType)
return grade_obj


def test_build_python_grader_from_plain_eval_function():
    """Plain eval-style function should be converted into a working grade(sample, item)."""

    # Simulate an eval-style function that carries type annotations.
    def my_eval(row: EvaluationRow, **kwargs: Any) -> float:
        # 1.0 iff ground_truth matches sample["output_text"], else 0.0.
        expected = getattr(row, "ground_truth", None)
        raw_sample = getattr(row, "sample", None) or {}
        return 1.0 if expected == raw_sample.get("output_text") else 0.0

    spec = build_python_grader_from_evaluation_test(my_eval)
    assert spec["type"] == "python"
    generated = spec["source"]

    # Structural sanity: annotations, original name, and decorators are gone,
    # and the renamed entry point is present.
    assert '"EvaluationRow"' not in generated
    assert "def _ep_eval" in generated
    assert "def my_eval" not in generated
    assert "@evaluation_test" not in generated

    grade = _exec_and_get_grade(generated)

    result = grade({"output_text": "42"}, {"reference_answer": "42"})
    assert isinstance(result, float)
    assert result == 1.0


def test_build_python_grader_from_wrapped_evaluation_test():
    """When the function is wrapped and carries _origin_func, we should use the origin."""

    def original_eval(row, **kwargs: Any) -> float:
        return 0.5

    def wrapper(*args: Any, **kwargs: Any) -> float:
        return original_eval(*args, **kwargs)

    # Simulate @evaluation_test attaching _origin_func to the wrapper.
    wrapper._origin_func = original_eval

    spec = build_python_grader_from_evaluation_test(wrapper)
    assert spec["type"] == "python"

    grade = _exec_and_get_grade(spec["source"])

    result = grade({"output_text": "anything"}, {"reference_answer": "anything"})
    assert isinstance(result, float)
    assert result == 0.5
Loading