diff --git a/eval_protocol/integrations/__init__.py b/eval_protocol/integrations/__init__.py
index f85283cf..f49b5bba 100644
--- a/eval_protocol/integrations/__init__.py
+++ b/eval_protocol/integrations/__init__.py
@@ -2,8 +2,6 @@
 from .openeval import adapt
 from .trl import create_trl_adapter
+from .openai_rft import build_python_grader_from_evaluation_test
-__all__ = [
-    "adapt",
-    "create_trl_adapter",
-]
+__all__ = ["adapt", "create_trl_adapter", "build_python_grader_from_evaluation_test"]
diff --git a/eval_protocol/integrations/openai_rft.py b/eval_protocol/integrations/openai_rft.py
new file mode 100644
index 00000000..9acfa40b
--- /dev/null
+++ b/eval_protocol/integrations/openai_rft.py
@@ -0,0 +1,190 @@
+"""
+Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.
+
+Currently provides:
+- build_python_grader_from_evaluation_test: turn an evaluation-style function into
+  an OpenAI Python grader spec ({"type": "python", "source": ...}).
+"""
+
+import ast
+import inspect
+import textwrap
+
+
+def build_python_grader_from_evaluation_test(test_fn) -> dict:
+    """
+    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.
+
+    Assumptions:
+    - `test_fn` is either:
+      * the core evaluation function, or
+      * an @evaluation_test-decorated function that carries `_origin_func`.
+      Its effective signature looks like:
+
+          def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow
+
+    - The function treats `row` as an `EvaluationRow` and only relies on attributes
+      we provide in the duck-typed stand-in:
+        * row.ground_truth
+        * row.messages
+        * row.item (raw item dict)
+        * row.sample (raw sample dict)
+
+    - We map OpenAI's (sample, item) into that duck-typed `EvaluationRow` as follows:
+        * item["reference_answer"] -> row.ground_truth
+        * item["messages"] (if present) -> row.messages (normalized to Message-like objects)
+        * sample["output_text"] -> appended as the last assistant message in row.messages
+        * the original dicts are also available via row.item / row.sample
+
+    - The function returns either:
+        * a numeric score, or
+        * an object/dict with a `score` field, or
+        * an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.
+    """
+
+    # If the user passed an @evaluation_test wrapper, try to recover the original function
+    origin = getattr(test_fn, "_origin_func", test_fn)
+
+    # Get the source of the original function
+    src = inspect.getsource(origin)
+    src = textwrap.dedent(src)
+
+    # Parse into AST so we can safely strip decorators and type annotations
+    tree = ast.parse(src)
+
+    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
+        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
+            # Drop all decorators (e.g., @evaluation_test)
+            node.decorator_list = []
+            # Remove return type annotation
+            node.returns = None
+            self.generic_visit(node)
+            return node
+
+        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
+            node.decorator_list = []
+            node.returns = None
+            self.generic_visit(node)
+            return node
+
+        def visit_arg(self, node: ast.arg) -> ast.AST:
+            # Remove all parameter annotations (e.g., row: EvaluationRow)
+            node.annotation = None
+            return node
+
+    transformer = _StripAnnotationsAndDecorators()
+    tree = transformer.visit(tree)
+    ast.fix_missing_locations(tree)
+
+    # Find the first function definition and rename it to _ep_eval
+    func_node: ast.AST | None = None
+    for node in tree.body:
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            func_node = node
+            break
+
+    if func_node is None:
+        raise ValueError("Expected a function definition in test_fn source.")
+
+    func_node.name = "_ep_eval"
+
+    # Turn the modified AST back into source
+    src = ast.unparse(tree)
+
+    # Helper code that will live *inside* the grader source
+    helper = """
+from typing import Any, Dict
+from types import SimpleNamespace
+
+
+class EvaluationRow(SimpleNamespace):
+    \"\"\"Minimal duck-typed stand-in for an evaluation row.
+
+    Extend this with whatever attributes your eval logic uses.
+    \"\"\"
+    pass
+
+
+class EvaluateResult(SimpleNamespace):
+    \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.
+
+    This lets evaluation-style functions that construct EvaluateResult(score=...)
+    run inside the Python grader sandbox without importing eval_protocol.
+    \"\"\"
+
+    def __init__(self, score: float, **kwargs: Any) -> None:
+        super().__init__(score=score, **kwargs)
+
+
+class Message(SimpleNamespace):
+    \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
+    pass
+
+
+def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
+    # Start from any item-provided messages (EP-style), defaulting to [].
+    raw_messages = item.get("messages") or []
+    normalized_messages = []
+    for m in raw_messages:
+        if isinstance(m, dict):
+            normalized_messages.append(
+                Message(
+                    role=m.get("role"),
+                    content=m.get("content"),
+                )
+            )
+        else:
+            # Already Message-like; rely on duck typing (must have role/content)
+            normalized_messages.append(m)
+
+    reference = item.get("reference_answer")
+    prediction = sample.get("output_text")
+
+    # EP-style: ensure the model prediction is present as the last assistant message
+    if prediction is not None:
+        normalized_messages = list(normalized_messages)  # shallow copy
+        normalized_messages.append(Message(role="assistant", content=prediction))
+
+    return EvaluationRow(
+        ground_truth=reference,
+        messages=normalized_messages,
+        item=item,
+        sample=sample,
+    )
+
+
+def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
+    row = _build_row(sample, item)
+    result = _ep_eval(row=row)
+
+    # Try to normalize different result shapes into a float score
+    try:
+        from collections.abc import Mapping
+
+        if isinstance(result, (int, float)):
+            return float(result)
+
+        # EvaluateResult-like object with .score
+        if hasattr(result, "score"):
+            return float(result.score)
+
+        # EvaluationRow-like object with .evaluation_result.score
+        eval_res = getattr(result, "evaluation_result", None)
+        if eval_res is not None:
+            if isinstance(eval_res, Mapping):
+                if "score" in eval_res:
+                    return float(eval_res["score"])
+            elif hasattr(eval_res, "score"):
+                return float(eval_res.score)
+
+        # Dict-like with score
+        if isinstance(result, Mapping) and "score" in result:
+            return float(result["score"])
+    except Exception:
+        pass
+
+    return 0.0
+"""
+
+    full_source = src + "\n\n" + textwrap.dedent(helper)
+    return {"type": "python", "source": full_source}
diff --git a/examples/openai_rft/example_rapidfuzz.py b/examples/openai_rft/example_rapidfuzz.py
new file mode 100644
index 00000000..7ddf01ef
--- /dev/null
+++ b/examples/openai_rft/example_rapidfuzz.py
@@ -0,0 +1,60 @@
+"""
+Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.
+
+We:
+- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
+- Wrap it in an @evaluation_test for normal eval usage
+- Convert the grading function into a Python grader spec with
+  `build_python_grader_from_evaluation_test`
+"""
+
+from typing import Any
+
+from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
+
+
+# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
+DEMO_ROWS = [
+    EvaluationRow(
+        messages=[
+            Message(role="user", content="fuzzy wuzzy had no hair"),
+            Message(role="assistant", content="fuzzy wuzzy was a bear"),
+        ],
+        ground_truth="fuzzy wuzzy had no hair",
+    )
+]
+
+
+@evaluation_test(
+    input_rows=[DEMO_ROWS],
+    rollout_processor=NoOpRolloutProcessor(),
+    aggregation_method="mean",
+    mode="pointwise",
+)
+def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
+    """
+    Example @evaluation_test that scores a row using rapidfuzz.WRatio and
+    attaches an EvaluateResult.
+    """
+    # For EP evals, we compare the EvaluationRow's ground_truth to the last assistant message.
+    reference = row.ground_truth
+
+    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
+    last_assistant_content = assistant_msgs[-1].content if assistant_msgs else ""
+    prediction = last_assistant_content if isinstance(last_assistant_content, str) else ""
+
+    from rapidfuzz import fuzz, utils
+
+    score = float(
+        fuzz.WRatio(
+            str(prediction),
+            str(reference),
+            processor=utils.default_process,
+        )
+        / 100.0
+    )
+    row.evaluation_result = EvaluateResult(score=score)
+    return row
diff --git a/examples/openai_rft/test_openai_grader.py b/examples/openai_rft/test_openai_grader.py
new file mode 100644
index 00000000..8cf97dde
--- /dev/null
+++ b/examples/openai_rft/test_openai_grader.py
@@ -0,0 +1,32 @@
+import os
+import requests
+
+from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
+from examples.openai_rft.example_rapidfuzz import rapidfuzz_eval
+
+
+api_key = os.environ["OPENAI_API_KEY"]
+headers = {"Authorization": f"Bearer {api_key}"}
+
+grader = build_python_grader_from_evaluation_test(rapidfuzz_eval)  # {"type": "python", "source": "..."}
+
+# validate the grader
+resp = requests.post(
+    "https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
+    json={"grader": grader},
+    headers=headers,
+)
+print("validate response:", resp.text)
+
+# run the grader once with a dummy item/sample
+payload = {
+    "grader": grader,
+    "item": {"reference_answer": "fuzzy wuzzy had no hair"},
+    "model_sample": "fuzzy wuzzy was a bear",
+}
+resp = requests.post(
+    "https://api.openai.com/v1/fine_tuning/alpha/graders/run",
+    json=payload,
+    headers=headers,
+)
+print("run response:", resp.text)
diff --git a/tests/test_openai_rft_integration.py b/tests/test_openai_rft_integration.py
new file mode 100644
index 00000000..80b159cc
--- /dev/null
+++ b/tests/test_openai_rft_integration.py
@@ -0,0 +1,66 @@
+import types
+from typing import Any, Dict, Callable
+
+from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
+from eval_protocol.models import EvaluationRow
+
+
+def _exec_and_get_grade(source: str) -> Callable[[Dict[str, Any], Dict[str, Any]], float]:
+    """Execute generated grader source and return the grade(sample, item) function."""
+    ns: Dict[str, Any] = {}
+    exec(source, ns, ns)
+    grade_obj = ns.get("grade")
+    assert isinstance(grade_obj, types.FunctionType)
+    return grade_obj
+
+
+def test_build_python_grader_from_plain_eval_function():
+    """Plain eval-style function should be converted into a working grade(sample, item)."""
+
+    # Simulate an eval-style function with annotations
+    def my_eval(row: EvaluationRow, **kwargs: Any) -> float:
+        # Simple correctness check: 1.0 if ground_truth matches sample["output_text"], else 0.0
+        ground_truth = getattr(row, "ground_truth", None)
+        sample = getattr(row, "sample", None) or {}
+        pred = sample.get("output_text")
+        return 1.0 if ground_truth == pred else 0.0
+
+    grader_spec = build_python_grader_from_evaluation_test(my_eval)
+    assert grader_spec["type"] == "python"
+    source = grader_spec["source"]
+
+    # Basic structural sanity checks on the generated source
+    assert '"EvaluationRow"' not in source
+    assert "def _ep_eval" in source
+    assert "def my_eval" not in source
+    assert "@evaluation_test" not in source
+
+    grade = _exec_and_get_grade(source)
+
+    sample = {"output_text": "42"}
+    item = {"reference_answer": "42"}
+    score = grade(sample, item)
+    assert isinstance(score, float)
+    assert score == 1.0
+
+
+def test_build_python_grader_from_wrapped_evaluation_test():
+    """When the function is wrapped and carries _origin_func, we should use the origin."""
+
+    def original_eval(row, **kwargs: Any) -> float:
+        return 0.5
+
+    def wrapper(*args: Any, **kwargs: Any) -> float:
+        return original_eval(*args, **kwargs)
+
+    # Simulate @evaluation_test attaching _origin_func
+    setattr(wrapper, "_origin_func", original_eval)
+
+    grader_spec = build_python_grader_from_evaluation_test(wrapper)
+    assert grader_spec["type"] == "python"
+    source = grader_spec["source"]
+
+    grade = _exec_and_get_grade(source)
+    score = grade({"output_text": "anything"}, {"reference_answer": "anything"})
+    assert isinstance(score, float)
+    assert score == 0.5