|
| 1 | +""" |
| 2 | +Integration helpers between Eval Protocol evaluations and OpenAI RFT graders. |
| 3 | +
|
| 4 | +Currently provides: |
| 5 | +- build_python_grader_from_evaluation_test: turn an evaluation-style function into |
| 6 | + an OpenAI Python grader spec ({"type": "python", "source": ...}). |
| 7 | +""" |
| 8 | + |
| 9 | +import ast |
| 10 | +import inspect |
| 11 | +import textwrap |
| 12 | + |
| 13 | + |
def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec ({"type": "python", "source": ...})
    built from an Eval Protocol-style evaluation function.

    Assumptions:
    - `test_fn` is either:
        * the core evaluation function, or
        * an @evaluation_test-decorated function that carries `_origin_func`.
      Its effective signature looks like:

          def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow

      The first positional parameter receives the row; it does not have to be
      named `row`. Async evaluation functions are also supported: the generated
      grader runs the returned coroutine to completion with asyncio.run().

    - The function treats its first argument as an `EvaluationRow` and only
      relies on attributes we provide in the duck-typed stand-in:
        * row.ground_truth
        * row.messages
        * row.item (raw item dict)
        * row.sample (raw sample dict)

    - We map OpenAI's (sample, item) into that duck-typed `EvaluationRow` as follows:
        * item["reference_answer"] -> row.ground_truth
        * item["messages"] (if present) -> row.messages (normalized to Message-like objects)
        * sample["output_text"] -> appended as the last assistant message in row.messages
        * the original dicts are also available via row.item / row.sample

    - The function returns either:
        * a numeric score, or
        * an object/dict with a `score` field, or
        * an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.
      Any other shape grades as 0.0 (best-effort normalization).
    """

    # If the user passed an @evaluation_test wrapper, try to recover the original function
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Get the source of the original function, dedented so that functions
    # defined at non-zero indentation (methods, nested defs) still parse.
    src = textwrap.dedent(inspect.getsource(origin))

    # Parse into AST so we can safely strip decorators and type annotations.
    # Signature annotations are evaluated at def-time and typically reference
    # names (EvaluationRow, EvaluateResult, ...) that the sandboxed grader
    # source only defines *after* the eval function — so they must be removed,
    # not just ignored.
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
            # Drop all decorators (e.g., @evaluation_test)
            node.decorator_list = []
            # Remove return type annotation
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
            node.decorator_list = []
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # Remove all parameter annotations (e.g., row: EvaluationRow)
            node.annotation = None
            return node

    tree = _StripAnnotationsAndDecorators().visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first (possibly async) function definition and rename it so the
    # generated grade() helper can call it under a fixed name.
    func_node: ast.FunctionDef | ast.AsyncFunctionDef | None = None
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func_node = node
            break

    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")

    func_node.name = "_ep_eval"

    # Turn the modified AST back into source
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source
    helper = """
import asyncio
import inspect
from collections.abc import Mapping
from types import SimpleNamespace
from typing import Any, Dict


class EvaluationRow(SimpleNamespace):
    \"\"\"Minimal duck-typed stand-in for an evaluation row.

    Extend this with whatever attributes your eval logic uses.
    \"\"\"
    pass


class EvaluateResult(SimpleNamespace):
    \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.

    This lets evaluation-style functions that construct EvaluateResult(score=...)
    run inside the Python grader sandbox without importing eval_protocol.
    \"\"\"

    def __init__(self, score: float, **kwargs: Any) -> None:
        super().__init__(score=score, **kwargs)


class Message(SimpleNamespace):
    \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
    pass


def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
    # Start from any item-provided messages (EP-style), defaulting to [].
    raw_messages = item.get("messages") or []
    normalized_messages = []
    for m in raw_messages:
        if isinstance(m, dict):
            normalized_messages.append(
                Message(
                    role=m.get("role"),
                    content=m.get("content"),
                )
            )
        else:
            # Already Message-like; rely on duck typing (must have role/content)
            normalized_messages.append(m)

    reference = item.get("reference_answer")
    prediction = sample.get("output_text")

    # EP-style: ensure the model prediction is present as the last assistant message
    if prediction is not None:
        normalized_messages = list(normalized_messages)  # shallow copy
        normalized_messages.append(Message(role="assistant", content=prediction))

    return EvaluationRow(
        ground_truth=reference,
        messages=normalized_messages,
        item=item,
        sample=sample,
    )


def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
    row = _build_row(sample, item)
    # Call positionally: the eval function's first parameter receives the row
    # regardless of what that parameter is named.
    result = _ep_eval(row)
    # Async evaluation functions hand back a coroutine; run it to completion.
    if inspect.iscoroutine(result):
        result = asyncio.run(result)

    # Try to normalize different result shapes into a float score
    try:
        if isinstance(result, (int, float)):
            return float(result)

        # EvaluateResult-like object with .score
        if hasattr(result, "score"):
            return float(result.score)

        # EvaluationRow-like object with .evaluation_result.score
        eval_res = getattr(result, "evaluation_result", None)
        if eval_res is not None:
            if isinstance(eval_res, Mapping):
                if "score" in eval_res:
                    return float(eval_res["score"])
            elif hasattr(eval_res, "score"):
                return float(eval_res.score)

        # Dict-like with score
        if isinstance(result, Mapping) and "score" in result:
            return float(result["score"])
    except Exception:
        # Best-effort normalization: any unexpected shape grades as 0.0.
        pass

    return 0.0
"""

    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}
0 commit comments