Skip to content

Commit 2dccccf

Browse files
committed
adding in the openai integration
1 parent f409213 commit 2dccccf

File tree

5 files changed

+413
-0
lines changed

5 files changed

+413
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
OpenAI EP Evaluation Adapter
2+
==============================================
3+
4+
To see an end-to-end example of:
5+
- taking an `@evaluation_test` (`rapidfuzz_eval`),
6+
- converting it into a `{"type": "python", "source": ...}` grader spec with
7+
`build_python_grader_from_evaluation_test`, and
8+
- validating and running it against the OpenAI `/graders/*` HTTP APIs,
9+
run:
10+
11+
```bash
12+
pytest eval_protocol/integrations/openai_rft/example_rapidfuzz.py -vs # To show that this works as an EP evaluation_test
13+
14+
python eval_protocol/integrations/openai_rft/test_openai_grader.py
15+
```
16+
17+
You can expect an output like:
18+
19+
```text
20+
(.venv) (base) derekxu@Mac-4147 python-sdk % python eval_protocol/integrations/openai_rft/test_openai_grader.py
21+
validate response: {
22+
"grader": {
23+
"type": "python",
24+
"source": "def _ep_eval(row, **kwargs):\n \"\"\"\n Example @evaluation_test that scores a row using rapidfuzz.WRatio and\n attaches an EvaluateResult.\n \"\"\"\n reference = row.ground_truth\n assistant_msgs = [m for m in row.messages if m.role == 'assistant']\n last_assistant_content = assistant_msgs[-1].content if assistant_msgs else ''\n prediction = last_assistant_content if isinstance(last_assistant_content, str) else ''\n from rapidfuzz import fuzz, utils\n score = float(fuzz.WRatio(str(prediction), str(reference), processor=utils.default_process) / 100.0)\n row.evaluation_result = EvaluateResult(score=score)\n return row\n\n\nfrom typing import Any, Dict\nfrom types import SimpleNamespace\n\n\nclass EvaluationRow(SimpleNamespace):\n \"\"\"Minimal duck-typed stand-in for an evaluation row.\n\n Extend this with whatever attributes your eval logic uses.\n \"\"\"\n pass\n\n\nclass EvaluateResult(SimpleNamespace):\n \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.\n\n This lets evaluation-style functions that construct EvaluateResult(score=...)\n run inside the Python grader sandbox without importing eval_protocol.\n \"\"\"\n\n def __init__(self, score: float, **kwargs: Any) -> None:\n super().__init__(score=score, **kwargs)\n\n\nclass Message(SimpleNamespace):\n \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"\n pass\n\n\ndef _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:\n # Start from any item-provided messages (EP-style), defaulting to [].\n raw_messages = item.get(\"messages\") or []\n normalized_messages = []\n for m in raw_messages:\n if isinstance(m, dict):\n normalized_messages.append(\n Message(\n role=m.get(\"role\"),\n content=m.get(\"content\"),\n )\n )\n else:\n # Already Message-like; rely on duck typing (must have role/content)\n normalized_messages.append(m)\n\n reference = item.get(\"reference_answer\")\n prediction = sample.get(\"output_text\")\n\n # EP-style: ensure the model 
prediction is present as the last assistant message\n if prediction is not None:\n normalized_messages = list(normalized_messages) # shallow copy\n normalized_messages.append(Message(role=\"assistant\", content=prediction))\n\n return EvaluationRow(\n ground_truth=reference,\n messages=normalized_messages,\n item=item,\n sample=sample,\n )\n\n\ndef grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:\n row = _build_row(sample, item)\n result = _ep_eval(row=row)\n\n # Try to normalize different result shapes into a float score\n try:\n from collections.abc import Mapping\n\n if isinstance(result, (int, float)):\n return float(result)\n\n # EvaluateResult-like object with .score\n if hasattr(result, \"score\"):\n return float(result.score)\n\n # EvaluationRow-like object with .evaluation_result.score\n eval_res = getattr(result, \"evaluation_result\", None)\n if eval_res is not None:\n if isinstance(eval_res, Mapping):\n if \"score\" in eval_res:\n return float(eval_res[\"score\"])\n elif hasattr(eval_res, \"score\"):\n return float(eval_res.score)\n\n # Dict-like with score\n if isinstance(result, Mapping) and \"score\" in result:\n return float(result[\"score\"])\n except Exception:\n pass\n\n return 0.0\n",
25+
"name": "grader-VasDqHrerHW5"
26+
}
27+
}
28+
run response: {
29+
"reward": 0.7555555555555555,
30+
"metadata": {
31+
"name": "grader-Bbe0lDBJVP9C",
32+
"type": "python",
33+
"errors": {
34+
"formula_parse_error": false,
35+
"sample_parse_error": false,
36+
"sample_parse_error_details": null,
37+
"truncated_observation_error": false,
38+
"unresponsive_reward_error": false,
39+
"invalid_variable_error": false,
40+
"invalid_variable_error_details": null,
41+
"other_error": false,
42+
"python_grader_server_error": false,
43+
"python_grader_server_error_type": null,
44+
"python_grader_runtime_error": false,
45+
"python_grader_runtime_error_details": null,
46+
"model_grader_server_error": false,
47+
"model_grader_refusal_error": false,
48+
"model_grader_refusal_error_details": null,
49+
"model_grader_parse_error": false,
50+
"model_grader_parse_error_details": null,
51+
"model_grader_exceeded_max_tokens_error": false,
52+
"model_grader_server_error_details": null,
53+
"endpoint_grader_internal_error": false,
54+
"endpoint_grader_internal_error_details": null,
55+
"endpoint_grader_server_error": false,
56+
"endpoint_grader_server_error_details": null,
57+
"endpoint_grader_safety_check_error": false
58+
},
59+
"execution_time": 4.79397988319397,
60+
"scores": {},
61+
"token_usage": null,
62+
"sampled_model_name": null
63+
},
64+
"sub_rewards": {},
65+
"model_grader_token_usage_per_model": {}
66+
}
67+
(.venv) (base) derekxu@Mac-4147 python-sdk %
68+
```
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.
3+
4+
Currently provides:
5+
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
6+
an OpenAI Python grader spec ({"type": "python", "source": ...}).
7+
"""
8+
9+
import ast
10+
import inspect
11+
import textwrap
12+
13+
14+
def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.

    The returned dict has the shape ``{"type": "python", "source": <str>}``.  The
    ``source`` string bundles the evaluation function (decorators and type
    annotations stripped, renamed to ``_ep_eval``) together with duck-typed helper
    shims and a top-level ``grade(sample, item) -> float`` entry point, which is
    what OpenAI's Python grader sandbox invokes.

    Assumptions:
    - `test_fn` is the *core* evaluation function (not the @evaluation_test wrapper),
      or an @evaluation_test-decorated function that carries _origin_func.
      It should have a signature like:

          def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow

    - The function only relies on attributes that the helper `EvaluationRow`
      stand-in provides (extend that class as needed).

    - We map OpenAI's (sample, item) to a duck-typed `row`:
      - item["reference_answer"] -> row.ground_truth
      - sample["output_text"] -> appended as an assistant message
      - raw dicts available as row.item / row.sample

    - The function returns either:
      - a numeric score, or
      - an object/dict with a `score` field, or
      - an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.

    Raises:
        ValueError: if no function definition is found in the recovered source,
            or if the evaluation function is ``async`` — the sandbox ``grade()``
            entry point is synchronous, so calling an async ``_ep_eval`` would
            yield a never-awaited coroutine and silently score 0.0.
    """

    # If the user passed an @evaluation_test wrapper, try to recover the original function
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Get the source of the original function; dedent so that functions defined
    # inside classes or other functions still parse at module level.
    src = inspect.getsource(origin)
    src = textwrap.dedent(src)

    # Parse into AST so we can safely strip decorators and type annotations
    # (the grader sandbox cannot import eval_protocol's types).
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
            # Drop all decorators (e.g., @evaluation_test)
            node.decorator_list = []
            # Remove return type annotation
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
            node.decorator_list = []
            node.returns = None
            self.generic_visit(node)
            return node

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # Remove all parameter annotations (e.g., row: EvaluationRow)
            node.annotation = None
            return node

        def visit_AnnAssign(self, node: ast.AnnAssign) -> ast.AST:
            # Rewrite annotated assignments (e.g., `x: Foo = ...`) as plain
            # assignments so the emitted source never mentions types that do
            # not exist in the sandbox.  Bare annotations (`x: Foo` with no
            # value) are kept as-is: local annotations are never evaluated at
            # runtime, so they are harmless.
            self.generic_visit(node)
            if node.value is not None:
                return ast.Assign(targets=[node.target], value=node.value)
            return node

    transformer = _StripAnnotationsAndDecorators()
    tree = transformer.visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first function definition and rename it to _ep_eval
    func_node: ast.AST | None = None
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func_node = node
            break

    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")

    if isinstance(func_node, ast.AsyncFunctionDef):
        # Fail loudly at conversion time rather than letting the sandbox call
        # an async function and silently grade every sample as 0.0.
        raise ValueError(
            "Async evaluation functions are not supported: the Python grader "
            "sandbox's grade() entry point is synchronous. Please provide a "
            "synchronous evaluation function."
        )

    func_node.name = "_ep_eval"

    # Turn the modified AST back into source
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source
    helper = """
from typing import Any, Dict
from types import SimpleNamespace


class EvaluationRow(SimpleNamespace):
    \"\"\"Minimal duck-typed stand-in for an evaluation row.

    Extend this with whatever attributes your eval logic uses.
    \"\"\"
    pass


class EvaluateResult(SimpleNamespace):
    \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.

    This lets evaluation-style functions that construct EvaluateResult(score=...)
    run inside the Python grader sandbox without importing eval_protocol.
    \"\"\"

    def __init__(self, score: float, **kwargs: Any) -> None:
        super().__init__(score=score, **kwargs)


class Message(SimpleNamespace):
    \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
    pass


def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
    # Start from any item-provided messages (EP-style), defaulting to [].
    raw_messages = item.get("messages") or []
    normalized_messages = []
    for m in raw_messages:
        if isinstance(m, dict):
            normalized_messages.append(
                Message(
                    role=m.get("role"),
                    content=m.get("content"),
                )
            )
        else:
            # Already Message-like; rely on duck typing (must have role/content)
            normalized_messages.append(m)

    reference = item.get("reference_answer")
    prediction = sample.get("output_text")

    # EP-style: ensure the model prediction is present as the last assistant message
    if prediction is not None:
        normalized_messages = list(normalized_messages)  # shallow copy
        normalized_messages.append(Message(role="assistant", content=prediction))

    return EvaluationRow(
        ground_truth=reference,
        messages=normalized_messages,
        item=item,
        sample=sample,
    )


def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
    row = _build_row(sample, item)
    result = _ep_eval(row=row)

    # Try to normalize different result shapes into a float score
    try:
        from collections.abc import Mapping

        if isinstance(result, (int, float)):
            return float(result)

        # EvaluateResult-like object with .score
        if hasattr(result, "score"):
            return float(result.score)

        # EvaluationRow-like object with .evaluation_result.score
        eval_res = getattr(result, "evaluation_result", None)
        if eval_res is not None:
            if isinstance(eval_res, Mapping):
                if "score" in eval_res:
                    return float(eval_res["score"])
            elif hasattr(eval_res, "score"):
                return float(eval_res.score)

        # Dict-like with score
        if isinstance(result, Mapping) and "score" in result:
            return float(result["score"])
    except Exception:
        pass

    return 0.0
"""

    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
2+
Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.
3+
4+
We:
5+
- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
6+
- Wrap it in an @evaluation_test for normal eval usage
7+
- Convert the grading function into a Python grader spec with
8+
`build_python_grader_from_evaluation_test`
9+
"""
10+
11+
from typing import Any
12+
13+
from eval_protocol.integrations.openai_rft.adapter import build_python_grader_from_evaluation_test
14+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
15+
from eval_protocol.pytest import evaluation_test
16+
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
17+
18+
19+
# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
# A single row; rapidfuzz_eval below compares the last assistant message
# against ground_truth.
DEMO_ROWS = [
    EvaluationRow(
        messages=[
            Message(role="user", content="fuzzy wuzzy had no hair"),
            Message(role="assistant", content="fuzzy wuzzy was a bear"),
        ],
        ground_truth="fuzzy wuzzy had no hair",
    )
]
29+
30+
31+
@evaluation_test(
    input_rows=[DEMO_ROWS],
    rollout_processor=NoOpRolloutProcessor(),
    aggregation_method="mean",
    mode="pointwise",
)
def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
    """
    Example @evaluation_test that fuzzy-matches the row's most recent assistant
    message against its ground_truth using rapidfuzz.WRatio (normalized to
    [0, 1]) and attaches the score as an EvaluateResult.
    """
    from rapidfuzz import fuzz, utils

    expected = row.ground_truth

    # Walk the messages from the end to find the latest assistant turn; a
    # missing turn or non-string content degrades to the empty string.
    predicted = ""
    for message in reversed(row.messages):
        if message.role == "assistant":
            if isinstance(message.content, str):
                predicted = message.content
            break

    similarity = fuzz.WRatio(
        str(predicted),
        str(expected),
        processor=utils.default_process,
    )
    row.evaluation_result = EvaluateResult(score=float(similarity / 100.0))
    return row
61+
62+
63+
# Grader spec ({"type": "python", "source": ...}) derived from the evaluation
# above; suitable for POSTing to the OpenAI graders validate/run endpoints.
RAPIDFUZZ_PYTHON_GRADER_SPEC: dict = build_python_grader_from_evaluation_test(rapidfuzz_eval)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import requests
3+
4+
from eval_protocol.integrations.openai_rft.adapter import build_python_grader_from_evaluation_test
5+
from eval_protocol.integrations.openai_rft.example_rapidfuzz import rapidfuzz_eval
6+
7+
8+
api_key = os.environ["OPENAI_API_KEY"]
9+
headers = {"Authorization": f"Bearer {api_key}"}
10+
11+
grader = build_python_grader_from_evaluation_test(rapidfuzz_eval) # {"type": "python", "source": "..."}
12+
13+
# validate the grader
14+
resp = requests.post(
15+
"https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
16+
json={"grader": grader},
17+
headers=headers,
18+
)
19+
print("validate response:", resp.text)
20+
21+
# run the grader once with a dummy item/sample
22+
payload = {
23+
"grader": grader,
24+
"item": {"reference_answer": "fuzzy wuzzy had no hair"},
25+
"model_sample": "fuzzy wuzzy was a bear",
26+
}
27+
resp = requests.post(
28+
"https://api.openai.com/v1/fine_tuning/alpha/graders/run",
29+
json=payload,
30+
headers=headers,
31+
)
32+
print("run response:", resp.text)

0 commit comments

Comments
 (0)