Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions eval_protocol/integrations/openai_rft/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
OpenAI EP Evaluation Adapter
==============================================

To see an end-to-end example of:
- taking an `@evaluation_test` (`rapidfuzz_eval`),
- converting it into a `{"type": "python", "source": ...}` grader spec with
`build_python_grader_from_evaluation_test`, and
- validating and running it against the OpenAI `/graders/*` HTTP APIs,
run:

```
# Run the evaluation as an ordinary EP evaluation_test:
pytest eval_protocol/integrations/openai_rft/example_rapidfuzz.py -vs

# Convert it to a Python grader and validate/run it against the OpenAI graders API:
python eval_protocol/integrations/openai_rft/test_openai_grader.py
```

You can expect an output like:

```
(.venv) (base) derekxu@Mac-4147 python-sdk % python eval_protocol/integrations/openai_rft/test_openai_grader.py
validate response: {
"grader": {
"type": "python",
"source": "def _ep_eval(row, **kwargs):\n \"\"\"\n Example @evaluation_test that scores a row using rapidfuzz.WRatio and\n attaches an EvaluateResult.\n \"\"\"\n reference = row.ground_truth\n assistant_msgs = [m for m in row.messages if m.role == 'assistant']\n last_assistant_content = assistant_msgs[-1].content if assistant_msgs else ''\n prediction = last_assistant_content if isinstance(last_assistant_content, str) else ''\n from rapidfuzz import fuzz, utils\n score = float(fuzz.WRatio(str(prediction), str(reference), processor=utils.default_process) / 100.0)\n row.evaluation_result = EvaluateResult(score=score)\n return row\n\n\nfrom typing import Any, Dict\nfrom types import SimpleNamespace\n\n\nclass EvaluationRow(SimpleNamespace):\n \"\"\"Minimal duck-typed stand-in for an evaluation row.\n\n Extend this with whatever attributes your eval logic uses.\n \"\"\"\n pass\n\n\nclass EvaluateResult(SimpleNamespace):\n \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.\n\n This lets evaluation-style functions that construct EvaluateResult(score=...)\n run inside the Python grader sandbox without importing eval_protocol.\n \"\"\"\n\n def __init__(self, score: float, **kwargs: Any) -> None:\n super().__init__(score=score, **kwargs)\n\n\nclass Message(SimpleNamespace):\n \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"\n pass\n\n\ndef _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:\n # Start from any item-provided messages (EP-style), defaulting to [].\n raw_messages = item.get(\"messages\") or []\n normalized_messages = []\n for m in raw_messages:\n if isinstance(m, dict):\n normalized_messages.append(\n Message(\n role=m.get(\"role\"),\n content=m.get(\"content\"),\n )\n )\n else:\n # Already Message-like; rely on duck typing (must have role/content)\n normalized_messages.append(m)\n\n reference = item.get(\"reference_answer\")\n prediction = sample.get(\"output_text\")\n\n # EP-style: ensure the model 
prediction is present as the last assistant message\n if prediction is not None:\n normalized_messages = list(normalized_messages) # shallow copy\n normalized_messages.append(Message(role=\"assistant\", content=prediction))\n\n return EvaluationRow(\n ground_truth=reference,\n messages=normalized_messages,\n item=item,\n sample=sample,\n )\n\n\ndef grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:\n row = _build_row(sample, item)\n result = _ep_eval(row=row)\n\n # Try to normalize different result shapes into a float score\n try:\n from collections.abc import Mapping\n\n if isinstance(result, (int, float)):\n return float(result)\n\n # EvaluateResult-like object with .score\n if hasattr(result, \"score\"):\n return float(result.score)\n\n # EvaluationRow-like object with .evaluation_result.score\n eval_res = getattr(result, \"evaluation_result\", None)\n if eval_res is not None:\n if isinstance(eval_res, Mapping):\n if \"score\" in eval_res:\n return float(eval_res[\"score\"])\n elif hasattr(eval_res, \"score\"):\n return float(eval_res.score)\n\n # Dict-like with score\n if isinstance(result, Mapping) and \"score\" in result:\n return float(result[\"score\"])\n except Exception:\n pass\n\n return 0.0\n",
"name": "grader-VasDqHrerHW5"
}
}
run response: {
"reward": 0.7555555555555555,
"metadata": {
"name": "grader-Bbe0lDBJVP9C",
"type": "python",
"errors": {
"formula_parse_error": false,
"sample_parse_error": false,
"sample_parse_error_details": null,
"truncated_observation_error": false,
"unresponsive_reward_error": false,
"invalid_variable_error": false,
"invalid_variable_error_details": null,
"other_error": false,
"python_grader_server_error": false,
"python_grader_server_error_type": null,
"python_grader_runtime_error": false,
"python_grader_runtime_error_details": null,
"model_grader_server_error": false,
"model_grader_refusal_error": false,
"model_grader_refusal_error_details": null,
"model_grader_parse_error": false,
"model_grader_parse_error_details": null,
"model_grader_exceeded_max_tokens_error": false,
"model_grader_server_error_details": null,
"endpoint_grader_internal_error": false,
"endpoint_grader_internal_error_details": null,
"endpoint_grader_server_error": false,
"endpoint_grader_server_error_details": null,
"endpoint_grader_safety_check_error": false
},
"execution_time": 4.79397988319397,
"scores": {},
"token_usage": null,
"sampled_model_name": null
},
"sub_rewards": {},
"model_grader_token_usage_per_model": {}
}
(.venv) (base) derekxu@Mac-4147 python-sdk %
```
184 changes: 184 additions & 0 deletions eval_protocol/integrations/openai_rft/adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.

Currently provides:
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
an OpenAI Python grader spec ({"type": "python", "source": ...}).
"""

import ast
import inspect
import textwrap


def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.

    Assumptions:
    - `test_fn` is the *core* evaluation function (not the @evaluation_test wrapper),
      or an @evaluation_test-decorated function that carries _origin_func.
      It should have a signature like:

          def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow

    - The function only relies on attributes provided by the duck-typed
      `EvaluationRow` stand-in embedded in the grader source (extend it as needed).

    - We map OpenAI's (sample, item) to a duck-typed `row`:
        - item["reference_answer"] -> row.ground_truth
        - sample["output_text"] -> appended as an assistant message
        - raw dicts available as row.item / row.sample

    - The function returns either:
        - a numeric score, or
        - an object/dict with a `score` field, or
        - an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.

    Raises:
        ValueError: if no function definition is found in `test_fn`'s source.
    """

    # If the user passed an @evaluation_test wrapper, try to recover the original function
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Get the source of the original function; dedent because the function may
    # be defined at a nested (indented) scope.
    src = textwrap.dedent(inspect.getsource(origin))

    # Parse into AST so we can safely strip decorators and type annotations.
    # The grader sandbox cannot import eval_protocol, so the decorator and any
    # annotations referencing EP types must not survive into the grader source.
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        """Drop decorators, return annotations, and parameter annotations."""

        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
            # Drop all decorators (e.g., @evaluation_test)
            node.decorator_list = []
            # Remove return type annotation (e.g., -> EvaluationRow)
            node.returns = None
            self.generic_visit(node)
            return node

        # Async defs need the identical treatment; reuse the same visitor
        # instead of duplicating it.
        visit_AsyncFunctionDef = visit_FunctionDef

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # Remove all parameter annotations (e.g., row: EvaluationRow)
            node.annotation = None
            return node

    tree = _StripAnnotationsAndDecorators().visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first function definition and rename it to _ep_eval, the fixed
    # entry point the embedded grade() helper calls.
    func_node: ast.FunctionDef | ast.AsyncFunctionDef | None = next(
        (n for n in tree.body if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))),
        None,
    )

    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")

    func_node.name = "_ep_eval"

    # Turn the modified AST back into source
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source
    helper = """
from typing import Any, Dict
from types import SimpleNamespace


class EvaluationRow(SimpleNamespace):
    \"\"\"Minimal duck-typed stand-in for an evaluation row.

    Extend this with whatever attributes your eval logic uses.
    \"\"\"
    pass


class EvaluateResult(SimpleNamespace):
    \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.

    This lets evaluation-style functions that construct EvaluateResult(score=...)
    run inside the Python grader sandbox without importing eval_protocol.
    \"\"\"

    def __init__(self, score: float, **kwargs: Any) -> None:
        super().__init__(score=score, **kwargs)


class Message(SimpleNamespace):
    \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
    pass


def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
    # Start from any item-provided messages (EP-style), defaulting to [].
    raw_messages = item.get("messages") or []
    normalized_messages = []
    for m in raw_messages:
        if isinstance(m, dict):
            normalized_messages.append(
                Message(
                    role=m.get("role"),
                    content=m.get("content"),
                )
            )
        else:
            # Already Message-like; rely on duck typing (must have role/content)
            normalized_messages.append(m)

    reference = item.get("reference_answer")
    prediction = sample.get("output_text")

    # EP-style: ensure the model prediction is present as the last assistant message
    if prediction is not None:
        normalized_messages = list(normalized_messages)  # shallow copy
        normalized_messages.append(Message(role="assistant", content=prediction))

    return EvaluationRow(
        ground_truth=reference,
        messages=normalized_messages,
        item=item,
        sample=sample,
    )


def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
    row = _build_row(sample, item)
    result = _ep_eval(row=row)

    # Try to normalize different result shapes into a float score
    try:
        from collections.abc import Mapping

        if isinstance(result, (int, float)):
            return float(result)

        # EvaluateResult-like object with .score
        if hasattr(result, "score"):
            return float(result.score)

        # EvaluationRow-like object with .evaluation_result.score
        eval_res = getattr(result, "evaluation_result", None)
        if eval_res is not None:
            if isinstance(eval_res, Mapping):
                if "score" in eval_res:
                    return float(eval_res["score"])
            elif hasattr(eval_res, "score"):
                return float(eval_res.score)

        # Dict-like with score
        if isinstance(result, Mapping) and "score" in result:
            return float(result["score"])
    except Exception:
        pass

    return 0.0
"""

    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}
63 changes: 63 additions & 0 deletions eval_protocol/integrations/openai_rft/example_rapidfuzz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""
Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.

We:
- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
- Wrap it in an @evaluation_test for normal eval usage
- Convert the grading function into a Python grader spec with
`build_python_grader_from_evaluation_test`
"""

from typing import Any

from eval_protocol.integrations.openai_rft.adapter import build_python_grader_from_evaluation_test
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor


# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
# The assistant reply intentionally differs from ground_truth so the fuzzy
# score lands strictly between 0 and 1 (the README run shows ~0.756).
DEMO_ROWS = [
    EvaluationRow(
        messages=[
            Message(role="user", content="fuzzy wuzzy had no hair"),
            Message(role="assistant", content="fuzzy wuzzy was a bear"),
        ],
        ground_truth="fuzzy wuzzy had no hair",
    )
]


@evaluation_test(
    input_rows=[DEMO_ROWS],
    rollout_processor=NoOpRolloutProcessor(),
    aggregation_method="mean",
    mode="pointwise",
)
def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
    """
    Example @evaluation_test that fuzzy-matches the last assistant message
    against row.ground_truth (rapidfuzz.WRatio) and attaches an EvaluateResult.
    """
    # Imported lazily so the adapter-extracted grader source carries the import.
    from rapidfuzz import fuzz, utils

    reference = row.ground_truth

    # Scan from the end for the most recent assistant reply; fall back to ""
    # when there is none or its content is not a plain string.
    prediction = ""
    for msg in reversed(row.messages):
        if msg.role == "assistant":
            if isinstance(msg.content, str):
                prediction = msg.content
            break

    # WRatio returns 0..100; normalize to the 0..1 range EP scores use.
    similarity = fuzz.WRatio(
        str(prediction),
        str(reference),
        processor=utils.default_process,
    )
    row.evaluation_result = EvaluateResult(score=float(similarity / 100.0))
    return row


RAPIDFUZZ_PYTHON_GRADER_SPEC: dict = build_python_grader_from_evaluation_test(rapidfuzz_eval)
32 changes: 32 additions & 0 deletions eval_protocol/integrations/openai_rft/test_openai_grader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
import requests

from eval_protocol.integrations.openai_rft.adapter import build_python_grader_from_evaluation_test
from eval_protocol.integrations.openai_rft.example_rapidfuzz import rapidfuzz_eval


# Resolve the API key up front so a missing key fails with a clear message
# instead of an opaque KeyError inside the request call.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY must be set to run this script.")
headers = {"Authorization": f"Bearer {api_key}"}

# {"type": "python", "source": "..."} — grader spec derived from the EP eval.
grader = build_python_grader_from_evaluation_test(rapidfuzz_eval)

# Validate the grader definition against the OpenAI graders API.
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
    json={"grader": grader},
    headers=headers,
    timeout=60,  # requests has no default timeout; don't hang indefinitely
)
print("validate response:", resp.text)

# Run the grader once with a dummy item/sample.
payload = {
    "grader": grader,
    "item": {"reference_answer": "fuzzy wuzzy had no hair"},
    "model_sample": "fuzzy wuzzy was a bear",
}
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/run",
    json=payload,
    headers=headers,
    timeout=60,
)
print("run response:", resp.text)
Loading
Loading