Skip to content

Commit 3b38996

Browse files
authored
Merge branch 'main' into shrey/OpenEnvRolloutProcessor
2 parents 9766c5d + f10c29f commit 3b38996

30 files changed

+4032
-25
lines changed

docs/intro.png

-117 KB
Loading

eval_protocol/benchmarks/test_glm_streaming_compliance.py

Lines changed: 3551 additions & 0 deletions
Large diffs are not rendered by default.

eval_protocol/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def parse_args(args=None):
402402
rft_parser.add_argument("--lora-rank", type=int, default=16)
403403
rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps")
404404
rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps")
405-
rft_parser.add_argument("--accelerator-count", type=int, default=1)
405+
rft_parser.add_argument("--accelerator-count", type=int)
406406
rft_parser.add_argument("--region", help="Fireworks region enum value")
407407
rft_parser.add_argument("--display-name", help="RFT job display name")
408408
rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id")

eval_protocol/integrations/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22

33
from .openeval import adapt
44
from .trl import create_trl_adapter
5+
from .openai_rft import build_python_grader_from_evaluation_test
56

6-
__all__ = [
7-
"adapt",
8-
"create_trl_adapter",
9-
]
7+
__all__ = ["adapt", "create_trl_adapter", "build_python_grader_from_evaluation_test"]
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""
2+
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.
3+
4+
Currently provides:
5+
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
6+
an OpenAI Python grader spec ({"type": "python", "source": ...}).
7+
"""
8+
9+
import ast
10+
import inspect
11+
import textwrap
12+
13+
14+
def build_python_grader_from_evaluation_test(test_fn) -> dict:
15+
"""
16+
Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.
17+
18+
Assumptions:
19+
- `test_fn` is either:
20+
* the core evaluation function, or
21+
* an @evaluation_test-decorated function that carries `_origin_func`.
22+
Its effective signature looks like:
23+
24+
def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow
25+
26+
- The function treats `row` as an `EvaluationRow` and only relies on attributes
27+
we provide in the duck-typed stand-in:
28+
* row.ground_truth
29+
* row.messages
30+
* row.item (raw item dict)
31+
* row.sample (raw sample dict)
32+
33+
- We map OpenAI's (sample, item) into that duck-typed `EvaluationRow` as follows:
34+
* item["reference_answer"] -> row.ground_truth
35+
* item["messages"] (if present) -> row.messages (normalized to Message-like objects)
36+
* sample["output_text"] -> appended as the last assistant message in row.messages
37+
* the original dicts are also available via row.item / row.sample
38+
39+
- The function returns either:
40+
* a numeric score, or
41+
* an object/dict with a `score` field, or
42+
* an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.
43+
"""
44+
45+
# If the user passed an @evaluation_test wrapper, try to recover the original function
46+
origin = getattr(test_fn, "_origin_func", test_fn)
47+
48+
# Get the source of the original function
49+
src = inspect.getsource(origin)
50+
src = textwrap.dedent(src)
51+
52+
# Parse into AST so we can safely strip decorators and type annotations
53+
tree = ast.parse(src)
54+
55+
class _StripAnnotationsAndDecorators(ast.NodeTransformer):
56+
def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
57+
# Drop all decorators (e.g., @evaluation_test)
58+
node.decorator_list = []
59+
# Remove return type annotation
60+
node.returns = None
61+
self.generic_visit(node)
62+
return node
63+
64+
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
65+
node.decorator_list = []
66+
node.returns = None
67+
self.generic_visit(node)
68+
return node
69+
70+
def visit_arg(self, node: ast.arg) -> ast.AST:
71+
# Remove all parameter annotations (e.g., row: EvaluationRow)
72+
node.annotation = None
73+
return node
74+
75+
transformer = _StripAnnotationsAndDecorators()
76+
tree = transformer.visit(tree)
77+
ast.fix_missing_locations(tree)
78+
79+
# Find the first function definition and rename it to _ep_eval
80+
func_node: ast.AST | None = None
81+
for node in tree.body:
82+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
83+
func_node = node
84+
break
85+
86+
if func_node is None:
87+
raise ValueError("Expected a function definition in test_fn source.")
88+
89+
func_node.name = "_ep_eval"
90+
91+
# Turn the modified AST back into source
92+
src = ast.unparse(tree)
93+
94+
# Helper code that will live *inside* the grader source
95+
helper = """
96+
from typing import Any, Dict
97+
from types import SimpleNamespace
98+
99+
100+
class EvaluationRow(SimpleNamespace):
101+
\"\"\"Minimal duck-typed stand-in for an evaluation row.
102+
103+
Extend this with whatever attributes your eval logic uses.
104+
\"\"\"
105+
pass
106+
107+
108+
class EvaluateResult(SimpleNamespace):
109+
\"\"\"Simple stand-in for Eval Protocol's EvaluateResult.
110+
111+
This lets evaluation-style functions that construct EvaluateResult(score=...)
112+
run inside the Python grader sandbox without importing eval_protocol.
113+
\"\"\"
114+
115+
def __init__(self, score: float, **kwargs: Any) -> None:
116+
super().__init__(score=score, **kwargs)
117+
118+
119+
class Message(SimpleNamespace):
120+
\"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
121+
pass
122+
123+
124+
def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
125+
# Start from any item-provided messages (EP-style), defaulting to [].
126+
raw_messages = item.get("messages") or []
127+
normalized_messages = []
128+
for m in raw_messages:
129+
if isinstance(m, dict):
130+
normalized_messages.append(
131+
Message(
132+
role=m.get("role"),
133+
content=m.get("content"),
134+
)
135+
)
136+
else:
137+
# Already Message-like; rely on duck typing (must have role/content)
138+
normalized_messages.append(m)
139+
140+
reference = item.get("reference_answer")
141+
prediction = sample.get("output_text")
142+
143+
# EP-style: ensure the model prediction is present as the last assistant message
144+
if prediction is not None:
145+
normalized_messages = list(normalized_messages) # shallow copy
146+
normalized_messages.append(Message(role="assistant", content=prediction))
147+
148+
return EvaluationRow(
149+
ground_truth=reference,
150+
messages=normalized_messages,
151+
item=item,
152+
sample=sample,
153+
)
154+
155+
156+
def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
157+
row = _build_row(sample, item)
158+
result = _ep_eval(row=row)
159+
160+
# Try to normalize different result shapes into a float score
161+
try:
162+
from collections.abc import Mapping
163+
164+
if isinstance(result, (int, float)):
165+
return float(result)
166+
167+
# EvaluateResult-like object with .score
168+
if hasattr(result, "score"):
169+
return float(result.score)
170+
171+
# EvaluationRow-like object with .evaluation_result.score
172+
eval_res = getattr(result, "evaluation_result", None)
173+
if eval_res is not None:
174+
if isinstance(eval_res, Mapping):
175+
if "score" in eval_res:
176+
return float(eval_res["score"])
177+
elif hasattr(eval_res, "score"):
178+
return float(eval_res.score)
179+
180+
# Dict-like with score
181+
if isinstance(result, Mapping) and "score" in result:
182+
return float(result["score"])
183+
except Exception:
184+
pass
185+
186+
return 0.0
187+
"""
188+
189+
full_source = src + "\n\n" + textwrap.dedent(helper)
190+
return {"type": "python", "source": full_source}

eval_protocol/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -782,6 +782,14 @@ class ExecutionMetadata(BaseModel):
782782
extra: Optional[Dict[str, Any]] = Field(
783783
default=None,
784784
description="Arbitrary execution metadata for integrations (step rewards, token IDs, debug info, etc.).",
785+
finish_reason: Optional[str] = Field(
786+
default=None,
787+
description="finish_reason reported by the completion response for this row.",
788+
)
789+
790+
tool_call_count: Optional[int] = Field(
791+
default=None,
792+
description="Number of tool calls returned in the assistant message for this row.",
785793
)
786794

787795

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
import json
23
import logging
34
import os
45
import time
@@ -98,8 +99,24 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
9899
assert isinstance(response, ModelResponse), "Response should be ModelResponse"
99100
assert isinstance(response.choices[0], Choices), "Response choice should be a Choices"
100101

101-
assistant_content = response.choices[0].message.content or ""
102-
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
102+
assistant_message = response.choices[0].message
103+
finish_reason = getattr(response.choices[0], "finish_reason", None)
104+
105+
# Extract content
106+
assistant_content = assistant_message.content or ""
107+
108+
# Extract reasoning content (if present)
109+
reasoning_content = getattr(assistant_message, "reasoning_content", None)
110+
if reasoning_content is None:
111+
reasoning_content = getattr(assistant_message, "reasoning", None)
112+
if reasoning_content is not None and not isinstance(reasoning_content, str):
113+
try:
114+
reasoning_content = json.dumps(reasoning_content)
115+
except Exception:
116+
reasoning_content = str(reasoning_content)
117+
118+
# Extract tool calls
119+
tool_calls = assistant_message.tool_calls if assistant_message.tool_calls else None
103120

104121
converted_tool_calls = None
105122
if tool_calls:
@@ -136,9 +153,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
136153
Message(
137154
role="assistant",
138155
content=assistant_content,
156+
reasoning_content=reasoning_content,
139157
tool_calls=converted_tool_calls,
140158
)
141159
]
160+
161+
row.execution_metadata.finish_reason = str(finish_reason) if finish_reason is not None else None
162+
row.execution_metadata.tool_call_count = (
163+
len(converted_tool_calls) if converted_tool_calls is not None else 0
164+
)
142165
row.execution_metadata.usage = (
143166
CompletionUsage( # Note: LiteLLM sets usage dynamically via setattr(), not as a typed field
144167
prompt_tokens=response.usage.prompt_tokens, # pyright: ignore[reportAttributeAccessIssue]

eval_protocol/pytest/evaluation_test.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,13 @@ async def _collect_result(config, lst):
620620

621621
experiment_duration_seconds = time.perf_counter() - experiment_start_time
622622

623-
# for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them
623+
if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
624+
raise AssertionError(
625+
"Some EvaluationRow instances are missing evaluation_result. "
626+
"Your @evaluation_test function must set `row.evaluation_result`"
627+
)
628+
629+
# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
624630
# rollout_id is used to differentiate the result from different completion_params
625631
if mode == "groupwise":
626632
results_by_group = [
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Example of using a rapidfuzz-based Python grader with OpenAI RFT via Eval Protocol.
3+
4+
We:
5+
- Define a grading function over a duck-typed `row` that uses rapidfuzz.WRatio
6+
- Wrap it in an @evaluation_test for normal eval usage
7+
- Convert the grading function into a Python grader spec with
8+
`build_python_grader_from_evaluation_test`
9+
"""
10+
11+
from typing import Any
12+
13+
from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
14+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
15+
from eval_protocol.pytest import evaluation_test
16+
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
17+
18+
19+
# Tiny inline demo dataset so this evaluation_test is runnable via pytest.
20+
DEMO_ROWS = [
21+
EvaluationRow(
22+
messages=[
23+
Message(role="user", content="fuzzy wuzzy had no hair"),
24+
Message(role="assistant", content="fuzzy wuzzy was a bear"),
25+
],
26+
ground_truth="fuzzy wuzzy had no hair",
27+
)
28+
]
29+
30+
31+
@evaluation_test(
32+
input_rows=[DEMO_ROWS],
33+
rollout_processor=NoOpRolloutProcessor(),
34+
aggregation_method="mean",
35+
mode="pointwise",
36+
)
37+
def rapidfuzz_eval(row: EvaluationRow, **kwargs: Any) -> EvaluationRow:
38+
"""
39+
Example @evaluation_test that scores a row using rapidfuzz.WRatio and
40+
attaches an EvaluateResult.
41+
"""
42+
# For EP evals, we compare the EvaluationRow's ground_truth to the last assistant message.
43+
reference = row.ground_truth
44+
45+
assistant_msgs = [m for m in row.messages if m.role == "assistant"]
46+
last_assistant_content = assistant_msgs[-1].content if assistant_msgs else ""
47+
prediction = last_assistant_content if isinstance(last_assistant_content, str) else ""
48+
49+
from rapidfuzz import fuzz, utils
50+
51+
score = float(
52+
fuzz.WRatio(
53+
str(prediction),
54+
str(reference),
55+
processor=utils.default_process,
56+
)
57+
/ 100.0
58+
)
59+
row.evaluation_result = EvaluateResult(score=score)
60+
return row
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""Smoke-test a generated Python grader against OpenAI's grader endpoints.

Builds a Python grader spec from the rapidfuzz example eval, validates it via
the fine_tuning/alpha/graders/validate endpoint, then runs it once with a
dummy item/sample. Requires OPENAI_API_KEY in the environment.
"""

import os

import requests

from eval_protocol.integrations.openai_rft import build_python_grader_from_evaluation_test
from examples.openai_rft.example_rapidfuzz import rapidfuzz_eval


# Fails fast with KeyError if the API key is not configured.
api_key = os.environ["OPENAI_API_KEY"]
headers = {"Authorization": f"Bearer {api_key}"}

grader = build_python_grader_from_evaluation_test(rapidfuzz_eval)  # {"type": "python", "source": "..."}

# validate the grader spec server-side before attempting to run it
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
    json={"grader": grader},
    headers=headers,
)
print("validate response:", resp.text)

# run the grader once with a dummy item/sample
# NOTE(review): item/sample keys mirror what openai_rft._build_row reads
# (reference_answer, output_text derived from model_sample) — confirm against
# the OpenAI graders API payload schema.
payload = {
    "grader": grader,
    "item": {"reference_answer": "fuzzy wuzzy had no hair"},
    "model_sample": "fuzzy wuzzy was a bear",
}
resp = requests.post(
    "https://api.openai.com/v1/fine_tuning/alpha/graders/run",
    json=payload,
    headers=headers,
)
print("run response:", resp.text)

0 commit comments

Comments
 (0)