Skip to content

Commit c260b5a

Browse files
benjibc and shreymodi1 authored
Add GEVal logprob artifacts for OpenAI and Fireworks (#376)
* Add GEval logprob artifacts for OpenAI and Fireworks * updated with fireworks * updated openai adapter snapshots --------- Co-authored-by: Shrey Modi <shreycricket10@gmail.com>
1 parent 3322e5f commit c260b5a

File tree

10 files changed

+195
-3
lines changed

10 files changed

+195
-3
lines changed

eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from eval_protocol.common_utils import load_jsonl
99
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
10-
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir
10+
from eval_protocol.directory_utils import find_eval_protocol_datasets_dir
1111

1212
if TYPE_CHECKING:
1313
from eval_protocol.models import EvaluationRow

eval_protocol/models.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,10 +517,20 @@ class Message(BaseModel):
517517
function_call: Optional[FunctionCall] = None
518518
control_plane_step: Optional[Dict[str, Any]] = None
519519
weight: Optional[int] = None
520+
logprobs: Optional[Any] = Field(
521+
default=None,
522+
description=(
523+
"Optional log probability metadata captured from the completion response. "
524+
"When present, this typically mirrors the provider-specific logprob payload."
525+
),
526+
)
520527

521528
def dump_mdoel_for_chat_completion_request(self):
    """Serialize this message keeping only fields a chat-completion API accepts.

    Eval-protocol-internal fields are stripped, as is ``logprobs`` (a response
    artifact that must not be echoed back in a request); ``None``-valued fields
    are dropped entirely.
    """
    # NOTE(review): the method name contains a typo ("mdoel") but is part of
    # the public interface — renaming it would break existing callers.
    internal_only = {"control_plane_step", "reasoning_content", "weight", "logprobs"}
    return self.model_dump(exclude_none=True, exclude=internal_only)
524534

525535
@classmethod
526536
def model_validate(cls, obj, *args, **kwargs):

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import logging
44
import os
55
import time
6-
from typing import List
6+
from dataclasses import asdict, is_dataclass
7+
from types import SimpleNamespace
8+
from typing import Any, List
79

810
import litellm
911
from litellm import acompletion
@@ -19,6 +21,28 @@
1921
logger = logging.getLogger(__name__)
2022

2123

24+
def _serialize_logprobs(logprobs: Any) -> Any:
25+
"""Best-effort conversion of provider logprobs into JSON-serializable data."""
26+
27+
if logprobs is None:
28+
return None
29+
if hasattr(logprobs, "model_dump"):
30+
try:
31+
return logprobs.model_dump()
32+
except Exception:
33+
pass
34+
if is_dataclass(logprobs) and not isinstance(logprobs, type):
35+
return asdict(logprobs)
36+
if isinstance(logprobs, SimpleNamespace):
37+
return vars(logprobs)
38+
if isinstance(logprobs, dict):
39+
return logprobs
40+
try:
41+
return json.loads(json.dumps(logprobs, default=lambda o: getattr(o, "__dict__", str(o))))
42+
except Exception:
43+
return logprobs
44+
45+
2246
class SingleTurnRolloutProcessor(RolloutProcessor):
2347
"""Single turn rollout processor for direct LLM calls."""
2448

@@ -110,6 +134,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
110134

111135
assistant_message = response.choices[0].message
112136
finish_reason = getattr(response.choices[0], "finish_reason", None)
137+
assistant_logprobs = _serialize_logprobs(getattr(response.choices[0], "logprobs", None))
113138

114139
# Extract content
115140
assistant_content = assistant_message.content or ""
@@ -164,6 +189,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
164189
content=assistant_content,
165190
reasoning_content=reasoning_content,
166191
tool_calls=converted_tool_calls,
192+
logprobs=assistant_logprobs,
167193
)
168194
]
169195

examples/deepeval/artifacts/geval_logprobs_combined.jsonl

Lines changed: 2 additions & 0 deletions
Large diffs are not rendered by default.

examples/deepeval/artifacts/geval_logprobs_fireworks.jsonl

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"messages":[{"role":"user","content":"Say hello politely."},{"role":"assistant","content":"Hello, how are you today?","logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}],"input_metadata":{"row_id":"liquid-school-081702","completion_params":{"model":"gpt-3.5-turbo","logprobs":true,"top_logprobs":3},"session_data":{"mode":"all"}},"rollout_status":{"code":100,"message":"Rollout finished","details":[]},"evaluation_result":{"score":0.9952574125139473,"is_score_valid":true,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","metrics":{"Helpful & Relevant [GEval]":{"is_score_valid":true,"score":0.9952574125139473,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. 
There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","data":{"logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}}},"agg_score":0.9952574125139473,"standard_error":0.06870295008214607},"execution_metadata":{"invocation_id":"private-thing-379551","experiment_id":"tidy-picture-368421","rollout_id":"smooth-concert-175178","run_id":"traditional-hour-630753","usage":{"completion_tokens":7,"prompt_tokens":11,"total_tokens":18},"cost_metrics":{"input_cost":5.5e-6,"output_cost":0.000010500000000000001,"total_cost_dollar":0.000016000000000000003},"duration_seconds":1.00264809600003,"experiment_duration_seconds":4.857228931000009,"finish_reason":"stop","tool_call_count":0},"created_at":"2025-12-15T16:33:31.522715Z","eval_metadata":{"name":"test_geval_with_logprobs","description":"Attach GEval scores while keeping the raw logprobs on the final message.","version":"0.0.0.dev112+g8dd93b8.dirty","status":{"code":100,"message":"Evaluation finished","details":[]},"num_runs":1,"aggregation_method":"mean"},"pid":8933}

examples/deepeval/artifacts/geval_logprobs_openai_fireworks.jsonl

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""Example evaluation_test that wraps deepeval's GEval and captures logprobs.
2+
3+
To run this example you will need `deepeval` installed and a compatible
4+
API key (e.g., OpenAI or Fireworks). You can override the base URL with
5+
``EP_LLM_API_BASE`` or ``EP_LLM_BASE_URL`` and pass provider-specific
6+
parameters through ``completion_params``. Logs are written to
7+
``~/.eval_protocol/datasets/<YYYY-MM-DD>.jsonl`` via the local filesystem
8+
logger so you can inspect the captured logprobs directly.
9+
10+
Environment variables:
11+
FIREWORKS_API_KEY - Required for Fireworks models
12+
"""
13+
14+
import os
15+
from typing import List
16+
17+
from eval_protocol.dataset_logger.local_fs_dataset_logger_adapter import LocalFSDatasetLoggerAdapter
18+
from eval_protocol.integrations.deepeval import adapt_metric
19+
from eval_protocol.models import EvaluationRow
20+
from eval_protocol.pytest import evaluation_test
21+
22+
try: # pragma: no cover - optional dependency for the example
23+
from deepeval.metrics import GEval
24+
from deepeval.models import LiteLLMModel
25+
from deepeval.test_case import LLMTestCaseParams
26+
except ImportError as exc: # pragma: no cover - optional dependency for the example
27+
raise ImportError("Install deepeval to run this example: pip install deepeval") from exc
28+
29+
# Use DeepSeek via Fireworks for the GEval judge model
30+
# Note: We need allowed_openai_params to enable top_logprobs for GEval's score normalization
31+
judge_model = LiteLLMModel(
32+
model="fireworks_ai/accounts/fireworks/models/deepseek-v3p2",
33+
api_key=os.environ.get("FIREWORKS_API_KEY"),
34+
allowed_openai_params=["top_logprobs"], # Enable logprobs for GEval normalization
35+
)
36+
37+
# Configure GEval to judge the assistant response with the full chat context.
38+
wrapped_metric = adapt_metric(
39+
GEval(
40+
name="Helpful & Relevant",
41+
criteria="Evaluate the helpfulness and relevance of the model output.",
42+
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
43+
model=judge_model,
44+
top_logprobs=5, # Fireworks max is 5 (default is 20)
45+
)
46+
)
47+
48+
49+
@evaluation_test(
50+
input_rows=[[EvaluationRow(messages=[{"role": "user", "content": "Say hello politely."}])]],
51+
completion_params=[
52+
{
53+
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p2",
54+
},
55+
],
56+
logger=LocalFSDatasetLoggerAdapter(),
57+
mode="all",
58+
)
59+
def test_geval_with_logprobs(rows: List[EvaluationRow]) -> List[EvaluationRow]:
60+
"""Attach GEval scores while keeping the raw logprobs on the final message."""
61+
62+
for row in rows:
63+
eval_result = wrapped_metric(
64+
messages=[message.model_dump(exclude_none=True) for message in row.messages],
65+
ground_truth="Hello!",
66+
)
67+
row.evaluation_result = eval_result
68+
69+
# Logprob payload is available on the last assistant message after rollout
70+
# and can be forwarded to metric metadata for debugging or analysis.
71+
last_assistant = row.messages[-1]
72+
if last_assistant.logprobs:
73+
metric_key = next(iter(eval_result.metrics))
74+
eval_result.metrics[metric_key].data["logprobs"] = last_assistant.logprobs
75+
76+
return rows

tests/adapters/__snapshots__/test_openai_responses_adapter.ambr

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@
110110
''',
111111
'control_plane_step': None,
112112
'function_call': None,
113+
'logprobs': None,
113114
'name': None,
114115
'reasoning_content': None,
115116
'role': 'system',
@@ -124,6 +125,7 @@
124125
''',
125126
'control_plane_step': None,
126127
'function_call': None,
128+
'logprobs': None,
127129
'name': None,
128130
'reasoning_content': None,
129131
'role': 'user',
@@ -134,6 +136,7 @@
134136
'content': '',
135137
'control_plane_step': None,
136138
'function_call': None,
139+
'logprobs': None,
137140
'name': None,
138141
'reasoning_content': None,
139142
'role': 'assistant',
@@ -271,6 +274,7 @@
271274
''',
272275
'control_plane_step': None,
273276
'function_call': None,
277+
'logprobs': None,
274278
'name': None,
275279
'reasoning_content': None,
276280
'role': 'tool',
@@ -288,6 +292,7 @@
288292
''',
289293
'control_plane_step': None,
290294
'function_call': None,
295+
'logprobs': None,
291296
'name': None,
292297
'reasoning_content': None,
293298
'role': 'tool',
@@ -298,6 +303,7 @@
298303
'content': 'No results found.',
299304
'control_plane_step': None,
300305
'function_call': None,
306+
'logprobs': None,
301307
'name': None,
302308
'reasoning_content': None,
303309
'role': 'tool',
@@ -308,6 +314,7 @@
308314
'content': 'No results found.',
309315
'control_plane_step': None,
310316
'function_call': None,
317+
'logprobs': None,
311318
'name': None,
312319
'reasoning_content': None,
313320
'role': 'tool',
@@ -318,6 +325,7 @@
318325
'content': 'No results found.',
319326
'control_plane_step': None,
320327
'function_call': None,
328+
'logprobs': None,
321329
'name': None,
322330
'reasoning_content': None,
323331
'role': 'tool',
@@ -338,6 +346,7 @@
338346
''',
339347
'control_plane_step': None,
340348
'function_call': None,
349+
'logprobs': None,
341350
'name': None,
342351
'reasoning_content': None,
343352
'role': 'tool',
@@ -348,6 +357,7 @@
348357
'content': 'No results found.',
349358
'control_plane_step': None,
350359
'function_call': None,
360+
'logprobs': None,
351361
'name': None,
352362
'reasoning_content': None,
353363
'role': 'tool',
@@ -358,6 +368,7 @@
358368
'content': 'No results found.',
359369
'control_plane_step': None,
360370
'function_call': None,
371+
'logprobs': None,
361372
'name': None,
362373
'reasoning_content': None,
363374
'role': 'tool',
@@ -406,6 +417,7 @@
406417
''',
407418
'control_plane_step': None,
408419
'function_call': None,
420+
'logprobs': None,
409421
'name': None,
410422
'reasoning_content': None,
411423
'role': 'assistant',
@@ -556,6 +568,7 @@
556568
''',
557569
'control_plane_step': None,
558570
'function_call': None,
571+
'logprobs': None,
559572
'name': None,
560573
'reasoning_content': None,
561574
'role': 'system',
@@ -566,6 +579,7 @@
566579
'content': 'Find all employees and their reporting hierarchy levels using a recursive CTE. Show employee name, level, and the complete hierarchy path from top to bottom.',
567580
'control_plane_step': None,
568581
'function_call': None,
582+
'logprobs': None,
569583
'name': None,
570584
'reasoning_content': None,
571585
'role': 'user',
@@ -576,6 +590,7 @@
576590
'content': '',
577591
'control_plane_step': None,
578592
'function_call': None,
593+
'logprobs': None,
579594
'name': None,
580595
'reasoning_content': None,
581596
'role': 'assistant',
@@ -606,6 +621,7 @@
606621
''',
607622
'control_plane_step': None,
608623
'function_call': None,
624+
'logprobs': None,
609625
'name': None,
610626
'reasoning_content': None,
611627
'role': 'tool',
@@ -629,6 +645,7 @@
629645
''',
630646
'control_plane_step': None,
631647
'function_call': None,
648+
'logprobs': None,
632649
'name': None,
633650
'reasoning_content': None,
634651
'role': 'assistant',

tests/test_rollout_logprobs.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import asyncio
2+
3+
import pytest
4+
from litellm.types.utils import Choices, Message as LLMMessage, ModelResponse
5+
6+
from eval_protocol.dataset_logger import default_logger
7+
from eval_protocol.models import EvaluationRow, Message
8+
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
9+
from eval_protocol.pytest.exception_config import get_default_exception_handler_config
10+
from eval_protocol.pytest.types import RolloutProcessorConfig
11+
12+
13+
def test_single_turn_rollout_captures_logprobs(monkeypatch):
    """End-to-end check that the single-turn rollout copies the provider's
    logprob payload onto the final assistant message."""
    processor = SingleTurnRolloutProcessor(drop_trailing_assistant_messages=False)

    config = RolloutProcessorConfig(
        completion_params={"model": "test-model", "logprobs": True, "top_logprobs": 2},
        mcp_config_path="",
        semaphore=asyncio.Semaphore(1),
        server_script_path=None,
        steps=1,
        logger=default_logger,
        exception_handler_config=get_default_exception_handler_config(),
    )

    row = EvaluationRow(messages=[Message(role="user", content="hi")])

    fake_logprobs = {"content": [{"token": "hello", "logprob": -0.1, "top_logprobs": []}]}

    async def fake_acompletion(**kwargs):
        # The processor must forward the logprob knobs from completion_params.
        assert kwargs["logprobs"] is True
        assert kwargs["top_logprobs"] == 2
        return ModelResponse(
            id="resp-1",
            choices=[
                Choices(
                    index=0,
                    message=LLMMessage(role="assistant", content="hello"),
                    finish_reason="stop",
                    logprobs=fake_logprobs,
                )
            ],
            created=0,
            model="test-model",
        )

    monkeypatch.setattr(
        "eval_protocol.pytest.default_single_turn_rollout_process.acompletion",
        fake_acompletion,
    )

    async def _run() -> None:
        tasks = processor([row], config)
        finished = await asyncio.gather(*tasks)

        final_message = finished[0].messages[-1]
        assert final_message.content == "hello"

        captured = final_message.logprobs
        assert isinstance(captured, dict)
        first_token = captured["content"][0]
        assert first_token["token"] == "hello"
        assert first_token["logprob"] == -0.1

    asyncio.run(_run())

0 commit comments

Comments
 (0)