Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions eval_protocol/adapters/langfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import random
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Protocol
from typing import Any, Dict, Iterator, List, Optional, Callable, TYPE_CHECKING, cast, Protocol

from eval_protocol.models import EvaluationRow, InputMetadata, Message

Expand Down Expand Up @@ -49,9 +49,14 @@ def __call__(
from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails

LANGFUSE_AVAILABLE = True
except ImportError:
except ImportError: # pragma: no cover - optional dependency
LANGFUSE_AVAILABLE = False

if TYPE_CHECKING: # pragma: no cover - import is optional at runtime
from langfuse.client import Langfuse as _LangfuseClient # type: ignore[import-not-found]
else:
_LangfuseClient = Any


def convert_trace_to_evaluation_row(
trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
Expand Down Expand Up @@ -296,7 +301,8 @@ def __init__(self):
if not LANGFUSE_AVAILABLE:
raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")

self.client = get_client()
client_factory = cast(Callable[[], _LangfuseClient], get_client)
self.client = client_factory()

def get_evaluation_rows(
self,
Expand Down
20 changes: 13 additions & 7 deletions eval_protocol/adapters/langsmith.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,23 @@
from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional, Iterable
from typing import Any, Dict, List, Optional, Iterable, Callable, TYPE_CHECKING, cast

from eval_protocol.models import EvaluationRow, InputMetadata, Message

logger = logging.getLogger(__name__)

try:
from langsmith import Client # type: ignore
from langsmith import Client as _RuntimeClient # type: ignore[attr-defined]
except ImportError: # pragma: no cover - optional dependency
_RuntimeClient = None

LANGSMITH_AVAILABLE = True
except ImportError:
LANGSMITH_AVAILABLE = False
if TYPE_CHECKING: # pragma: no cover - import is optional at runtime
from langsmith import Client as LangSmithClient # type: ignore[import-not-found]
else:
LangSmithClient = Any

LANGSMITH_AVAILABLE = _RuntimeClient is not None


class LangSmithAdapter:
Expand All @@ -34,10 +39,11 @@ class LangSmithAdapter:
- outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
"""

def __init__(self, client: Optional[Client] = None) -> None:
def __init__(self, client: Optional["LangSmithClient"] = None) -> None:
if not LANGSMITH_AVAILABLE:
raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
self.client = client or Client()
runtime_client = cast(Callable[[], "LangSmithClient"], _RuntimeClient)
self.client = client or runtime_client()

def get_evaluation_rows(
self,
Expand Down
59 changes: 59 additions & 0 deletions eval_protocol/pytest/dataset_preparation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Utilities for preparing datasets for evaluation tests."""

from collections.abc import Callable
from typing import Any

from eval_protocol.human_id import generate_id, num_combinations
from eval_protocol.models import EvaluationRow
from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs
from eval_protocol.pytest.types import Dataset

from ..common_utils import load_jsonl


def load_and_prepare_rows(
    kwargs: ParameterizedTestKwargs,
    *,
    dataset_adapter: Callable[[list[dict[str, Any]]], Dataset],
    preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None,
    max_dataset_rows: int | None,
) -> list[EvaluationRow]:
    """Load and preprocess evaluation rows based on parameterized pytest kwargs.

    Exactly one of three input sources is consulted, in priority order:
    ``dataset_path`` (one or more JSONL files, concatenated in order),
    ``input_messages`` (raw message lists, one row each), or ``input_rows``
    (pre-built :class:`EvaluationRow` objects, deep-copied). After optional
    preprocessing, any row still missing a ``row_id`` gets one derived from
    the row's hash. The behavior mirrors the original inline implementation
    inside :func:`eval_protocol.pytest.evaluation_test.evaluation_test`.

    Args:
        kwargs: Parameterized test kwargs carrying the input source.
        dataset_adapter: Converts raw JSONL dicts into a dataset of rows.
        preprocess_fn: Optional hook applied to the loaded rows.
        max_dataset_rows: If set, truncates the JSONL records before adapting.

    Returns:
        The prepared list of evaluation rows.

    Raises:
        ValueError: If none of the three input sources is provided.
    """
    if kwargs.get("dataset_path") is not None:
        # Concatenate the records from every listed JSONL file, in order.
        raw_records: list[dict[str, Any]] = [
            record
            for jsonl_path in (kwargs["dataset_path"] or [])
            for record in load_jsonl(jsonl_path)
        ]
        if max_dataset_rows is not None:
            raw_records = raw_records[:max_dataset_rows]
        rows = dataset_adapter(raw_records)
    elif kwargs.get("input_messages") is not None:
        rows = [
            EvaluationRow(messages=message_list)
            for message_list in (kwargs["input_messages"] or [])
        ]
    elif kwargs.get("input_rows") is not None:
        # Deep-copy so downstream mutation never leaks into caller-owned rows.
        rows = [source_row.model_copy(deep=True) for source_row in (kwargs["input_rows"] or [])]
    else:
        raise ValueError("No input dataset, input messages, or input rows provided")

    if preprocess_fn:
        rows = preprocess_fn(rows)

    for row in rows:
        if row.input_metadata.row_id is None:
            # Map the row's hash into [0, num_combinations() - 1] and use it as
            # the id-generation index.
            # NOTE(review): hash() of str-containing objects is salted per
            # process (PYTHONHASHSEED), so this id is stable within one run but
            # may differ across runs — confirm that is acceptable for callers
            # expecting a "stable" row_id.
            bucket = abs(hash(row)) % num_combinations()
            row.input_metadata.row_id = generate_id(seed=0, index=bucket)

    return rows
44 changes: 7 additions & 37 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
Status,
)
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
from eval_protocol.pytest.execution import execute_pytest
from eval_protocol.pytest.generate_parameter_combinations import (
Expand Down Expand Up @@ -60,8 +61,6 @@
rollout_processor_with_retry,
)

from ..common_utils import load_jsonl


def evaluation_test(
*,
Expand Down Expand Up @@ -223,43 +222,14 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger)

try:
# Handle dataset loading
data: list[EvaluationRow] = []
# Track all rows processed in the current run for error logging
processed_rows_in_run: list[EvaluationRow] = []
if "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
ds_arg: list[str] = kwargs["dataset_path"]
# Support either a single path or a list of paths; if a list is provided,
# concatenate the rows from each file in order.
data_jsonl: list[dict[str, object]] = []
for p in ds_arg:
data_jsonl.extend(load_jsonl(p))
# Apply override for max rows if present
if max_dataset_rows is not None:
data_jsonl = data_jsonl[:max_dataset_rows]
data = dataset_adapter(data_jsonl)
elif "input_messages" in kwargs and kwargs["input_messages"] is not None:
# Support either a single row (List[Message]) or many rows (List[List[Message]])
im = kwargs["input_messages"]
data = [EvaluationRow(messages=dataset_messages) for dataset_messages in im]
elif "input_rows" in kwargs and kwargs["input_rows"] is not None:
# Deep copy pre-constructed EvaluationRow objects
data = [row.model_copy(deep=True) for row in kwargs["input_rows"]]
else:
raise ValueError("No input dataset, input messages, or input rows provided")

if preprocess_fn:
data = preprocess_fn(data)

for row in data:
# generate a stable row_id for each row
if row.input_metadata.row_id is None:
# Generate a stable, deterministic row_id using the row's hash and num_combinations
index = hash(row)
max_index = num_combinations() - 1
# Ensure index is a non-negative integer within [0, max_index]
index = abs(index) % (max_index + 1)
row.input_metadata.row_id = generate_id(seed=0, index=index)
data = load_and_prepare_rows(
kwargs,
dataset_adapter=dataset_adapter,
preprocess_fn=preprocess_fn,
max_dataset_rows=max_dataset_rows,
)

completion_params = kwargs["completion_params"]
# Create eval metadata with test function info and current commit hash
Expand Down
32 changes: 25 additions & 7 deletions eval_protocol/quickstart/llm_judge_langsmith.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
"""

import asyncio
import os
from typing import Any, Dict, List, Optional

import pytest

from openai import AsyncOpenAI

from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
from eval_protocol.quickstart.utils import (
split_multi_turn_rows,
JUDGE_CONFIGS,
calculate_bootstrap_scores,
run_judgment,
run_judgment_async,
)
from eval_protocol.adapters.langsmith import LangSmithAdapter

Expand Down Expand Up @@ -91,22 +94,37 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation

judgments: List[Dict[str, Any]] = []

for row in rows:
result = run_judgment(row, model_name, judge_name)
if result and result["games"][0] and result["games"][1]:
judgments.append(result)
judge_config = JUDGE_CONFIGS[judge_name]

async with AsyncOpenAI(
api_key=judge_config.get("api_key"),
base_url=judge_config.get("base_url"),
) as shared_client:
semaphore = asyncio.Semaphore(judge_config.get("max_concurrency", 8))

async def _run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]:
async with semaphore:
return await run_judgment_async(row, model_name, judge_name, shared_client)

tasks = [_run_judgment(row) for row in rows]
for coro in asyncio.as_completed(tasks):
result = await coro
if result and result["games"][0] and result["games"][1]:
judgments.append(result)

if not judgments:
print("❌ No valid judgments generated")
return rows

print(f"✅ Generated {len(judgments)} valid judgments")

mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
if mean_score == 0.0:
bootstrap_result = calculate_bootstrap_scores(judgments)
if not bootstrap_result:
print("❌ No valid scores extracted")
return rows

mean_score, lower_score, upper_score = bootstrap_result

print("\n##### LLM Judge Results (90th percentile CI) #####")
clean_model_name = model_name.split("/")[-1]
print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
Expand Down
Loading
Loading