Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions eval_protocol/adapters/langfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import random
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Protocol
from typing import Any, Dict, Iterator, List, Optional, Callable, TYPE_CHECKING, cast, Protocol

from eval_protocol.models import EvaluationRow, InputMetadata, Message

Expand Down Expand Up @@ -49,9 +49,14 @@ def __call__(
from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails

LANGFUSE_AVAILABLE = True
except ImportError:
except ImportError: # pragma: no cover - optional dependency
LANGFUSE_AVAILABLE = False

if TYPE_CHECKING: # pragma: no cover - import is optional at runtime
from langfuse.client import Langfuse as _LangfuseClient # type: ignore[import-not-found]
else:
_LangfuseClient = Any


def convert_trace_to_evaluation_row(
trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
Expand Down Expand Up @@ -296,7 +301,8 @@ def __init__(self):
if not LANGFUSE_AVAILABLE:
raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")

self.client = get_client()
client_factory = cast(Callable[[], _LangfuseClient], get_client)
self.client = client_factory()

def get_evaluation_rows(
self,
Expand Down
20 changes: 13 additions & 7 deletions eval_protocol/adapters/langsmith.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,23 @@
from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional, Iterable
from typing import Any, Dict, List, Optional, Iterable, Callable, TYPE_CHECKING, cast

from eval_protocol.models import EvaluationRow, InputMetadata, Message

logger = logging.getLogger(__name__)

try:
from langsmith import Client # type: ignore
from langsmith import Client as _RuntimeClient # type: ignore[attr-defined]
except ImportError: # pragma: no cover - optional dependency
_RuntimeClient = None

LANGSMITH_AVAILABLE = True
except ImportError:
LANGSMITH_AVAILABLE = False
if TYPE_CHECKING: # pragma: no cover - import is optional at runtime
from langsmith import Client as LangSmithClient # type: ignore[import-not-found]
else:
LangSmithClient = Any

LANGSMITH_AVAILABLE = _RuntimeClient is not None


class LangSmithAdapter:
Expand All @@ -34,10 +39,11 @@ class LangSmithAdapter:
- outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
"""

def __init__(self, client: Optional[Client] = None) -> None:
def __init__(self, client: Optional["LangSmithClient"] = None) -> None:
if not LANGSMITH_AVAILABLE:
raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
self.client = client or Client()
runtime_client = cast(Callable[[], "LangSmithClient"], _RuntimeClient)
self.client = client or runtime_client()

def get_evaluation_rows(
self,
Expand Down
59 changes: 59 additions & 0 deletions eval_protocol/pytest/dataset_preparation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Utilities for preparing datasets for evaluation tests."""

from collections.abc import Callable
from typing import Any

from eval_protocol.human_id import generate_id, num_combinations
from eval_protocol.models import EvaluationRow
from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs
from eval_protocol.pytest.types import Dataset

from ..common_utils import load_jsonl


def load_and_prepare_rows(
    kwargs: ParameterizedTestKwargs,
    *,
    dataset_adapter: Callable[[list[dict[str, Any]]], Dataset],
    preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None,
    max_dataset_rows: int | None,
) -> list[EvaluationRow]:
    """Load and preprocess evaluation rows based on parameterized pytest kwargs.

    Exactly one of three input sources is consulted, in priority order:
    ``dataset_path`` (one or more JSONL files, concatenated in order),
    ``input_messages`` (raw message lists, one row each), or ``input_rows``
    (pre-built :class:`EvaluationRow` objects, deep-copied). After optional
    preprocessing, any row still missing a ``row_id`` gets one derived from
    the row's hash. The behavior mirrors the original inline implementation
    inside :func:`eval_protocol.pytest.evaluation_test.evaluation_test`.

    Args:
        kwargs: Parameterized test kwargs carrying the input source.
        dataset_adapter: Converts raw JSONL dicts into a dataset of rows.
        preprocess_fn: Optional hook applied to the loaded rows.
        max_dataset_rows: If set, truncates the JSONL records before adapting.

    Returns:
        The prepared list of evaluation rows.

    Raises:
        ValueError: If none of the three input sources is provided.
    """
    if kwargs.get("dataset_path") is not None:
        # Concatenate the records from every listed JSONL file, in order.
        raw_records: list[dict[str, Any]] = [
            record
            for jsonl_path in (kwargs["dataset_path"] or [])
            for record in load_jsonl(jsonl_path)
        ]
        if max_dataset_rows is not None:
            raw_records = raw_records[:max_dataset_rows]
        rows = dataset_adapter(raw_records)
    elif kwargs.get("input_messages") is not None:
        rows = [
            EvaluationRow(messages=message_list)
            for message_list in (kwargs["input_messages"] or [])
        ]
    elif kwargs.get("input_rows") is not None:
        # Deep-copy so downstream mutation never leaks into caller-owned rows.
        rows = [source_row.model_copy(deep=True) for source_row in (kwargs["input_rows"] or [])]
    else:
        raise ValueError("No input dataset, input messages, or input rows provided")

    if preprocess_fn:
        rows = preprocess_fn(rows)

    for row in rows:
        if row.input_metadata.row_id is None:
            # Map the row's hash into [0, num_combinations() - 1] and use it as
            # the id-generation index.
            # NOTE(review): hash() of str-containing objects is salted per
            # process (PYTHONHASHSEED), so this id is stable within one run but
            # may differ across runs — confirm that is acceptable for callers
            # expecting a "stable" row_id.
            bucket = abs(hash(row)) % num_combinations()
            row.input_metadata.row_id = generate_id(seed=0, index=bucket)

    return rows
44 changes: 7 additions & 37 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
Status,
)
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
from eval_protocol.pytest.execution import execute_pytest
from eval_protocol.pytest.generate_parameter_combinations import (
Expand Down Expand Up @@ -60,8 +61,6 @@
rollout_processor_with_retry,
)

from ..common_utils import load_jsonl


def evaluation_test(
*,
Expand Down Expand Up @@ -223,43 +222,14 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger)

try:
# Handle dataset loading
data: list[EvaluationRow] = []
# Track all rows processed in the current run for error logging
processed_rows_in_run: list[EvaluationRow] = []
if "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
ds_arg: list[str] = kwargs["dataset_path"]
# Support either a single path or a list of paths; if a list is provided,
# concatenate the rows from each file in order.
data_jsonl: list[dict[str, object]] = []
for p in ds_arg:
data_jsonl.extend(load_jsonl(p))
# Apply override for max rows if present
if max_dataset_rows is not None:
data_jsonl = data_jsonl[:max_dataset_rows]
data = dataset_adapter(data_jsonl)
elif "input_messages" in kwargs and kwargs["input_messages"] is not None:
# Support either a single row (List[Message]) or many rows (List[List[Message]])
im = kwargs["input_messages"]
data = [EvaluationRow(messages=dataset_messages) for dataset_messages in im]
elif "input_rows" in kwargs and kwargs["input_rows"] is not None:
# Deep copy pre-constructed EvaluationRow objects
data = [row.model_copy(deep=True) for row in kwargs["input_rows"]]
else:
raise ValueError("No input dataset, input messages, or input rows provided")

if preprocess_fn:
data = preprocess_fn(data)

for row in data:
# generate a stable row_id for each row
if row.input_metadata.row_id is None:
# Generate a stable, deterministic row_id using the row's hash and num_combinations
index = hash(row)
max_index = num_combinations() - 1
# Ensure index is a non-negative integer within [0, max_index]
index = abs(index) % (max_index + 1)
row.input_metadata.row_id = generate_id(seed=0, index=index)
data = load_and_prepare_rows(
kwargs,
dataset_adapter=dataset_adapter,
preprocess_fn=preprocess_fn,
max_dataset_rows=max_dataset_rows,
)

completion_params = kwargs["completion_params"]
# Create eval metadata with test function info and current commit hash
Expand Down
32 changes: 25 additions & 7 deletions eval_protocol/quickstart/llm_judge_langsmith.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
"""

import asyncio
import os
from typing import Any, Dict, List, Optional

import pytest

from openai import AsyncOpenAI

from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
from eval_protocol.quickstart.utils import (
split_multi_turn_rows,
JUDGE_CONFIGS,
calculate_bootstrap_scores,
run_judgment,
run_judgment_async,
)
from eval_protocol.adapters.langsmith import LangSmithAdapter

Expand Down Expand Up @@ -91,22 +94,37 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation

judgments: List[Dict[str, Any]] = []

for row in rows:
result = run_judgment(row, model_name, judge_name)
if result and result["games"][0] and result["games"][1]:
judgments.append(result)
judge_config = JUDGE_CONFIGS[judge_name]

async with AsyncOpenAI(
api_key=judge_config.get("api_key"),
base_url=judge_config.get("base_url"),
) as shared_client:
semaphore = asyncio.Semaphore(judge_config.get("max_concurrency", 8))

async def _run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]:
async with semaphore:
return await run_judgment_async(row, model_name, judge_name, shared_client)

tasks = [_run_judgment(row) for row in rows]
for coro in asyncio.as_completed(tasks):
result = await coro
if result and result["games"][0] and result["games"][1]:
judgments.append(result)

if not judgments:
print("❌ No valid judgments generated")
return rows

print(f"✅ Generated {len(judgments)} valid judgments")

mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
if mean_score == 0.0:
bootstrap_result = calculate_bootstrap_scores(judgments)
if not bootstrap_result:
print("❌ No valid scores extracted")
return rows

mean_score, lower_score, upper_score = bootstrap_result

print("\n##### LLM Judge Results (90th percentile CI) #####")
clean_model_name = model_name.split("/")[-1]
print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
Expand Down
Loading
Loading