Improve optional dependency stubs and formatting

benjibc · benjibc · commit 44ec364e659d · 2025-09-16T21:31:32.000Z
diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
@@ -9,7 +9,7 @@
 import random
 import time
 from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Protocol
+from typing import Any, Dict, Iterator, List, Optional, Callable, TYPE_CHECKING, cast, Protocol
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
 
@@ -49,9 +49,14 @@ def __call__(
     from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails
 
     LANGFUSE_AVAILABLE = True
-except ImportError:
+except ImportError:  # pragma: no cover - optional dependency
     LANGFUSE_AVAILABLE = False
 
+if TYPE_CHECKING:  # pragma: no cover - import is optional at runtime
+    from langfuse.client import Langfuse as _LangfuseClient  # type: ignore[import-not-found]
+else:
+    _LangfuseClient = Any
+
 
 def convert_trace_to_evaluation_row(
     trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
@@ -296,7 +301,8 @@ def __init__(self):
         if not LANGFUSE_AVAILABLE:
             raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")
 
-        self.client = get_client()
+        client_factory = cast(Callable[[], _LangfuseClient], get_client)
+        self.client = client_factory()
 
     def get_evaluation_rows(
         self,
diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py
@@ -10,18 +10,23 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Dict, List, Optional, Iterable
+from typing import Any, Dict, List, Optional, Iterable, Callable, TYPE_CHECKING, cast
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
 
 logger = logging.getLogger(__name__)
 
 try:
-    from langsmith import Client  # type: ignore
+    from langsmith import Client as _RuntimeClient  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover - optional dependency
+    _RuntimeClient = None
 
-    LANGSMITH_AVAILABLE = True
-except ImportError:
-    LANGSMITH_AVAILABLE = False
+if TYPE_CHECKING:  # pragma: no cover - import is optional at runtime
+    from langsmith import Client as LangSmithClient  # type: ignore[import-not-found]
+else:
+    LangSmithClient = Any
+
+LANGSMITH_AVAILABLE = _RuntimeClient is not None
 
 
 class LangSmithAdapter:
@@ -34,10 +39,11 @@ class LangSmithAdapter:
     - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
     """
 
-    def __init__(self, client: Optional[Client] = None) -> None:
+    def __init__(self, client: Optional["LangSmithClient"] = None) -> None:
         if not LANGSMITH_AVAILABLE:
             raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
-        self.client = client or Client()
+        runtime_client = cast(Callable[[], "LangSmithClient"], _RuntimeClient)
+        self.client = client or runtime_client()
 
     def get_evaluation_rows(
         self,
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -61,6 +61,7 @@
     rollout_processor_with_retry,
 )
 
+
 def evaluation_test(
     *,
     completion_params: Sequence[CompletionParams | None] | None = None,
diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py
@@ -19,19 +19,22 @@
   pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
 """
 
+import asyncio
 import os
 from typing import Any, Dict, List, Optional
 
 import pytest
 
+from openai import AsyncOpenAI
+
 from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
 from eval_protocol.quickstart.utils import (
     split_multi_turn_rows,
     JUDGE_CONFIGS,
     calculate_bootstrap_scores,
-    run_judgment,
+    run_judgment_async,
 )
 from eval_protocol.adapters.langsmith import LangSmithAdapter
 
@@ -91,22 +94,37 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation
 
     judgments: List[Dict[str, Any]] = []
 
-    for row in rows:
-        result = run_judgment(row, model_name, judge_name)
-        if result and result["games"][0] and result["games"][1]:
-            judgments.append(result)
+    judge_config = JUDGE_CONFIGS[judge_name]
+
+    async with AsyncOpenAI(
+        api_key=judge_config.get("api_key"),
+        base_url=judge_config.get("base_url"),
+    ) as shared_client:
+        semaphore = asyncio.Semaphore(judge_config.get("max_concurrency", 8))
+
+        async def _run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]:
+            async with semaphore:
+                return await run_judgment_async(row, model_name, judge_name, shared_client)
+
+        tasks = [_run_judgment(row) for row in rows]
+        for coro in asyncio.as_completed(tasks):
+            result = await coro
+            if result and result["games"][0] and result["games"][1]:
+                judgments.append(result)
 
     if not judgments:
         print("❌ No valid judgments generated")
         return rows
 
     print(f"✅ Generated {len(judgments)} valid judgments")
 
-    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
-    if mean_score == 0.0:
+    bootstrap_result = calculate_bootstrap_scores(judgments)
+    if not bootstrap_result:
         print("❌ No valid scores extracted")
         return rows
 
+    mean_score, lower_score, upper_score = bootstrap_result
+
     print("\n##### LLM Judge Results (90th percentile CI) #####")
     clean_model_name = model_name.split("/")[-1]
     print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
diff --git a/tests/pytest/test_dataset_preparation.py b/tests/pytest/test_dataset_preparation.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import importlib
+from importlib.machinery import ModuleSpec
 import sys
 import types
 from typing import cast
@@ -23,6 +24,7 @@ def _ensure_module(name: str, **attrs) -> None:
     try:  # pragma: no cover - prefer real dependency when available
         importlib.import_module("loguru")
     except ModuleNotFoundError:
+
         class _Logger:  # pragma: no cover - inert logging shim
             def __getattr__(self, _name: str):
                 def _noop(*_args, **_kwargs):
@@ -35,14 +37,59 @@ def _noop(*_args, **_kwargs):
     def _noop_loader(*_args, **_kwargs):  # pragma: no cover - placeholder loader
         return {}
 
+    def _field_type(name: str):
+        def __init__(self, *_args, **_kwargs):
+            return None
+
+        return type(name, (), {"__init__": __init__})
+
+    class _SqliteDatabase:
+        def __init__(self, *_args, **_kwargs):
+            self.path = None
+
+        def connect(self):  # pragma: no cover - stub connection
+            return None
+
+        def close(self):  # pragma: no cover
+            return None
+
+        def atomic(self):  # pragma: no cover - context manager shim
+            class _Atomic:
+                def __enter__(self_inner):
+                    return self_inner
+
+                def __exit__(self_inner, *_exc):
+                    return False
+
+            return _Atomic()
+
+        def create_tables(self, *_args, **_kwargs):  # pragma: no cover
+            return None
+
+        def create_table(self, *_args, **_kwargs):  # pragma: no cover
+            return None
+
+        def drop_tables(self, *_args, **_kwargs):  # pragma: no cover
+            return None
+
     optional_stub_attrs = {
         "toml": {"loads": _noop_loader, "load": _noop_loader},
         "datasets": {},
         "addict": {"Dict": dict},
-        "deepdiff": {},
-        "litellm": {},
-        "peewee": {},
+        "deepdiff": {"DeepDiff": type("DeepDiff", (), {})},
+        "peewee": {
+            "Model": type("Model", (), {}),
+            "SqliteDatabase": _SqliteDatabase,
+            "CharField": _field_type("CharField"),
+            "TextField": _field_type("TextField"),
+            "IntegerField": _field_type("IntegerField"),
+            "DateTimeField": _field_type("DateTimeField"),
+            "AutoField": _field_type("AutoField"),
+            "OperationalError": Exception,
+        },
         "backoff": {},
+        "aiohttp": {"ClientSession": type("ClientSession", (), {})},
+        "tqdm": {"tqdm": lambda iterable, *_args, **_kwargs: iterable},
     }
 
     for optional_module, attrs in optional_stub_attrs.items():
@@ -51,6 +98,64 @@ def _noop_loader(*_args, **_kwargs):  # pragma: no cover - placeholder loader
         except ModuleNotFoundError:
             _ensure_module(optional_module, **attrs)
 
+    try:
+        importlib.import_module("litellm")
+    except ModuleNotFoundError:
+        litellm_mod = types.ModuleType("litellm")
+
+        def _acompletion(*_args, **_kwargs):  # pragma: no cover - stubbed async function
+            return None
+
+        def _completion_cost(*_args, **_kwargs):  # pragma: no cover - cost shim
+            return 0.0
+
+        litellm_mod.acompletion = _acompletion
+        litellm_mod.completion = _acompletion
+        litellm_mod.completion_cost = _completion_cost
+
+        caching_pkg = types.ModuleType("litellm.caching")
+        caching_submodule = types.ModuleType("litellm.caching.caching")
+        caching_submodule.Cache = type("Cache", (), {})
+        dual_cache_module = types.ModuleType("litellm.caching.dual_cache")
+        dual_cache_module.DualCache = type("DualCache", (), {})
+        in_memory_cache_module = types.ModuleType("litellm.caching.in_memory_cache")
+        in_memory_cache_module.InMemoryCache = type("InMemoryCache", (), {})
+        caching_pkg.caching = caching_submodule
+        caching_pkg.dual_cache = dual_cache_module
+        caching_pkg.in_memory_cache = in_memory_cache_module
+        redis_cache_module = types.ModuleType("litellm.caching.redis_cache")
+        redis_cache_module.RedisCache = type("RedisCache", (), {})
+        caching_pkg.redis_cache = redis_cache_module
+
+        litellm_mod.caching = caching_pkg
+
+        main_module = types.ModuleType("litellm.main")
+        main_module.ModelResponse = type("ModelResponse", (), {})
+        main_module.Usage = type("Usage", (), {})
+
+        cost_calculator_mod = types.ModuleType("litellm.cost_calculator")
+        cost_calculator_mod.cost_per_token = lambda *_args, **_kwargs: 0.0
+
+        sys.modules["litellm"] = litellm_mod
+        sys.modules["litellm.caching"] = caching_pkg
+        sys.modules["litellm.caching.caching"] = caching_submodule
+        sys.modules["litellm.caching.dual_cache"] = dual_cache_module
+        sys.modules["litellm.caching.in_memory_cache"] = in_memory_cache_module
+        sys.modules["litellm.caching.redis_cache"] = redis_cache_module
+        sys.modules["litellm.main"] = main_module
+        sys.modules["litellm.cost_calculator"] = cost_calculator_mod
+
+    try:
+        importlib.import_module("playhouse.sqlite_ext")
+    except ModuleNotFoundError:
+        playhouse_mod = types.ModuleType("playhouse")
+        sqlite_ext_mod = types.ModuleType("playhouse.sqlite_ext")
+        sqlite_ext_mod.JSONField = type("JSONField", (), {})
+        playhouse_mod.sqlite_ext = sqlite_ext_mod
+
+        sys.modules["playhouse"] = playhouse_mod
+        sys.modules["playhouse.sqlite_ext"] = sqlite_ext_mod
+
     try:
         importlib.import_module("openai")
         return
@@ -62,6 +167,7 @@ def _noop_loader(*_args, **_kwargs):  # pragma: no cover - placeholder loader
     completion_usage_mod = types.ModuleType("openai.types.completion_usage")
     chat_mod = types.ModuleType("openai.types.chat")
     chat_message_mod = types.ModuleType("openai.types.chat.chat_completion_message")
+    chat_message_param_mod = types.ModuleType("openai.types.chat.chat_completion_message_param")
     tool_call_mod = types.ModuleType("openai.types.chat.chat_completion_message_tool_call")
 
     class CompletionUsage(BaseModel):  # pragma: no cover - simple data container
@@ -77,30 +183,67 @@ class FunctionCall(BaseModel):  # pragma: no cover - simple data container
 
         model_config = ConfigDict(extra="allow")
 
+    class FunctionDefinition(BaseModel):  # pragma: no cover - simple data container
+        name: str | None = None
+        description: str | None = None
+        parameters: dict[str, Any] | None = None
+
+        model_config = ConfigDict(extra="allow")
+
+    class ChatCompletionContentPartTextParam(BaseModel):  # pragma: no cover - simple data container
+        text: str | None = None
+        type: str = "text"
+
+        model_config = ConfigDict(extra="allow")
+
     class ChatCompletionMessageToolCall(BaseModel):  # pragma: no cover - simple data container
         id: str | None = None
         type: str | None = None
         function: FunctionCall | None = None
 
         model_config = ConfigDict(extra="allow")
 
+    class ChatCompletionMessageParam(BaseModel):  # pragma: no cover - simple data container
+        content: str | None = None
+        role: str | None = None
+
+        model_config = ConfigDict(extra="allow")
+
+    class _NotGiven:  # pragma: no cover - sentinel placeholder
+        pass
+
     types_mod.CompletionUsage = CompletionUsage
     completion_usage_mod.CompletionUsage = CompletionUsage
     chat_message_mod.FunctionCall = FunctionCall
+    chat_message_param_mod.ChatCompletionMessageParam = ChatCompletionMessageParam
     tool_call_mod.ChatCompletionMessageToolCall = ChatCompletionMessageToolCall
+    chat_mod.ChatCompletionContentPartTextParam = ChatCompletionContentPartTextParam
+    types_mod.FunctionDefinition = FunctionDefinition
+
+    openai_mod.__spec__ = ModuleSpec("openai", loader=None)
+    types_mod.__spec__ = ModuleSpec("openai.types", loader=None)
+    completion_usage_mod.__spec__ = ModuleSpec("openai.types.completion_usage", loader=None)
+    chat_mod.__spec__ = ModuleSpec("openai.types.chat", loader=None)
+    chat_message_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message", loader=None)
+    chat_message_param_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_param", loader=None)
+    tool_call_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_tool_call", loader=None)
 
     openai_mod.types = types_mod
+    openai_mod.NotGiven = _NotGiven
+    openai_mod.NOT_GIVEN = _NotGiven()
     types_mod.completion_usage = completion_usage_mod
     types_mod.chat = chat_mod
     chat_mod.chat_completion_message = chat_message_mod
     chat_mod.chat_completion_message_tool_call = tool_call_mod
+    chat_mod.chat_completion_message_param = chat_message_param_mod
 
     sys.modules["openai"] = openai_mod
     sys.modules["openai.types"] = types_mod
     sys.modules["openai.types.completion_usage"] = completion_usage_mod
     sys.modules["openai.types.chat"] = chat_mod
     sys.modules["openai.types.chat.chat_completion_message"] = chat_message_mod
     sys.modules["openai.types.chat.chat_completion_message_tool_call"] = tool_call_mod
+    sys.modules["openai.types.chat.chat_completion_message_param"] = chat_message_param_mod
 
 
 _install_dependency_stubs()

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| `@@ -61,6 +61,7 @@` | | |
| `61` | `61` | `rollout_processor_with_retry,` |
| `62` | `62` | `)` |
| `63` | `63` | |
| | `64` | `+` |
| `64` | `65` | `def evaluation_test(` |
| `65` | `66` | `*,` |
| `66` | `67` | `completion_params: Sequence[CompletionParams \| None] \| None = None,` |