Skip to content

Commit 44ec364

Browse files
committed
Improve optional dependency stubs and formatting
1 parent 3d142fc commit 44ec364

File tree

5 files changed

+194
-20
lines changed

5 files changed

+194
-20
lines changed

eval_protocol/adapters/langfuse.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import random
1010
import time
1111
from datetime import datetime, timedelta
12-
from typing import Any, Dict, List, Optional, Protocol
12+
from typing import Any, Dict, Iterator, List, Optional, Callable, TYPE_CHECKING, cast, Protocol
1313

1414
from eval_protocol.models import EvaluationRow, InputMetadata, Message
1515

@@ -49,9 +49,14 @@ def __call__(
4949
from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails
5050

5151
LANGFUSE_AVAILABLE = True
52-
except ImportError:
52+
except ImportError: # pragma: no cover - optional dependency
5353
LANGFUSE_AVAILABLE = False
5454

55+
if TYPE_CHECKING: # pragma: no cover - import is optional at runtime
56+
from langfuse.client import Langfuse as _LangfuseClient # type: ignore[import-not-found]
57+
else:
58+
_LangfuseClient = Any
59+
5560

5661
def convert_trace_to_evaluation_row(
5762
trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
@@ -296,7 +301,8 @@ def __init__(self):
296301
if not LANGFUSE_AVAILABLE:
297302
raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")
298303

299-
self.client = get_client()
304+
client_factory = cast(Callable[[], _LangfuseClient], get_client)
305+
self.client = client_factory()
300306

301307
def get_evaluation_rows(
302308
self,

eval_protocol/adapters/langsmith.py

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -10,18 +10,23 @@
1010
from __future__ import annotations
1111

1212
import logging
13-
from typing import Any, Dict, List, Optional, Iterable
13+
from typing import Any, Dict, List, Optional, Iterable, Callable, TYPE_CHECKING, cast
1414

1515
from eval_protocol.models import EvaluationRow, InputMetadata, Message
1616

1717
logger = logging.getLogger(__name__)
1818

1919
try:
20-
from langsmith import Client # type: ignore
20+
from langsmith import Client as _RuntimeClient # type: ignore[attr-defined]
21+
except ImportError: # pragma: no cover - optional dependency
22+
_RuntimeClient = None
2123

22-
LANGSMITH_AVAILABLE = True
23-
except ImportError:
24-
LANGSMITH_AVAILABLE = False
24+
if TYPE_CHECKING: # pragma: no cover - import is optional at runtime
25+
from langsmith import Client as LangSmithClient # type: ignore[import-not-found]
26+
else:
27+
LangSmithClient = Any
28+
29+
LANGSMITH_AVAILABLE = _RuntimeClient is not None
2530

2631

2732
class LangSmithAdapter:
@@ -34,10 +39,11 @@ class LangSmithAdapter:
3439
- outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
3540
"""
3641

37-
def __init__(self, client: Optional[Client] = None) -> None:
42+
def __init__(self, client: Optional["LangSmithClient"] = None) -> None:
3843
if not LANGSMITH_AVAILABLE:
3944
raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
40-
self.client = client or Client()
45+
runtime_client = cast(Callable[[], "LangSmithClient"], _RuntimeClient)
46+
self.client = client or runtime_client()
4147

4248
def get_evaluation_rows(
4349
self,

eval_protocol/pytest/evaluation_test.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@
6161
rollout_processor_with_retry,
6262
)
6363

64+
6465
def evaluation_test(
6566
*,
6667
completion_params: Sequence[CompletionParams | None] | None = None,

eval_protocol/quickstart/llm_judge_langsmith.py

Lines changed: 25 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -19,19 +19,22 @@
1919
pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
2020
"""
2121

22+
import asyncio
2223
import os
2324
from typing import Any, Dict, List, Optional
2425

2526
import pytest
2627

28+
from openai import AsyncOpenAI
29+
2730
from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
2831
from eval_protocol.pytest import evaluation_test
2932
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
3033
from eval_protocol.quickstart.utils import (
3134
split_multi_turn_rows,
3235
JUDGE_CONFIGS,
3336
calculate_bootstrap_scores,
34-
run_judgment,
37+
run_judgment_async,
3538
)
3639
from eval_protocol.adapters.langsmith import LangSmithAdapter
3740

@@ -91,22 +94,37 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation
9194

9295
judgments: List[Dict[str, Any]] = []
9396

94-
for row in rows:
95-
result = run_judgment(row, model_name, judge_name)
96-
if result and result["games"][0] and result["games"][1]:
97-
judgments.append(result)
97+
judge_config = JUDGE_CONFIGS[judge_name]
98+
99+
async with AsyncOpenAI(
100+
api_key=judge_config.get("api_key"),
101+
base_url=judge_config.get("base_url"),
102+
) as shared_client:
103+
semaphore = asyncio.Semaphore(judge_config.get("max_concurrency", 8))
104+
105+
async def _run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]:
106+
async with semaphore:
107+
return await run_judgment_async(row, model_name, judge_name, shared_client)
108+
109+
tasks = [_run_judgment(row) for row in rows]
110+
for coro in asyncio.as_completed(tasks):
111+
result = await coro
112+
if result and result["games"][0] and result["games"][1]:
113+
judgments.append(result)
98114

99115
if not judgments:
100116
print("❌ No valid judgments generated")
101117
return rows
102118

103119
print(f"✅ Generated {len(judgments)} valid judgments")
104120

105-
mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
106-
if mean_score == 0.0:
121+
bootstrap_result = calculate_bootstrap_scores(judgments)
122+
if not bootstrap_result:
107123
print("❌ No valid scores extracted")
108124
return rows
109125

126+
mean_score, lower_score, upper_score = bootstrap_result
127+
110128
print("\n##### LLM Judge Results (90th percentile CI) #####")
111129
clean_model_name = model_name.split("/")[-1]
112130
print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")

tests/pytest/test_dataset_preparation.py

Lines changed: 146 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import importlib
4+
from importlib.machinery import ModuleSpec
45
import sys
56
import types
67
from typing import cast
@@ -23,6 +24,7 @@ def _ensure_module(name: str, **attrs) -> None:
2324
try: # pragma: no cover - prefer real dependency when available
2425
importlib.import_module("loguru")
2526
except ModuleNotFoundError:
27+
2628
class _Logger: # pragma: no cover - inert logging shim
2729
def __getattr__(self, _name: str):
2830
def _noop(*_args, **_kwargs):
@@ -35,14 +37,59 @@ def _noop(*_args, **_kwargs):
3537
def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader
3638
return {}
3739

40+
def _field_type(name: str):
41+
def __init__(self, *_args, **_kwargs):
42+
return None
43+
44+
return type(name, (), {"__init__": __init__})
45+
46+
class _SqliteDatabase:
47+
def __init__(self, *_args, **_kwargs):
48+
self.path = None
49+
50+
def connect(self): # pragma: no cover - stub connection
51+
return None
52+
53+
def close(self): # pragma: no cover
54+
return None
55+
56+
def atomic(self): # pragma: no cover - context manager shim
57+
class _Atomic:
58+
def __enter__(self_inner):
59+
return self_inner
60+
61+
def __exit__(self_inner, *_exc):
62+
return False
63+
64+
return _Atomic()
65+
66+
def create_tables(self, *_args, **_kwargs): # pragma: no cover
67+
return None
68+
69+
def create_table(self, *_args, **_kwargs): # pragma: no cover
70+
return None
71+
72+
def drop_tables(self, *_args, **_kwargs): # pragma: no cover
73+
return None
74+
3875
optional_stub_attrs = {
3976
"toml": {"loads": _noop_loader, "load": _noop_loader},
4077
"datasets": {},
4178
"addict": {"Dict": dict},
42-
"deepdiff": {},
43-
"litellm": {},
44-
"peewee": {},
79+
"deepdiff": {"DeepDiff": type("DeepDiff", (), {})},
80+
"peewee": {
81+
"Model": type("Model", (), {}),
82+
"SqliteDatabase": _SqliteDatabase,
83+
"CharField": _field_type("CharField"),
84+
"TextField": _field_type("TextField"),
85+
"IntegerField": _field_type("IntegerField"),
86+
"DateTimeField": _field_type("DateTimeField"),
87+
"AutoField": _field_type("AutoField"),
88+
"OperationalError": Exception,
89+
},
4590
"backoff": {},
91+
"aiohttp": {"ClientSession": type("ClientSession", (), {})},
92+
"tqdm": {"tqdm": lambda iterable, *_args, **_kwargs: iterable},
4693
}
4794

4895
for optional_module, attrs in optional_stub_attrs.items():
@@ -51,6 +98,64 @@ def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader
5198
except ModuleNotFoundError:
5299
_ensure_module(optional_module, **attrs)
53100

101+
try:
102+
importlib.import_module("litellm")
103+
except ModuleNotFoundError:
104+
litellm_mod = types.ModuleType("litellm")
105+
106+
def _acompletion(*_args, **_kwargs): # pragma: no cover - stubbed async function
107+
return None
108+
109+
def _completion_cost(*_args, **_kwargs): # pragma: no cover - cost shim
110+
return 0.0
111+
112+
litellm_mod.acompletion = _acompletion
113+
litellm_mod.completion = _acompletion
114+
litellm_mod.completion_cost = _completion_cost
115+
116+
caching_pkg = types.ModuleType("litellm.caching")
117+
caching_submodule = types.ModuleType("litellm.caching.caching")
118+
caching_submodule.Cache = type("Cache", (), {})
119+
dual_cache_module = types.ModuleType("litellm.caching.dual_cache")
120+
dual_cache_module.DualCache = type("DualCache", (), {})
121+
in_memory_cache_module = types.ModuleType("litellm.caching.in_memory_cache")
122+
in_memory_cache_module.InMemoryCache = type("InMemoryCache", (), {})
123+
caching_pkg.caching = caching_submodule
124+
caching_pkg.dual_cache = dual_cache_module
125+
caching_pkg.in_memory_cache = in_memory_cache_module
126+
redis_cache_module = types.ModuleType("litellm.caching.redis_cache")
127+
redis_cache_module.RedisCache = type("RedisCache", (), {})
128+
caching_pkg.redis_cache = redis_cache_module
129+
130+
litellm_mod.caching = caching_pkg
131+
132+
main_module = types.ModuleType("litellm.main")
133+
main_module.ModelResponse = type("ModelResponse", (), {})
134+
main_module.Usage = type("Usage", (), {})
135+
136+
cost_calculator_mod = types.ModuleType("litellm.cost_calculator")
137+
cost_calculator_mod.cost_per_token = lambda *_args, **_kwargs: 0.0
138+
139+
sys.modules["litellm"] = litellm_mod
140+
sys.modules["litellm.caching"] = caching_pkg
141+
sys.modules["litellm.caching.caching"] = caching_submodule
142+
sys.modules["litellm.caching.dual_cache"] = dual_cache_module
143+
sys.modules["litellm.caching.in_memory_cache"] = in_memory_cache_module
144+
sys.modules["litellm.caching.redis_cache"] = redis_cache_module
145+
sys.modules["litellm.main"] = main_module
146+
sys.modules["litellm.cost_calculator"] = cost_calculator_mod
147+
148+
try:
149+
importlib.import_module("playhouse.sqlite_ext")
150+
except ModuleNotFoundError:
151+
playhouse_mod = types.ModuleType("playhouse")
152+
sqlite_ext_mod = types.ModuleType("playhouse.sqlite_ext")
153+
sqlite_ext_mod.JSONField = type("JSONField", (), {})
154+
playhouse_mod.sqlite_ext = sqlite_ext_mod
155+
156+
sys.modules["playhouse"] = playhouse_mod
157+
sys.modules["playhouse.sqlite_ext"] = sqlite_ext_mod
158+
54159
try:
55160
importlib.import_module("openai")
56161
return
@@ -62,6 +167,7 @@ def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader
62167
completion_usage_mod = types.ModuleType("openai.types.completion_usage")
63168
chat_mod = types.ModuleType("openai.types.chat")
64169
chat_message_mod = types.ModuleType("openai.types.chat.chat_completion_message")
170+
chat_message_param_mod = types.ModuleType("openai.types.chat.chat_completion_message_param")
65171
tool_call_mod = types.ModuleType("openai.types.chat.chat_completion_message_tool_call")
66172

67173
class CompletionUsage(BaseModel): # pragma: no cover - simple data container
@@ -77,30 +183,67 @@ class FunctionCall(BaseModel): # pragma: no cover - simple data container
77183

78184
model_config = ConfigDict(extra="allow")
79185

186+
class FunctionDefinition(BaseModel): # pragma: no cover - simple data container
187+
name: str | None = None
188+
description: str | None = None
189+
parameters: dict[str, Any] | None = None
190+
191+
model_config = ConfigDict(extra="allow")
192+
193+
class ChatCompletionContentPartTextParam(BaseModel): # pragma: no cover - simple data container
194+
text: str | None = None
195+
type: str = "text"
196+
197+
model_config = ConfigDict(extra="allow")
198+
80199
class ChatCompletionMessageToolCall(BaseModel): # pragma: no cover - simple data container
81200
id: str | None = None
82201
type: str | None = None
83202
function: FunctionCall | None = None
84203

85204
model_config = ConfigDict(extra="allow")
86205

206+
class ChatCompletionMessageParam(BaseModel): # pragma: no cover - simple data container
207+
content: str | None = None
208+
role: str | None = None
209+
210+
model_config = ConfigDict(extra="allow")
211+
212+
class _NotGiven: # pragma: no cover - sentinel placeholder
213+
pass
214+
87215
types_mod.CompletionUsage = CompletionUsage
88216
completion_usage_mod.CompletionUsage = CompletionUsage
89217
chat_message_mod.FunctionCall = FunctionCall
218+
chat_message_param_mod.ChatCompletionMessageParam = ChatCompletionMessageParam
90219
tool_call_mod.ChatCompletionMessageToolCall = ChatCompletionMessageToolCall
220+
chat_mod.ChatCompletionContentPartTextParam = ChatCompletionContentPartTextParam
221+
types_mod.FunctionDefinition = FunctionDefinition
222+
223+
openai_mod.__spec__ = ModuleSpec("openai", loader=None)
224+
types_mod.__spec__ = ModuleSpec("openai.types", loader=None)
225+
completion_usage_mod.__spec__ = ModuleSpec("openai.types.completion_usage", loader=None)
226+
chat_mod.__spec__ = ModuleSpec("openai.types.chat", loader=None)
227+
chat_message_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message", loader=None)
228+
chat_message_param_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_param", loader=None)
229+
tool_call_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_tool_call", loader=None)
91230

92231
openai_mod.types = types_mod
232+
openai_mod.NotGiven = _NotGiven
233+
openai_mod.NOT_GIVEN = _NotGiven()
93234
types_mod.completion_usage = completion_usage_mod
94235
types_mod.chat = chat_mod
95236
chat_mod.chat_completion_message = chat_message_mod
96237
chat_mod.chat_completion_message_tool_call = tool_call_mod
238+
chat_mod.chat_completion_message_param = chat_message_param_mod
97239

98240
sys.modules["openai"] = openai_mod
99241
sys.modules["openai.types"] = types_mod
100242
sys.modules["openai.types.completion_usage"] = completion_usage_mod
101243
sys.modules["openai.types.chat"] = chat_mod
102244
sys.modules["openai.types.chat.chat_completion_message"] = chat_message_mod
103245
sys.modules["openai.types.chat.chat_completion_message_tool_call"] = tool_call_mod
246+
sys.modules["openai.types.chat.chat_completion_message_param"] = chat_message_param_mod
104247

105248

106249
_install_dependency_stubs()

0 commit comments

Comments
 (0)