From cf5ccddc92481f255f69d5ac3a056bc72c804b7a Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:21:44 -0700 Subject: [PATCH 1/4] Document stub imports for dataset prep tests --- eval_protocol/pytest/dataset_preparation.py | 59 ++++ eval_protocol/pytest/evaluation_test.py | 45 +-- tests/pytest/test_dataset_preparation.py | 289 ++++++++++++++++++++ 3 files changed, 355 insertions(+), 38 deletions(-) create mode 100644 eval_protocol/pytest/dataset_preparation.py create mode 100644 tests/pytest/test_dataset_preparation.py diff --git a/eval_protocol/pytest/dataset_preparation.py b/eval_protocol/pytest/dataset_preparation.py new file mode 100644 index 00000000..ee75c15e --- /dev/null +++ b/eval_protocol/pytest/dataset_preparation.py @@ -0,0 +1,59 @@ +"""Utilities for preparing datasets for evaluation tests.""" + +from collections.abc import Callable +from typing import Any + +from eval_protocol.human_id import generate_id, num_combinations +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs +from eval_protocol.pytest.types import Dataset + +from ..common_utils import load_jsonl + + +def load_and_prepare_rows( + kwargs: ParameterizedTestKwargs, + *, + dataset_adapter: Callable[[list[dict[str, Any]]], Dataset], + preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None, + max_dataset_rows: int | None, +) -> list[EvaluationRow]: + """Load and preprocess evaluation rows based on parameterized pytest kwargs. + + This helper consolidates the logic that loads input data from various sources + (dataset paths, raw messages, or pre-built :class:`EvaluationRow` objects), + applies optional preprocessing, and ensures each row has a stable + ``row_id``. The behavior mirrors the original inline implementation inside + :func:`eval_protocol.pytest.evaluation_test.evaluation_test`. + """ + + data: list[EvaluationRow] = [] + + if kwargs.get("dataset_path") is not None: + ds_arg = kwargs["dataset_path"] + data_jsonl: list[dict[str, Any]] = [] + for path in ds_arg or []: + data_jsonl.extend(load_jsonl(path)) + if max_dataset_rows is not None: + data_jsonl = data_jsonl[:max_dataset_rows] + data = dataset_adapter(data_jsonl) + elif kwargs.get("input_messages") is not None: + input_messages = kwargs["input_messages"] or [] + data = [EvaluationRow(messages=dataset_messages) for dataset_messages in input_messages] + elif kwargs.get("input_rows") is not None: + input_rows = kwargs["input_rows"] or [] + data = [row.model_copy(deep=True) for row in input_rows] + else: + raise ValueError("No input dataset, input messages, or input rows provided") + + if preprocess_fn: + data = preprocess_fn(data) + + for row in data: + if row.input_metadata.row_id is None: + index = hash(row) + max_index = num_combinations() - 1 + index = abs(index) % (max_index + 1) + row.input_metadata.row_id = generate_id(seed=0, index=index) + + return data diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index a7ec65f3..176fb114 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -23,6 +23,7 @@ Status, ) from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper +from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows from eval_protocol.pytest.evaluation_test_postprocess import postprocess from eval_protocol.pytest.execution import execute_pytest from eval_protocol.pytest.generate_parameter_combinations import ( @@ -60,9 +61,6 @@ rollout_processor_with_retry, ) -from ..common_utils import load_jsonl - - def evaluation_test( *, completion_params: Sequence[CompletionParams | None] | None = None, @@ -223,43 +221,14 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger) try: - # Handle dataset loading - data: list[EvaluationRow] = [] # Track all rows processed in the current run for error logging processed_rows_in_run: list[EvaluationRow] = [] - if "dataset_path" in kwargs and kwargs["dataset_path"] is not None: - ds_arg: list[str] = kwargs["dataset_path"] - # Support either a single path or a list of paths; if a list is provided, - # concatenate the rows from each file in order. - data_jsonl: list[dict[str, object]] = [] - for p in ds_arg: - data_jsonl.extend(load_jsonl(p)) - # Apply override for max rows if present - if max_dataset_rows is not None: - data_jsonl = data_jsonl[:max_dataset_rows] - data = dataset_adapter(data_jsonl) - elif "input_messages" in kwargs and kwargs["input_messages"] is not None: - # Support either a single row (List[Message]) or many rows (List[List[Message]]) - im = kwargs["input_messages"] - data = [EvaluationRow(messages=dataset_messages) for dataset_messages in im] - elif "input_rows" in kwargs and kwargs["input_rows"] is not None: - # Deep copy pre-constructed EvaluationRow objects - data = [row.model_copy(deep=True) for row in kwargs["input_rows"]] - else: - raise ValueError("No input dataset, input messages, or input rows provided") - - if preprocess_fn: - data = preprocess_fn(data) - - for row in data: - # generate a stable row_id for each row - if row.input_metadata.row_id is None: - # Generate a stable, deterministic row_id using the row's hash and num_combinations - index = hash(row) - max_index = num_combinations() - 1 - # Ensure index is a non-negative integer within [0, max_index] - index = abs(index) % (max_index + 1) - row.input_metadata.row_id = generate_id(seed=0, index=index) + data = load_and_prepare_rows( + kwargs, + dataset_adapter=dataset_adapter, + preprocess_fn=preprocess_fn, + max_dataset_rows=max_dataset_rows, + ) completion_params = kwargs["completion_params"] # Create eval metadata with test function info and current commit hash diff --git a/tests/pytest/test_dataset_preparation.py b/tests/pytest/test_dataset_preparation.py new file mode 100644 index 00000000..81deeccc --- /dev/null +++ b/tests/pytest/test_dataset_preparation.py @@ -0,0 +1,289 @@ +import sys +import types +from pathlib import Path +from typing import cast + +import pytest + +# The evaluation modules depend on optional third-party packages that aren't +# available in the execution environment for these unit tests. To exercise the +# real implementation of ``load_and_prepare_rows`` without installing those +# packages, we register lightweight stubs in ``sys.modules`` that provide the +# minimal interfaces required during import. + +PACKAGE_ROOT = Path(__file__).resolve().parents[2] / "eval_protocol" +sys.modules.pop("eval_protocol", None) +sys.modules.pop("eval_protocol.pytest", None) + +eval_protocol_pkg = types.ModuleType("eval_protocol") +eval_protocol_pkg.__path__ = [str(PACKAGE_ROOT)] # type: ignore[attr-defined] +sys.modules["eval_protocol"] = eval_protocol_pkg + +pytest_pkg = types.ModuleType("eval_protocol.pytest") +pytest_pkg.__path__ = [str(PACKAGE_ROOT / "pytest")] # type: ignore[attr-defined] +sys.modules["eval_protocol.pytest"] = pytest_pkg +setattr(eval_protocol_pkg, "pytest", pytest_pkg) + +if "loguru" not in sys.modules: + loguru_module = types.ModuleType("loguru") + + class _DummyLogger: + def __getattr__(self, _name): # pragma: no cover - dynamic fallback + def _noop(*_args, **_kwargs): + return None + + return _noop + + loguru_module.logger = _DummyLogger() # type: ignore[attr-defined] + sys.modules["loguru"] = loguru_module + +if "toml" not in sys.modules: + toml_module = types.ModuleType("toml") + + def _noop_load(*_args, **_kwargs): # pragma: no cover - helper for stubbing + return {} + + def _noop_dump(*_args, **_kwargs): # pragma: no cover - helper for stubbing + return None + + toml_module.load = _noop_load # type: ignore[attr-defined] + toml_module.dump = _noop_dump # type: ignore[attr-defined] + sys.modules["toml"] = toml_module + +if "addict" not in sys.modules: + addict_module = types.ModuleType("addict") + + class _AddictDict(dict): # pragma: no cover - simple stub + def __getattr__(self, item): + try: + return self[item] + except KeyError as exc: + raise AttributeError(item) from exc + + def __setattr__(self, key, value): + self[key] = value + + addict_module.Dict = _AddictDict # type: ignore[attr-defined] + sys.modules["addict"] = addict_module + +if "eval_protocol.mcp_env" not in sys.modules: + mcp_env_module = types.ModuleType("eval_protocol.mcp_env") + + class _DummyPolicy: # pragma: no cover - stub placeholder + pass + + def _noop(*_args, **_kwargs): + return None + + mcp_env_module.AnthropicPolicy = _DummyPolicy # type: ignore[attr-defined] + mcp_env_module.FireworksPolicy = _DummyPolicy # type: ignore[attr-defined] + mcp_env_module.LiteLLMPolicy = _DummyPolicy # type: ignore[attr-defined] + mcp_env_module.OpenAIPolicy = _DummyPolicy # type: ignore[attr-defined] + mcp_env_module.make = _noop # type: ignore[attr-defined] + mcp_env_module.rollout = _noop # type: ignore[attr-defined] + mcp_env_module.test_mcp = _noop # type: ignore[attr-defined] + sys.modules["eval_protocol.mcp_env"] = mcp_env_module + +if "eval_protocol.mcp" not in sys.modules: + sys.modules["eval_protocol.mcp"] = types.ModuleType("eval_protocol.mcp") + +if "eval_protocol.rewards" not in sys.modules: + sys.modules["eval_protocol.rewards"] = types.ModuleType("eval_protocol.rewards") + +if "eval_protocol.dataset_logger" not in sys.modules: + dataset_logger_module = types.ModuleType("eval_protocol.dataset_logger") + + class _StubDatasetLogger: # pragma: no cover - stub placeholder + def log(self, *_args, **_kwargs): + return None + + dataset_logger_module.default_logger = _StubDatasetLogger() # type: ignore[attr-defined] + sys.modules["eval_protocol.dataset_logger"] = dataset_logger_module + +if "eval_protocol.dataset_logger.dataset_logger" not in sys.modules: + dataset_logger_pkg = types.ModuleType("eval_protocol.dataset_logger.dataset_logger") + dataset_logger_pkg.DatasetLogger = _StubDatasetLogger # type: ignore[attr-defined] + sys.modules["eval_protocol.dataset_logger.dataset_logger"] = dataset_logger_pkg + +if "backoff" not in sys.modules: + backoff_module = types.ModuleType("backoff") + + def _noop_decorator(*_args, **_kwargs): + def _decorator(func): + return func + + return _decorator + + backoff_module.on_exception = _noop_decorator # type: ignore[attr-defined] + backoff_module.expo = lambda *args, **kwargs: None # type: ignore[attr-defined] + sys.modules["backoff"] = backoff_module + +if "litellm" not in sys.modules: + litellm_module = types.ModuleType("litellm") + cost_calculator_module = types.ModuleType("litellm.cost_calculator") + cost_calculator_module.cost_per_token = lambda *args, **kwargs: 0.0 # type: ignore[attr-defined] + sys.modules["litellm"] = litellm_module + sys.modules["litellm.cost_calculator"] = cost_calculator_module + +if "tqdm" not in sys.modules: + tqdm_module = types.ModuleType("tqdm") + + def _noop_tqdm(iterable=None, **_kwargs): + return iterable if iterable is not None else [] + + tqdm_module.tqdm = _noop_tqdm # type: ignore[attr-defined] + sys.modules["tqdm"] = tqdm_module + +if "openai" not in sys.modules: + openai_module = types.ModuleType("openai") + openai_types_module = types.ModuleType("openai.types") + openai_chat_module = types.ModuleType("openai.types.chat") + openai_chat_completion_module = types.ModuleType("openai.types.chat.chat_completion_message") + openai_chat_tool_module = types.ModuleType("openai.types.chat.chat_completion_message_tool_call") + + class _NotGiven: # pragma: no cover - stub placeholder + pass + + openai_module.NOT_GIVEN = _NotGiven() # type: ignore[attr-defined] + openai_module.NotGiven = _NotGiven # type: ignore[attr-defined] + + openai_types_module.CompletionUsage = object # type: ignore[attr-defined] + openai_chat_completion_module.FunctionCall = object # type: ignore[attr-defined] + openai_chat_tool_module.ChatCompletionMessageToolCall = object # type: ignore[attr-defined] + + openai_types_module.chat = openai_chat_module # type: ignore[attr-defined] + openai_chat_module.chat_completion_message = openai_chat_completion_module # type: ignore[attr-defined] + openai_chat_module.chat_completion_message_tool_call = openai_chat_tool_module # type: ignore[attr-defined] + openai_module.types = openai_types_module # type: ignore[attr-defined] + + sys.modules["openai"] = openai_module + sys.modules["openai.types"] = openai_types_module + sys.modules["openai.types.chat"] = openai_chat_module + sys.modules["openai.types.chat.chat_completion_message"] = openai_chat_completion_module + sys.modules["openai.types.chat.chat_completion_message_tool_call"] = openai_chat_tool_module + +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows +from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs + + +def _make_kwargs(**overrides) -> ParameterizedTestKwargs: + base: ParameterizedTestKwargs = { + "dataset_path": None, + "completion_params": None, + "input_messages": None, + "input_rows": None, + "evaluation_test_kwargs": None, + } + base.update(overrides) + return cast(ParameterizedTestKwargs, base) + + +def test_load_and_prepare_rows_from_dataset(monkeypatch): + dataset_contents = { + "file1": [{"text": "f1a"}, {"text": "f1b"}], + "file2": [{"text": "f2a"}, {"text": "f2b"}], + } + load_calls: list[str] = [] + + def fake_load_jsonl(path: str): + load_calls.append(path) + return dataset_contents[path] + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.load_jsonl", fake_load_jsonl) + + generated_args: list[dict[str, int | None]] = [] + + def fake_generate_id(separator: str = "-", seed: int | None = None, index: int | None = None) -> str: + generated_args.append({"seed": seed, "index": index}) + return f"id-{index}" + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fake_generate_id) + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.num_combinations", lambda: 10) + + adapter_inputs: list[list[dict[str, str]]] = [] + + def dataset_adapter(data): + adapter_inputs.append(list(data)) + rows: list[EvaluationRow] = [] + for entry in data: + rows.append(EvaluationRow(messages=[Message(role="user", content=entry["text"])])) + return rows + + preprocess_calls: list[list[EvaluationRow]] = [] + + def preprocess(rows: list[EvaluationRow]) -> list[EvaluationRow]: + preprocess_calls.append(list(rows)) + return rows + + kwargs = _make_kwargs(dataset_path=["file1", "file2"]) + + result = load_and_prepare_rows( + kwargs, + dataset_adapter=dataset_adapter, + preprocess_fn=preprocess, + max_dataset_rows=3, + ) + + assert load_calls == ["file1", "file2"], "Expected to load all dataset paths" + assert len(adapter_inputs) == 1 + assert len(adapter_inputs[0]) == 3, "max_dataset_rows should truncate concatenated data" + assert preprocess_calls and preprocess_calls[0] == result + assert all(row.input_metadata.row_id is not None for row in result) + assert len(generated_args) == len(result) + assert all(call["seed"] == 0 for call in generated_args) + assert all(0 <= call["index"] < 10 for call in generated_args if call["index"] is not None) + + +def test_load_and_prepare_rows_from_messages(monkeypatch): + generated_indices: list[int | None] = [] + + def fake_generate_id(separator: str = "-", seed: int | None = None, index: int | None = None) -> str: + generated_indices.append(index) + return f"row-{index}" + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fake_generate_id) + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.num_combinations", lambda: 8) + + kwargs = _make_kwargs( + input_messages=[ + [Message(role="system", content="system")], + [Message(role="user", content="question")], + ] + ) + + result = load_and_prepare_rows( + kwargs, + dataset_adapter=lambda data: pytest.fail("dataset_adapter should not be used"), + preprocess_fn=None, + max_dataset_rows=None, + ) + + assert [row.messages for row in result] == kwargs["input_messages"] + assert generated_indices and all(index is not None for index in generated_indices) + + +def test_load_and_prepare_rows_deep_copies_input_rows(monkeypatch): + def fail_generate_id(*_args, **_kwargs): # pragma: no cover - should never be called + raise AssertionError("generate_id should not be called when row_id already exists") + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fail_generate_id) + + original = EvaluationRow(messages=[Message(role="user", content="hi")]) + original.input_metadata.row_id = "existing-id" + + kwargs = _make_kwargs(input_rows=[original]) + + result = load_and_prepare_rows( + kwargs, + dataset_adapter=lambda data: pytest.fail("dataset_adapter should not be used"), + preprocess_fn=None, + max_dataset_rows=None, + ) + + assert len(result) == 1 + assert result[0] is not original + assert result[0].input_metadata.row_id == "existing-id" + + result[0].messages[0].content = "changed" + assert original.messages[0].content == "hi", "Deep copy should isolate message objects" From 0232312806f73a972abe4bae723f7591775678b0 Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Mon, 15 Sep 2025 23:32:59 -0700 Subject: [PATCH 2/4] Require real openai dependency in dataset preparation tests --- tests/pytest/test_dataset_preparation.py | 160 +---------------------- 1 file changed, 1 insertion(+), 159 deletions(-) diff --git a/tests/pytest/test_dataset_preparation.py b/tests/pytest/test_dataset_preparation.py index 81deeccc..d6d207f5 100644 --- a/tests/pytest/test_dataset_preparation.py +++ b/tests/pytest/test_dataset_preparation.py @@ -1,166 +1,8 @@ -import sys -import types -from pathlib import Path from typing import cast import pytest -# The evaluation modules depend on optional third-party packages that aren't -# available in the execution environment for these unit tests. To exercise the -# real implementation of ``load_and_prepare_rows`` without installing those -# packages, we register lightweight stubs in ``sys.modules`` that provide the -# minimal interfaces required during import. - -PACKAGE_ROOT = Path(__file__).resolve().parents[2] / "eval_protocol" -sys.modules.pop("eval_protocol", None) -sys.modules.pop("eval_protocol.pytest", None) - -eval_protocol_pkg = types.ModuleType("eval_protocol") -eval_protocol_pkg.__path__ = [str(PACKAGE_ROOT)] # type: ignore[attr-defined] -sys.modules["eval_protocol"] = eval_protocol_pkg - -pytest_pkg = types.ModuleType("eval_protocol.pytest") -pytest_pkg.__path__ = [str(PACKAGE_ROOT / "pytest")] # type: ignore[attr-defined] -sys.modules["eval_protocol.pytest"] = pytest_pkg -setattr(eval_protocol_pkg, "pytest", pytest_pkg) - -if "loguru" not in sys.modules: - loguru_module = types.ModuleType("loguru") - - class _DummyLogger: - def __getattr__(self, _name): # pragma: no cover - dynamic fallback - def _noop(*_args, **_kwargs): - return None - - return _noop - - loguru_module.logger = _DummyLogger() # type: ignore[attr-defined] - sys.modules["loguru"] = loguru_module - -if "toml" not in sys.modules: - toml_module = types.ModuleType("toml") - - def _noop_load(*_args, **_kwargs): # pragma: no cover - helper for stubbing - return {} - - def _noop_dump(*_args, **_kwargs): # pragma: no cover - helper for stubbing - return None - - toml_module.load = _noop_load # type: ignore[attr-defined] - toml_module.dump = _noop_dump # type: ignore[attr-defined] - sys.modules["toml"] = toml_module - -if "addict" not in sys.modules: - addict_module = types.ModuleType("addict") - - class _AddictDict(dict): # pragma: no cover - simple stub - def __getattr__(self, item): - try: - return self[item] - except KeyError as exc: - raise AttributeError(item) from exc - - def __setattr__(self, key, value): - self[key] = value - - addict_module.Dict = _AddictDict # type: ignore[attr-defined] - sys.modules["addict"] = addict_module - -if "eval_protocol.mcp_env" not in sys.modules: - mcp_env_module = types.ModuleType("eval_protocol.mcp_env") - - class _DummyPolicy: # pragma: no cover - stub placeholder - pass - - def _noop(*_args, **_kwargs): - return None - - mcp_env_module.AnthropicPolicy = _DummyPolicy # type: ignore[attr-defined] - mcp_env_module.FireworksPolicy = _DummyPolicy # type: ignore[attr-defined] - mcp_env_module.LiteLLMPolicy = _DummyPolicy # type: ignore[attr-defined] - mcp_env_module.OpenAIPolicy = _DummyPolicy # type: ignore[attr-defined] - mcp_env_module.make = _noop # type: ignore[attr-defined] - mcp_env_module.rollout = _noop # type: ignore[attr-defined] - mcp_env_module.test_mcp = _noop # type: ignore[attr-defined] - sys.modules["eval_protocol.mcp_env"] = mcp_env_module - -if "eval_protocol.mcp" not in sys.modules: - sys.modules["eval_protocol.mcp"] = types.ModuleType("eval_protocol.mcp") - -if "eval_protocol.rewards" not in sys.modules: - sys.modules["eval_protocol.rewards"] = types.ModuleType("eval_protocol.rewards") - -if "eval_protocol.dataset_logger" not in sys.modules: - dataset_logger_module = types.ModuleType("eval_protocol.dataset_logger") - - class _StubDatasetLogger: # pragma: no cover - stub placeholder - def log(self, *_args, **_kwargs): - return None - - dataset_logger_module.default_logger = _StubDatasetLogger() # type: ignore[attr-defined] - sys.modules["eval_protocol.dataset_logger"] = dataset_logger_module - -if "eval_protocol.dataset_logger.dataset_logger" not in sys.modules: - dataset_logger_pkg = types.ModuleType("eval_protocol.dataset_logger.dataset_logger") - dataset_logger_pkg.DatasetLogger = _StubDatasetLogger # type: ignore[attr-defined] - sys.modules["eval_protocol.dataset_logger.dataset_logger"] = dataset_logger_pkg - -if "backoff" not in sys.modules: - backoff_module = types.ModuleType("backoff") - - def _noop_decorator(*_args, **_kwargs): - def _decorator(func): - return func - - return _decorator - - backoff_module.on_exception = _noop_decorator # type: ignore[attr-defined] - backoff_module.expo = lambda *args, **kwargs: None # type: ignore[attr-defined] - sys.modules["backoff"] = backoff_module - -if "litellm" not in sys.modules: - litellm_module = types.ModuleType("litellm") - cost_calculator_module = types.ModuleType("litellm.cost_calculator") - cost_calculator_module.cost_per_token = lambda *args, **kwargs: 0.0 # type: ignore[attr-defined] - sys.modules["litellm"] = litellm_module - sys.modules["litellm.cost_calculator"] = cost_calculator_module - -if "tqdm" not in sys.modules: - tqdm_module = types.ModuleType("tqdm") - - def _noop_tqdm(iterable=None, **_kwargs): - return iterable if iterable is not None else [] - - tqdm_module.tqdm = _noop_tqdm # type: ignore[attr-defined] - sys.modules["tqdm"] = tqdm_module - -if "openai" not in sys.modules: - openai_module = types.ModuleType("openai") - openai_types_module = types.ModuleType("openai.types") - openai_chat_module = types.ModuleType("openai.types.chat") - openai_chat_completion_module = types.ModuleType("openai.types.chat.chat_completion_message") - openai_chat_tool_module = types.ModuleType("openai.types.chat.chat_completion_message_tool_call") - - class _NotGiven: # pragma: no cover - stub placeholder - pass - - openai_module.NOT_GIVEN = _NotGiven() # type: ignore[attr-defined] - openai_module.NotGiven = _NotGiven # type: ignore[attr-defined] - - openai_types_module.CompletionUsage = object # type: ignore[attr-defined] - openai_chat_completion_module.FunctionCall = object # type: ignore[attr-defined] - openai_chat_tool_module.ChatCompletionMessageToolCall = object # type: ignore[attr-defined] - - openai_types_module.chat = openai_chat_module # type: ignore[attr-defined] - openai_chat_module.chat_completion_message = openai_chat_completion_module # type: ignore[attr-defined] - openai_chat_module.chat_completion_message_tool_call = openai_chat_tool_module # type: ignore[attr-defined] - openai_module.types = openai_types_module # type: ignore[attr-defined] - - sys.modules["openai"] = openai_module - sys.modules["openai.types"] = openai_types_module - sys.modules["openai.types.chat"] = openai_chat_module - sys.modules["openai.types.chat.chat_completion_message"] = openai_chat_completion_module - sys.modules["openai.types.chat.chat_completion_message_tool_call"] = openai_chat_tool_module +pytest.importorskip("openai") from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows From 3d142fc1741e43aa220587b7d7505bd59f70314c Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Mon, 15 Sep 2025 23:33:04 -0700 Subject: [PATCH 3/4] Allow dataset preparation tests to run without openai --- tests/pytest/test_dataset_preparation.py | 103 ++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/tests/pytest/test_dataset_preparation.py b/tests/pytest/test_dataset_preparation.py index d6d207f5..ef428012 100644 --- a/tests/pytest/test_dataset_preparation.py +++ b/tests/pytest/test_dataset_preparation.py @@ -1,8 +1,109 @@ +from __future__ import annotations + +import importlib +import sys +import types from typing import cast import pytest +from pydantic import BaseModel, ConfigDict + + +def _install_dependency_stubs() -> None: + """Register lightweight stubs for optional runtime dependencies.""" + + def _ensure_module(name: str, **attrs) -> None: + if name in sys.modules: + return + module = types.ModuleType(name) + for key, value in attrs.items(): + setattr(module, key, value) + sys.modules[name] = module + + try: # pragma: no cover - prefer real dependency when available + importlib.import_module("loguru") + except ModuleNotFoundError: + class _Logger: # pragma: no cover - inert logging shim + def __getattr__(self, _name: str): + def _noop(*_args, **_kwargs): + return None + + return _noop + + _ensure_module("loguru", logger=_Logger()) + + def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader + return {} + + optional_stub_attrs = { + "toml": {"loads": _noop_loader, "load": _noop_loader}, + "datasets": {}, + "addict": {"Dict": dict}, + "deepdiff": {}, + "litellm": {}, + "peewee": {}, + "backoff": {}, + } -pytest.importorskip("openai") + for optional_module, attrs in optional_stub_attrs.items(): + try: + importlib.import_module(optional_module) + except ModuleNotFoundError: + _ensure_module(optional_module, **attrs) + + try: + importlib.import_module("openai") + return + except ModuleNotFoundError: + pass + + openai_mod = types.ModuleType("openai") + types_mod = types.ModuleType("openai.types") + completion_usage_mod = types.ModuleType("openai.types.completion_usage") + chat_mod = types.ModuleType("openai.types.chat") + chat_message_mod = types.ModuleType("openai.types.chat.chat_completion_message") + tool_call_mod = types.ModuleType("openai.types.chat.chat_completion_message_tool_call") + + class CompletionUsage(BaseModel): # pragma: no cover - simple data container + prompt_tokens: int | None = None + completion_tokens: int | None = None + total_tokens: int | None = None + + model_config = ConfigDict(extra="allow") + + class FunctionCall(BaseModel): # pragma: no cover - simple data container + name: str | None = None + arguments: str | None = None + + model_config = ConfigDict(extra="allow") + + class ChatCompletionMessageToolCall(BaseModel): # pragma: no cover - simple data container + id: str | None = None + type: str | None = None + function: FunctionCall | None = None + + model_config = ConfigDict(extra="allow") + + types_mod.CompletionUsage = CompletionUsage + completion_usage_mod.CompletionUsage = CompletionUsage + chat_message_mod.FunctionCall = FunctionCall + tool_call_mod.ChatCompletionMessageToolCall = ChatCompletionMessageToolCall + + openai_mod.types = types_mod + types_mod.completion_usage = completion_usage_mod + types_mod.chat = chat_mod + chat_mod.chat_completion_message = chat_message_mod + chat_mod.chat_completion_message_tool_call = tool_call_mod + + sys.modules["openai"] = openai_mod + sys.modules["openai.types"] = types_mod + sys.modules["openai.types.completion_usage"] = completion_usage_mod + sys.modules["openai.types.chat"] = chat_mod + sys.modules["openai.types.chat.chat_completion_message"] = chat_message_mod + sys.modules["openai.types.chat.chat_completion_message_tool_call"] = tool_call_mod + + +_install_dependency_stubs() from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows From 44ec364e659da12928007a4352bbed6cc4f260ba Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Tue, 16 Sep 2025 00:01:57 -0700 Subject: [PATCH 4/4] Improve optional dependency stubs and formatting --- eval_protocol/adapters/langfuse.py | 12 +- eval_protocol/adapters/langsmith.py | 20 ++- eval_protocol/pytest/evaluation_test.py | 1 + .../quickstart/llm_judge_langsmith.py | 32 +++- tests/pytest/test_dataset_preparation.py | 149 +++++++++++++++++- 5 files changed, 194 insertions(+), 20 deletions(-) diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index e3f3144a..b157662e 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -9,7 +9,7 @@ import random import time from datetime import datetime, timedelta -from typing import Any, Dict, List, Optional, Protocol +from typing import Any, Dict, Iterator, List, Optional, Callable, TYPE_CHECKING, cast, Protocol from eval_protocol.models import EvaluationRow, InputMetadata, Message @@ -49,9 +49,14 @@ def __call__( from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails LANGFUSE_AVAILABLE = True -except ImportError: +except ImportError: # pragma: no cover - optional dependency LANGFUSE_AVAILABLE = False +if TYPE_CHECKING: # pragma: no cover - import is optional at runtime + from langfuse.client import Langfuse as _LangfuseClient # type: ignore[import-not-found] +else: + _LangfuseClient = Any + def convert_trace_to_evaluation_row( trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None @@ -296,7 +301,8 @@ def __init__(self): if not LANGFUSE_AVAILABLE: raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'") - self.client = get_client() + client_factory = cast(Callable[[], _LangfuseClient], get_client) + self.client = client_factory() def get_evaluation_rows( self, diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py index 1d29b66a..a47d854f 100644 --- a/eval_protocol/adapters/langsmith.py +++ b/eval_protocol/adapters/langsmith.py @@ -10,18 +10,23 @@ from __future__ import annotations import logging -from typing import Any, Dict, List, Optional, Iterable +from typing import Any, Dict, List, Optional, Iterable, Callable, TYPE_CHECKING, cast from eval_protocol.models import EvaluationRow, InputMetadata, Message logger = logging.getLogger(__name__) try: - from langsmith import Client # type: ignore + from langsmith import Client as _RuntimeClient # type: ignore[attr-defined] +except ImportError: # pragma: no cover - optional dependency + _RuntimeClient = None - LANGSMITH_AVAILABLE = True -except ImportError: - LANGSMITH_AVAILABLE = False +if TYPE_CHECKING: # pragma: no cover - import is optional at runtime + from langsmith import Client as LangSmithClient # type: ignore[import-not-found] +else: + LangSmithClient = Any + +LANGSMITH_AVAILABLE = _RuntimeClient is not None class LangSmithAdapter: @@ -34,10 +39,11 @@ class LangSmithAdapter: - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict] """ - def __init__(self, client: Optional[Client] = None) -> None: + def __init__(self, client: Optional["LangSmithClient"] = None) -> None: if not LANGSMITH_AVAILABLE: raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'") - self.client = client or Client() + runtime_client = cast(Callable[[], "LangSmithClient"], _RuntimeClient) + self.client = client or runtime_client() def get_evaluation_rows( self, diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 176fb114..aa160f1b 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -61,6 +61,7 @@ rollout_processor_with_retry, ) + def evaluation_test( *, completion_params: Sequence[CompletionParams | None] | None = None, diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index f4efb7f5..1ffe2353 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -19,11 +19,14 @@ pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s """ +import asyncio import os from typing import Any, Dict, List, Optional import pytest +from openai import AsyncOpenAI + from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor @@ -31,7 +34,7 @@ split_multi_turn_rows, JUDGE_CONFIGS, calculate_bootstrap_scores, - run_judgment, + run_judgment_async, ) from eval_protocol.adapters.langsmith import LangSmithAdapter @@ -91,10 +94,23 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation judgments: List[Dict[str, Any]] = [] - for row in rows: - result = run_judgment(row, model_name, judge_name) - if result and result["games"][0] and result["games"][1]: - judgments.append(result) + judge_config = JUDGE_CONFIGS[judge_name] + + async with AsyncOpenAI( + api_key=judge_config.get("api_key"), + base_url=judge_config.get("base_url"), + ) as shared_client: + semaphore = asyncio.Semaphore(judge_config.get("max_concurrency", 8)) + + async def _run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]: + async with semaphore: + return await run_judgment_async(row, model_name, judge_name, shared_client) + + tasks = [_run_judgment(row) for row in rows] + for coro in asyncio.as_completed(tasks): + result = await coro + if result and result["games"][0] and result["games"][1]: + judgments.append(result) if not judgments: print("❌ No valid judgments generated") @@ -102,11 +118,13 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation print(f"✅ Generated {len(judgments)} valid judgments") - mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments) - if mean_score == 0.0: + bootstrap_result = calculate_bootstrap_scores(judgments) + if not bootstrap_result: print("❌ No valid scores extracted") return rows + mean_score, lower_score, upper_score = bootstrap_result + print("\n##### LLM Judge Results (90th percentile CI) #####") clean_model_name = model_name.split("/")[-1] print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})") diff --git a/tests/pytest/test_dataset_preparation.py b/tests/pytest/test_dataset_preparation.py index ef428012..cbf786a8 100644 --- a/tests/pytest/test_dataset_preparation.py +++ b/tests/pytest/test_dataset_preparation.py @@ -1,6 +1,7 @@ from __future__ import annotations import importlib +from importlib.machinery import ModuleSpec import sys import types from typing import cast @@ -23,6 +24,7 @@ def _ensure_module(name: str, **attrs) -> None: try: # pragma: no cover - prefer real dependency when available importlib.import_module("loguru") except ModuleNotFoundError: + class _Logger: # pragma: no cover - inert logging shim def __getattr__(self, _name: str): def _noop(*_args, **_kwargs): @@ -35,14 +37,59 @@ def _noop(*_args, **_kwargs): def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader return {} + def _field_type(name: str): + def __init__(self, *_args, **_kwargs): + return None + + return type(name, (), {"__init__": __init__}) + + class _SqliteDatabase: + def __init__(self, *_args, **_kwargs): + self.path = None + + def connect(self): # pragma: no cover - stub connection + return None + + def close(self): # pragma: no cover + return None + + def atomic(self): # pragma: no cover - context manager shim + class _Atomic: + def __enter__(self_inner): + return self_inner + + def __exit__(self_inner, *_exc): + return False + + return _Atomic() + + def create_tables(self, *_args, **_kwargs): # pragma: no cover + return None + + def create_table(self, *_args, **_kwargs): # pragma: no cover + return None + + def drop_tables(self, *_args, **_kwargs): # pragma: no cover + return None + optional_stub_attrs = { "toml": {"loads": _noop_loader, "load": _noop_loader}, "datasets": {}, "addict": {"Dict": dict}, - "deepdiff": {}, - "litellm": {}, - "peewee": {}, + "deepdiff": {"DeepDiff": type("DeepDiff", (), {})}, + "peewee": { + "Model": type("Model", (), {}), + "SqliteDatabase": _SqliteDatabase, + "CharField": _field_type("CharField"), + "TextField": _field_type("TextField"), + "IntegerField": _field_type("IntegerField"), + "DateTimeField": _field_type("DateTimeField"), + "AutoField": _field_type("AutoField"), + "OperationalError": Exception, + }, "backoff": {}, + "aiohttp": {"ClientSession": type("ClientSession", (), {})}, + "tqdm": {"tqdm": lambda iterable, *_args, **_kwargs: iterable}, } for optional_module, attrs in optional_stub_attrs.items(): @@ -51,6 +98,64 @@ def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader except ModuleNotFoundError: _ensure_module(optional_module, **attrs) + try: + importlib.import_module("litellm") + except ModuleNotFoundError: + litellm_mod = types.ModuleType("litellm") + + def _acompletion(*_args, **_kwargs): # pragma: no cover - stubbed async function + return None + + def _completion_cost(*_args, **_kwargs): # pragma: no cover - cost shim + return 0.0 + + litellm_mod.acompletion = _acompletion + litellm_mod.completion = _acompletion + litellm_mod.completion_cost = _completion_cost + + caching_pkg = types.ModuleType("litellm.caching") + caching_submodule = types.ModuleType("litellm.caching.caching") + caching_submodule.Cache = type("Cache", (), {}) + dual_cache_module = types.ModuleType("litellm.caching.dual_cache") + dual_cache_module.DualCache = type("DualCache", (), {}) + in_memory_cache_module = types.ModuleType("litellm.caching.in_memory_cache") + in_memory_cache_module.InMemoryCache = type("InMemoryCache", (), {}) + caching_pkg.caching = caching_submodule + caching_pkg.dual_cache = dual_cache_module + caching_pkg.in_memory_cache = in_memory_cache_module + redis_cache_module = types.ModuleType("litellm.caching.redis_cache") + redis_cache_module.RedisCache = type("RedisCache", (), {}) + caching_pkg.redis_cache = redis_cache_module + + litellm_mod.caching = caching_pkg + + main_module = types.ModuleType("litellm.main") + main_module.ModelResponse = type("ModelResponse", (), {}) + main_module.Usage = type("Usage", (), {}) + + cost_calculator_mod = types.ModuleType("litellm.cost_calculator") + cost_calculator_mod.cost_per_token = lambda *_args, **_kwargs: 0.0 + + sys.modules["litellm"] = litellm_mod + sys.modules["litellm.caching"] = caching_pkg + sys.modules["litellm.caching.caching"] = caching_submodule + sys.modules["litellm.caching.dual_cache"] = dual_cache_module + sys.modules["litellm.caching.in_memory_cache"] = in_memory_cache_module + sys.modules["litellm.caching.redis_cache"] = redis_cache_module + sys.modules["litellm.main"] = main_module + sys.modules["litellm.cost_calculator"] = cost_calculator_mod + + try: + importlib.import_module("playhouse.sqlite_ext") + except ModuleNotFoundError: + playhouse_mod = types.ModuleType("playhouse") + sqlite_ext_mod = types.ModuleType("playhouse.sqlite_ext") + sqlite_ext_mod.JSONField = type("JSONField", (), {}) + playhouse_mod.sqlite_ext = sqlite_ext_mod + + sys.modules["playhouse"] = playhouse_mod + sys.modules["playhouse.sqlite_ext"] = sqlite_ext_mod + try: importlib.import_module("openai") return @@ -62,6 +167,7 @@ def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader completion_usage_mod = types.ModuleType("openai.types.completion_usage") chat_mod = types.ModuleType("openai.types.chat") chat_message_mod = types.ModuleType("openai.types.chat.chat_completion_message") + chat_message_param_mod = types.ModuleType("openai.types.chat.chat_completion_message_param") tool_call_mod = types.ModuleType("openai.types.chat.chat_completion_message_tool_call") class CompletionUsage(BaseModel): # pragma: no cover - simple data container @@ -77,6 +183,19 @@ class FunctionCall(BaseModel): # pragma: no cover - simple data container model_config = ConfigDict(extra="allow") + class FunctionDefinition(BaseModel): # pragma: no cover - simple data container + name: str | None = None + description: str | None = None + parameters: dict[str, Any] | None = None + + model_config = ConfigDict(extra="allow") + + class ChatCompletionContentPartTextParam(BaseModel): # pragma: no cover - simple data container + text: str | None = None + type: str = "text" + + model_config = ConfigDict(extra="allow") + class ChatCompletionMessageToolCall(BaseModel): # pragma: no cover - simple data container id: str | None = None type: str | None = None @@ -84,16 +203,39 @@ class ChatCompletionMessageToolCall(BaseModel): # pragma: no cover - simple dat model_config = ConfigDict(extra="allow") + class ChatCompletionMessageParam(BaseModel): # pragma: no cover - simple data container + content: str | None = None + role: str | None = None + + model_config = ConfigDict(extra="allow") + + class _NotGiven: # pragma: no cover - sentinel placeholder + pass + types_mod.CompletionUsage = CompletionUsage completion_usage_mod.CompletionUsage = CompletionUsage chat_message_mod.FunctionCall = FunctionCall + chat_message_param_mod.ChatCompletionMessageParam = ChatCompletionMessageParam tool_call_mod.ChatCompletionMessageToolCall = ChatCompletionMessageToolCall + chat_mod.ChatCompletionContentPartTextParam = ChatCompletionContentPartTextParam + types_mod.FunctionDefinition = FunctionDefinition + + openai_mod.__spec__ = ModuleSpec("openai", loader=None) + types_mod.__spec__ = ModuleSpec("openai.types", loader=None) + completion_usage_mod.__spec__ = ModuleSpec("openai.types.completion_usage", loader=None) + chat_mod.__spec__ = ModuleSpec("openai.types.chat", loader=None) + chat_message_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message", loader=None) + chat_message_param_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_param", loader=None) + tool_call_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_tool_call", loader=None) openai_mod.types = types_mod + openai_mod.NotGiven = _NotGiven + openai_mod.NOT_GIVEN = _NotGiven() types_mod.completion_usage = completion_usage_mod types_mod.chat = chat_mod chat_mod.chat_completion_message = chat_message_mod chat_mod.chat_completion_message_tool_call = tool_call_mod + chat_mod.chat_completion_message_param = chat_message_param_mod sys.modules["openai"] = openai_mod sys.modules["openai.types"] = types_mod @@ -101,6 +243,7 @@ class ChatCompletionMessageToolCall(BaseModel): # pragma: no cover - simple dat sys.modules["openai.types.chat"] = chat_mod sys.modules["openai.types.chat.chat_completion_message"] = chat_message_mod sys.modules["openai.types.chat.chat_completion_message_tool_call"] = tool_call_mod + sys.modules["openai.types.chat.chat_completion_message_param"] = chat_message_param_mod _install_dependency_stubs()