diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index e3f3144a..b157662e 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -9,7 +9,7 @@ import random import time from datetime import datetime, timedelta -from typing import Any, Dict, List, Optional, Protocol +from typing import Any, Dict, Iterator, List, Optional, Callable, TYPE_CHECKING, cast, Protocol from eval_protocol.models import EvaluationRow, InputMetadata, Message @@ -49,9 +49,14 @@ def __call__( from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails LANGFUSE_AVAILABLE = True -except ImportError: +except ImportError: # pragma: no cover - optional dependency LANGFUSE_AVAILABLE = False +if TYPE_CHECKING: # pragma: no cover - import is optional at runtime + from langfuse.client import Langfuse as _LangfuseClient # type: ignore[import-not-found] +else: + _LangfuseClient = Any + def convert_trace_to_evaluation_row( trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None @@ -296,7 +301,8 @@ def __init__(self): if not LANGFUSE_AVAILABLE: raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'") - self.client = get_client() + client_factory = cast(Callable[[], _LangfuseClient], get_client) + self.client = client_factory() def get_evaluation_rows( self, diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py index 1d29b66a..a47d854f 100644 --- a/eval_protocol/adapters/langsmith.py +++ b/eval_protocol/adapters/langsmith.py @@ -10,18 +10,23 @@ from __future__ import annotations import logging -from typing import Any, Dict, List, Optional, Iterable +from typing import Any, Dict, List, Optional, Iterable, Callable, TYPE_CHECKING, cast from eval_protocol.models import EvaluationRow, InputMetadata, Message logger = logging.getLogger(__name__) try: - from langsmith import Client # type: ignore + from langsmith import Client as _RuntimeClient # type: ignore[attr-defined] +except ImportError: # pragma: no cover - optional dependency + _RuntimeClient = None - LANGSMITH_AVAILABLE = True -except ImportError: - LANGSMITH_AVAILABLE = False +if TYPE_CHECKING: # pragma: no cover - import is optional at runtime + from langsmith import Client as LangSmithClient # type: ignore[import-not-found] +else: + LangSmithClient = Any + +LANGSMITH_AVAILABLE = _RuntimeClient is not None class LangSmithAdapter: @@ -34,10 +39,11 @@ class LangSmithAdapter: - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict] """ - def __init__(self, client: Optional[Client] = None) -> None: + def __init__(self, client: Optional["LangSmithClient"] = None) -> None: if not LANGSMITH_AVAILABLE: raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'") - self.client = client or Client() + runtime_client = cast(Callable[[], "LangSmithClient"], _RuntimeClient) + self.client = client or runtime_client() def get_evaluation_rows( self, diff --git a/eval_protocol/pytest/dataset_preparation.py b/eval_protocol/pytest/dataset_preparation.py new file mode 100644 index 00000000..ee75c15e --- /dev/null +++ b/eval_protocol/pytest/dataset_preparation.py @@ -0,0 +1,59 @@ +"""Utilities for preparing datasets for evaluation tests.""" + +from collections.abc import Callable +from typing import Any + +from eval_protocol.human_id import generate_id, num_combinations +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs +from eval_protocol.pytest.types import Dataset + +from ..common_utils import load_jsonl + + +def load_and_prepare_rows( + kwargs: ParameterizedTestKwargs, + *, + dataset_adapter: Callable[[list[dict[str, Any]]], Dataset], + preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None, + max_dataset_rows: int | None, +) -> list[EvaluationRow]: + """Load and preprocess evaluation rows based on parameterized pytest kwargs. + + This helper consolidates the logic that loads input data from various sources + (dataset paths, raw messages, or pre-built :class:`EvaluationRow` objects), + applies optional preprocessing, and ensures each row has a stable + ``row_id``. The behavior mirrors the original inline implementation inside + :func:`eval_protocol.pytest.evaluation_test.evaluation_test`. + """ + + data: list[EvaluationRow] = [] + + if kwargs.get("dataset_path") is not None: + ds_arg = kwargs["dataset_path"] + data_jsonl: list[dict[str, Any]] = [] + for path in ds_arg or []: + data_jsonl.extend(load_jsonl(path)) + if max_dataset_rows is not None: + data_jsonl = data_jsonl[:max_dataset_rows] + data = dataset_adapter(data_jsonl) + elif kwargs.get("input_messages") is not None: + input_messages = kwargs["input_messages"] or [] + data = [EvaluationRow(messages=dataset_messages) for dataset_messages in input_messages] + elif kwargs.get("input_rows") is not None: + input_rows = kwargs["input_rows"] or [] + data = [row.model_copy(deep=True) for row in input_rows] + else: + raise ValueError("No input dataset, input messages, or input rows provided") + + if preprocess_fn: + data = preprocess_fn(data) + + for row in data: + if row.input_metadata.row_id is None: + index = hash(row) + max_index = num_combinations() - 1 + index = abs(index) % (max_index + 1) + row.input_metadata.row_id = generate_id(seed=0, index=index) + + return data diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index a7ec65f3..aa160f1b 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -23,6 +23,7 @@ Status, ) from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper +from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows from eval_protocol.pytest.evaluation_test_postprocess import postprocess from eval_protocol.pytest.execution import execute_pytest from eval_protocol.pytest.generate_parameter_combinations import ( @@ -60,8 +61,6 @@ rollout_processor_with_retry, ) -from ..common_utils import load_jsonl - def evaluation_test( *, @@ -223,43 +222,14 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger) try: - # Handle dataset loading - data: list[EvaluationRow] = [] # Track all rows processed in the current run for error logging processed_rows_in_run: list[EvaluationRow] = [] - if "dataset_path" in kwargs and kwargs["dataset_path"] is not None: - ds_arg: list[str] = kwargs["dataset_path"] - # Support either a single path or a list of paths; if a list is provided, - # concatenate the rows from each file in order. - data_jsonl: list[dict[str, object]] = [] - for p in ds_arg: - data_jsonl.extend(load_jsonl(p)) - # Apply override for max rows if present - if max_dataset_rows is not None: - data_jsonl = data_jsonl[:max_dataset_rows] - data = dataset_adapter(data_jsonl) - elif "input_messages" in kwargs and kwargs["input_messages"] is not None: - # Support either a single row (List[Message]) or many rows (List[List[Message]]) - im = kwargs["input_messages"] - data = [EvaluationRow(messages=dataset_messages) for dataset_messages in im] - elif "input_rows" in kwargs and kwargs["input_rows"] is not None: - # Deep copy pre-constructed EvaluationRow objects - data = [row.model_copy(deep=True) for row in kwargs["input_rows"]] - else: - raise ValueError("No input dataset, input messages, or input rows provided") - - if preprocess_fn: - data = preprocess_fn(data) - - for row in data: - # generate a stable row_id for each row - if row.input_metadata.row_id is None: - # Generate a stable, deterministic row_id using the row's hash and num_combinations - index = hash(row) - max_index = num_combinations() - 1 - # Ensure index is a non-negative integer within [0, max_index] - index = abs(index) % (max_index + 1) - row.input_metadata.row_id = generate_id(seed=0, index=index) + data = load_and_prepare_rows( + kwargs, + dataset_adapter=dataset_adapter, + preprocess_fn=preprocess_fn, + max_dataset_rows=max_dataset_rows, + ) completion_params = kwargs["completion_params"] # Create eval metadata with test function info and current commit hash diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index f4efb7f5..1ffe2353 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -19,11 +19,14 @@ pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s """ +import asyncio import os from typing import Any, Dict, List, Optional import pytest +from openai import AsyncOpenAI + from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor @@ -31,7 +34,7 @@ split_multi_turn_rows, JUDGE_CONFIGS, calculate_bootstrap_scores, - run_judgment, + run_judgment_async, ) from eval_protocol.adapters.langsmith import LangSmithAdapter @@ -91,10 +94,23 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation judgments: List[Dict[str, Any]] = [] - for row in rows: - result = run_judgment(row, model_name, judge_name) - if result and result["games"][0] and result["games"][1]: - judgments.append(result) + judge_config = JUDGE_CONFIGS[judge_name] + + async with AsyncOpenAI( + api_key=judge_config.get("api_key"), + base_url=judge_config.get("base_url"), + ) as shared_client: + semaphore = asyncio.Semaphore(judge_config.get("max_concurrency", 8)) + + async def _run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]: + async with semaphore: + return await run_judgment_async(row, model_name, judge_name, shared_client) + + tasks = [_run_judgment(row) for row in rows] + for coro in asyncio.as_completed(tasks): + result = await coro + if result and result["games"][0] and result["games"][1]: + judgments.append(result) if not judgments: print("❌ No valid judgments generated") @@ -102,11 +118,13 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation print(f"✅ Generated {len(judgments)} valid judgments") - mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments) - if mean_score == 0.0: + bootstrap_result = calculate_bootstrap_scores(judgments) + if not bootstrap_result: print("❌ No valid scores extracted") return rows + mean_score, lower_score, upper_score = bootstrap_result + print("\n##### LLM Judge Results (90th percentile CI) #####") clean_model_name = model_name.split("/")[-1] print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})") diff --git a/tests/pytest/test_dataset_preparation.py b/tests/pytest/test_dataset_preparation.py new file mode 100644 index 00000000..cbf786a8 --- /dev/null +++ b/tests/pytest/test_dataset_preparation.py @@ -0,0 +1,375 @@ +from __future__ import annotations + +import importlib +from importlib.machinery import ModuleSpec +import sys +import types +from typing import cast + +import pytest +from pydantic import BaseModel, ConfigDict + + +def _install_dependency_stubs() -> None: + """Register lightweight stubs for optional runtime dependencies.""" + + def _ensure_module(name: str, **attrs) -> None: + if name in sys.modules: + return + module = types.ModuleType(name) + for key, value in attrs.items(): + setattr(module, key, value) + sys.modules[name] = module + + try: # pragma: no cover - prefer real dependency when available + importlib.import_module("loguru") + except ModuleNotFoundError: + + class _Logger: # pragma: no cover - inert logging shim + def __getattr__(self, _name: str): + def _noop(*_args, **_kwargs): + return None + + return _noop + + _ensure_module("loguru", logger=_Logger()) + + def _noop_loader(*_args, **_kwargs): # pragma: no cover - placeholder loader + return {} + + def _field_type(name: str): + def __init__(self, *_args, **_kwargs): + return None + + return type(name, (), {"__init__": __init__}) + + class _SqliteDatabase: + def __init__(self, *_args, **_kwargs): + self.path = None + + def connect(self): # pragma: no cover - stub connection + return None + + def close(self): # pragma: no cover + return None + + def atomic(self): # pragma: no cover - context manager shim + class _Atomic: + def __enter__(self_inner): + return self_inner + + def __exit__(self_inner, *_exc): + return False + + return _Atomic() + + def create_tables(self, *_args, **_kwargs): # pragma: no cover + return None + + def create_table(self, *_args, **_kwargs): # pragma: no cover + return None + + def drop_tables(self, *_args, **_kwargs): # pragma: no cover + return None + + optional_stub_attrs = { + "toml": {"loads": _noop_loader, "load": _noop_loader}, + "datasets": {}, + "addict": {"Dict": dict}, + "deepdiff": {"DeepDiff": type("DeepDiff", (), {})}, + "peewee": { + "Model": type("Model", (), {}), + "SqliteDatabase": _SqliteDatabase, + "CharField": _field_type("CharField"), + "TextField": _field_type("TextField"), + "IntegerField": _field_type("IntegerField"), + "DateTimeField": _field_type("DateTimeField"), + "AutoField": _field_type("AutoField"), + "OperationalError": Exception, + }, + "backoff": {}, + "aiohttp": {"ClientSession": type("ClientSession", (), {})}, + "tqdm": {"tqdm": lambda iterable, *_args, **_kwargs: iterable}, + } + + for optional_module, attrs in optional_stub_attrs.items(): + try: + importlib.import_module(optional_module) + except ModuleNotFoundError: + _ensure_module(optional_module, **attrs) + + try: + importlib.import_module("litellm") + except ModuleNotFoundError: + litellm_mod = types.ModuleType("litellm") + + def _acompletion(*_args, **_kwargs): # pragma: no cover - stubbed async function + return None + + def _completion_cost(*_args, **_kwargs): # pragma: no cover - cost shim + return 0.0 + + litellm_mod.acompletion = _acompletion + litellm_mod.completion = _acompletion + litellm_mod.completion_cost = _completion_cost + + caching_pkg = types.ModuleType("litellm.caching") + caching_submodule = types.ModuleType("litellm.caching.caching") + caching_submodule.Cache = type("Cache", (), {}) + dual_cache_module = types.ModuleType("litellm.caching.dual_cache") + dual_cache_module.DualCache = type("DualCache", (), {}) + in_memory_cache_module = types.ModuleType("litellm.caching.in_memory_cache") + in_memory_cache_module.InMemoryCache = type("InMemoryCache", (), {}) + caching_pkg.caching = caching_submodule + caching_pkg.dual_cache = dual_cache_module + caching_pkg.in_memory_cache = in_memory_cache_module + redis_cache_module = types.ModuleType("litellm.caching.redis_cache") + redis_cache_module.RedisCache = type("RedisCache", (), {}) + caching_pkg.redis_cache = redis_cache_module + + litellm_mod.caching = caching_pkg + + main_module = types.ModuleType("litellm.main") + main_module.ModelResponse = type("ModelResponse", (), {}) + main_module.Usage = type("Usage", (), {}) + + cost_calculator_mod = types.ModuleType("litellm.cost_calculator") + cost_calculator_mod.cost_per_token = lambda *_args, **_kwargs: 0.0 + + sys.modules["litellm"] = litellm_mod + sys.modules["litellm.caching"] = caching_pkg + sys.modules["litellm.caching.caching"] = caching_submodule + sys.modules["litellm.caching.dual_cache"] = dual_cache_module + sys.modules["litellm.caching.in_memory_cache"] = in_memory_cache_module + sys.modules["litellm.caching.redis_cache"] = redis_cache_module + sys.modules["litellm.main"] = main_module + sys.modules["litellm.cost_calculator"] = cost_calculator_mod + + try: + importlib.import_module("playhouse.sqlite_ext") + except ModuleNotFoundError: + playhouse_mod = types.ModuleType("playhouse") + sqlite_ext_mod = types.ModuleType("playhouse.sqlite_ext") + sqlite_ext_mod.JSONField = type("JSONField", (), {}) + playhouse_mod.sqlite_ext = sqlite_ext_mod + + sys.modules["playhouse"] = playhouse_mod + sys.modules["playhouse.sqlite_ext"] = sqlite_ext_mod + + try: + importlib.import_module("openai") + return + except ModuleNotFoundError: + pass + + openai_mod = types.ModuleType("openai") + types_mod = types.ModuleType("openai.types") + completion_usage_mod = types.ModuleType("openai.types.completion_usage") + chat_mod = types.ModuleType("openai.types.chat") + chat_message_mod = types.ModuleType("openai.types.chat.chat_completion_message") + chat_message_param_mod = types.ModuleType("openai.types.chat.chat_completion_message_param") + tool_call_mod = types.ModuleType("openai.types.chat.chat_completion_message_tool_call") + + class CompletionUsage(BaseModel): # pragma: no cover - simple data container + prompt_tokens: int | None = None + completion_tokens: int | None = None + total_tokens: int | None = None + + model_config = ConfigDict(extra="allow") + + class FunctionCall(BaseModel): # pragma: no cover - simple data container + name: str | None = None + arguments: str | None = None + + model_config = ConfigDict(extra="allow") + + class FunctionDefinition(BaseModel): # pragma: no cover - simple data container + name: str | None = None + description: str | None = None + parameters: dict[str, Any] | None = None + + model_config = ConfigDict(extra="allow") + + class ChatCompletionContentPartTextParam(BaseModel): # pragma: no cover - simple data container + text: str | None = None + type: str = "text" + + model_config = ConfigDict(extra="allow") + + class ChatCompletionMessageToolCall(BaseModel): # pragma: no cover - simple data container + id: str | None = None + type: str | None = None + function: FunctionCall | None = None + + model_config = ConfigDict(extra="allow") + + class ChatCompletionMessageParam(BaseModel): # pragma: no cover - simple data container + content: str | None = None + role: str | None = None + + model_config = ConfigDict(extra="allow") + + class _NotGiven: # pragma: no cover - sentinel placeholder + pass + + types_mod.CompletionUsage = CompletionUsage + completion_usage_mod.CompletionUsage = CompletionUsage + chat_message_mod.FunctionCall = FunctionCall + chat_message_param_mod.ChatCompletionMessageParam = ChatCompletionMessageParam + tool_call_mod.ChatCompletionMessageToolCall = ChatCompletionMessageToolCall + chat_mod.ChatCompletionContentPartTextParam = ChatCompletionContentPartTextParam + types_mod.FunctionDefinition = FunctionDefinition + + openai_mod.__spec__ = ModuleSpec("openai", loader=None) + types_mod.__spec__ = ModuleSpec("openai.types", loader=None) + completion_usage_mod.__spec__ = ModuleSpec("openai.types.completion_usage", loader=None) + chat_mod.__spec__ = ModuleSpec("openai.types.chat", loader=None) + chat_message_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message", loader=None) + chat_message_param_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_param", loader=None) + tool_call_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_tool_call", loader=None) + + openai_mod.types = types_mod + openai_mod.NotGiven = _NotGiven + openai_mod.NOT_GIVEN = _NotGiven() + types_mod.completion_usage = completion_usage_mod + types_mod.chat = chat_mod + chat_mod.chat_completion_message = chat_message_mod + chat_mod.chat_completion_message_tool_call = tool_call_mod + chat_mod.chat_completion_message_param = chat_message_param_mod + + sys.modules["openai"] = openai_mod + sys.modules["openai.types"] = types_mod + sys.modules["openai.types.completion_usage"] = completion_usage_mod + sys.modules["openai.types.chat"] = chat_mod + sys.modules["openai.types.chat.chat_completion_message"] = chat_message_mod + sys.modules["openai.types.chat.chat_completion_message_tool_call"] = tool_call_mod + sys.modules["openai.types.chat.chat_completion_message_param"] = chat_message_param_mod + + +_install_dependency_stubs() + +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows +from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs + + +def _make_kwargs(**overrides) -> ParameterizedTestKwargs: + base: ParameterizedTestKwargs = { + "dataset_path": None, + "completion_params": None, + "input_messages": None, + "input_rows": None, + "evaluation_test_kwargs": None, + } + base.update(overrides) + return cast(ParameterizedTestKwargs, base) + + +def test_load_and_prepare_rows_from_dataset(monkeypatch): + dataset_contents = { + "file1": [{"text": "f1a"}, {"text": "f1b"}], + "file2": [{"text": "f2a"}, {"text": "f2b"}], + } + load_calls: list[str] = [] + + def fake_load_jsonl(path: str): + load_calls.append(path) + return dataset_contents[path] + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.load_jsonl", fake_load_jsonl) + + generated_args: list[dict[str, int | None]] = [] + + def fake_generate_id(separator: str = "-", seed: int | None = None, index: int | None = None) -> str: + generated_args.append({"seed": seed, "index": index}) + return f"id-{index}" + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fake_generate_id) + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.num_combinations", lambda: 10) + + adapter_inputs: list[list[dict[str, str]]] = [] + + def dataset_adapter(data): + adapter_inputs.append(list(data)) + rows: list[EvaluationRow] = [] + for entry in data: + rows.append(EvaluationRow(messages=[Message(role="user", content=entry["text"])])) + return rows + + preprocess_calls: list[list[EvaluationRow]] = [] + + def preprocess(rows: list[EvaluationRow]) -> list[EvaluationRow]: + preprocess_calls.append(list(rows)) + return rows + + kwargs = _make_kwargs(dataset_path=["file1", "file2"]) + + result = load_and_prepare_rows( + kwargs, + dataset_adapter=dataset_adapter, + preprocess_fn=preprocess, + max_dataset_rows=3, + ) + + assert load_calls == ["file1", "file2"], "Expected to load all dataset paths" + assert len(adapter_inputs) == 1 + assert len(adapter_inputs[0]) == 3, "max_dataset_rows should truncate concatenated data" + assert preprocess_calls and preprocess_calls[0] == result + assert all(row.input_metadata.row_id is not None for row in result) + assert len(generated_args) == len(result) + assert all(call["seed"] == 0 for call in generated_args) + assert all(0 <= call["index"] < 10 for call in generated_args if call["index"] is not None) + + +def test_load_and_prepare_rows_from_messages(monkeypatch): + generated_indices: list[int | None] = [] + + def fake_generate_id(separator: str = "-", seed: int | None = None, index: int | None = None) -> str: + generated_indices.append(index) + return f"row-{index}" + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fake_generate_id) + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.num_combinations", lambda: 8) + + kwargs = _make_kwargs( + input_messages=[ + [Message(role="system", content="system")], + [Message(role="user", content="question")], + ] + ) + + result = load_and_prepare_rows( + kwargs, + dataset_adapter=lambda data: pytest.fail("dataset_adapter should not be used"), + preprocess_fn=None, + max_dataset_rows=None, + ) + + assert [row.messages for row in result] == kwargs["input_messages"] + assert generated_indices and all(index is not None for index in generated_indices) + + +def test_load_and_prepare_rows_deep_copies_input_rows(monkeypatch): + def fail_generate_id(*_args, **_kwargs): # pragma: no cover - should never be called + raise AssertionError("generate_id should not be called when row_id already exists") + + monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fail_generate_id) + + original = EvaluationRow(messages=[Message(role="user", content="hi")]) + original.input_metadata.row_id = "existing-id" + + kwargs = _make_kwargs(input_rows=[original]) + + result = load_and_prepare_rows( + kwargs, + dataset_adapter=lambda data: pytest.fail("dataset_adapter should not be used"), + preprocess_fn=None, + max_dataset_rows=None, + ) + + assert len(result) == 1 + assert result[0] is not original + assert result[0].input_metadata.row_id == "existing-id" + + result[0].messages[0].content = "changed" + assert original.messages[0].content == "hi", "Deep copy should isolate message objects"