diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
index e3f3144a..b157662e 100644
--- a/eval_protocol/adapters/langfuse.py
+++ b/eval_protocol/adapters/langfuse.py
@@ -9,7 +9,7 @@
 import random
 import time
 from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Protocol
+from typing import Any, Dict, Iterator, List, Optional, Callable, TYPE_CHECKING, cast, Protocol
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
 
@@ -49,9 +49,14 @@ def __call__(
     from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails
 
     LANGFUSE_AVAILABLE = True
-except ImportError:
+except ImportError:  # pragma: no cover - optional dependency
     LANGFUSE_AVAILABLE = False
 
+if TYPE_CHECKING:  # pragma: no cover - import is optional at runtime
+    from langfuse.client import Langfuse as _LangfuseClient  # type: ignore[import-not-found]
+else:
+    _LangfuseClient = Any
+
 
 def convert_trace_to_evaluation_row(
     trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
@@ -296,7 +301,8 @@ def __init__(self):
         if not LANGFUSE_AVAILABLE:
             raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")
 
-        self.client = get_client()
+        client_factory = cast(Callable[[], _LangfuseClient], get_client)
+        self.client = client_factory()
 
     def get_evaluation_rows(
         self,
diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py
index 1d29b66a..a47d854f 100644
--- a/eval_protocol/adapters/langsmith.py
+++ b/eval_protocol/adapters/langsmith.py
@@ -10,18 +10,23 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Dict, List, Optional, Iterable
+from typing import Any, Dict, List, Optional, Iterable, Callable, TYPE_CHECKING, cast
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
 
 logger = logging.getLogger(__name__)
 
 try:
-    from langsmith import Client  # type: ignore
+    from langsmith import Client as _RuntimeClient  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover - optional dependency
+    _RuntimeClient = None
 
-    LANGSMITH_AVAILABLE = True
-except ImportError:
-    LANGSMITH_AVAILABLE = False
+if TYPE_CHECKING:  # pragma: no cover - import is optional at runtime
+    from langsmith import Client as LangSmithClient  # type: ignore[import-not-found]
+else:
+    LangSmithClient = Any
+
+LANGSMITH_AVAILABLE = _RuntimeClient is not None
 
 
 class LangSmithAdapter:
@@ -34,10 +39,11 @@ class LangSmithAdapter:
     - outputs: { messages: [...] } | { content } | { result } | { answer } | { output } | str | list[dict]
     """
 
-    def __init__(self, client: Optional[Client] = None) -> None:
+    def __init__(self, client: Optional["LangSmithClient"] = None) -> None:
         if not LANGSMITH_AVAILABLE:
             raise ImportError("LangSmith not installed. Install with: pip install 'eval-protocol[langsmith]'")
-        self.client = client or Client()
+        runtime_client = cast(Callable[[], "LangSmithClient"], _RuntimeClient)
+        self.client = client or runtime_client()
 
     def get_evaluation_rows(
         self,
diff --git a/eval_protocol/pytest/dataset_preparation.py b/eval_protocol/pytest/dataset_preparation.py
new file mode 100644
index 00000000..ee75c15e
--- /dev/null
+++ b/eval_protocol/pytest/dataset_preparation.py
@@ -0,0 +1,59 @@
+"""Utilities for preparing datasets for evaluation tests."""
+
+from collections.abc import Callable
+from typing import Any
+
+from eval_protocol.human_id import generate_id, num_combinations
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs
+from eval_protocol.pytest.types import Dataset
+
+from ..common_utils import load_jsonl
+
+
+def load_and_prepare_rows(
+    kwargs: ParameterizedTestKwargs,
+    *,
+    dataset_adapter: Callable[[list[dict[str, Any]]], Dataset],
+    preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None,
+    max_dataset_rows: int | None,
+) -> list[EvaluationRow]:
+    """Load and preprocess evaluation rows based on parameterized pytest kwargs.
+
+    This helper consolidates the logic that loads input data from various sources
+    (dataset paths, raw messages, or pre-built :class:`EvaluationRow` objects),
+    applies optional preprocessing, and ensures each row has a stable
+    ``row_id``. The behavior mirrors the original inline implementation inside
+    :func:`eval_protocol.pytest.evaluation_test.evaluation_test`.
+    """
+
+    data: list[EvaluationRow] = []
+
+    if kwargs.get("dataset_path") is not None:
+        ds_arg = kwargs["dataset_path"]
+        data_jsonl: list[dict[str, Any]] = []
+        for path in ds_arg or []:
+            data_jsonl.extend(load_jsonl(path))
+        if max_dataset_rows is not None:
+            data_jsonl = data_jsonl[:max_dataset_rows]
+        data = dataset_adapter(data_jsonl)
+    elif kwargs.get("input_messages") is not None:
+        input_messages = kwargs["input_messages"] or []
+        data = [EvaluationRow(messages=dataset_messages) for dataset_messages in input_messages]
+    elif kwargs.get("input_rows") is not None:
+        input_rows = kwargs["input_rows"] or []
+        data = [row.model_copy(deep=True) for row in input_rows]
+    else:
+        raise ValueError("No input dataset, input messages, or input rows provided")
+
+    if preprocess_fn:
+        data = preprocess_fn(data)
+
+    for row in data:
+        if row.input_metadata.row_id is None:
+            index = hash(row)
+            max_index = num_combinations() - 1
+            index = abs(index) % (max_index + 1)
+            row.input_metadata.row_id = generate_id(seed=0, index=index)
+
+    return data
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index a7ec65f3..aa160f1b 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -23,6 +23,7 @@
     Status,
 )
 from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
+from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows
 from eval_protocol.pytest.evaluation_test_postprocess import postprocess
 from eval_protocol.pytest.execution import execute_pytest
 from eval_protocol.pytest.generate_parameter_combinations import (
@@ -60,8 +61,6 @@
     rollout_processor_with_retry,
 )
 
-from ..common_utils import load_jsonl
-
 
 def evaluation_test(
     *,
@@ -223,43 +222,14 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                     log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger)
 
                 try:
-                    # Handle dataset loading
-                    data: list[EvaluationRow] = []
                     # Track all rows processed in the current run for error logging
                     processed_rows_in_run: list[EvaluationRow] = []
-                    if "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
-                        ds_arg: list[str] = kwargs["dataset_path"]
-                        # Support either a single path or a list of paths; if a list is provided,
-                        # concatenate the rows from each file in order.
-                        data_jsonl: list[dict[str, object]] = []
-                        for p in ds_arg:
-                            data_jsonl.extend(load_jsonl(p))
-                        # Apply override for max rows if present
-                        if max_dataset_rows is not None:
-                            data_jsonl = data_jsonl[:max_dataset_rows]
-                        data = dataset_adapter(data_jsonl)
-                    elif "input_messages" in kwargs and kwargs["input_messages"] is not None:
-                        # Support either a single row (List[Message]) or many rows (List[List[Message]])
-                        im = kwargs["input_messages"]
-                        data = [EvaluationRow(messages=dataset_messages) for dataset_messages in im]
-                    elif "input_rows" in kwargs and kwargs["input_rows"] is not None:
-                        # Deep copy pre-constructed EvaluationRow objects
-                        data = [row.model_copy(deep=True) for row in kwargs["input_rows"]]
-                    else:
-                        raise ValueError("No input dataset, input messages, or input rows provided")
-
-                    if preprocess_fn:
-                        data = preprocess_fn(data)
-
-                    for row in data:
-                        # generate a stable row_id for each row
-                        if row.input_metadata.row_id is None:
-                            # Generate a stable, deterministic row_id using the row's hash and num_combinations
-                            index = hash(row)
-                            max_index = num_combinations() - 1
-                            # Ensure index is a non-negative integer within [0, max_index]
-                            index = abs(index) % (max_index + 1)
-                            row.input_metadata.row_id = generate_id(seed=0, index=index)
+                    data = load_and_prepare_rows(
+                        kwargs,
+                        dataset_adapter=dataset_adapter,
+                        preprocess_fn=preprocess_fn,
+                        max_dataset_rows=max_dataset_rows,
+                    )
 
                     completion_params = kwargs["completion_params"]
                     # Create eval metadata with test function info and current commit hash
diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py
index f4efb7f5..1ffe2353 100644
--- a/eval_protocol/quickstart/llm_judge_langsmith.py
+++ b/eval_protocol/quickstart/llm_judge_langsmith.py
@@ -19,11 +19,14 @@
   pytest python-sdk/eval_protocol/quickstart/llm_judge_langsmith.py -q -s
 """
 
+import asyncio
 import os
 from typing import Any, Dict, List, Optional
 
 import pytest
 
+from openai import AsyncOpenAI
+
 from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
@@ -31,7 +34,7 @@
     split_multi_turn_rows,
     JUDGE_CONFIGS,
     calculate_bootstrap_scores,
-    run_judgment,
+    run_judgment_async,
 )
 from eval_protocol.adapters.langsmith import LangSmithAdapter
 
@@ -91,10 +94,23 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation
 
     judgments: List[Dict[str, Any]] = []
 
-    for row in rows:
-        result = run_judgment(row, model_name, judge_name)
-        if result and result["games"][0] and result["games"][1]:
-            judgments.append(result)
+    judge_config = JUDGE_CONFIGS[judge_name]
+
+    async with AsyncOpenAI(
+        api_key=judge_config.get("api_key"),
+        base_url=judge_config.get("base_url"),
+    ) as shared_client:
+        semaphore = asyncio.Semaphore(judge_config.get("max_concurrency", 8))
+
+        async def _run_judgment(row: EvaluationRow) -> Optional[Dict[str, Any]]:
+            async with semaphore:
+                return await run_judgment_async(row, model_name, judge_name, shared_client)
+
+        tasks = [_run_judgment(row) for row in rows]
+        for coro in asyncio.as_completed(tasks):
+            result = await coro
+            if result and result["games"][0] and result["games"][1]:
+                judgments.append(result)
 
     if not judgments:
         print("❌ No valid judgments generated")
@@ -102,11 +118,13 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation
 
     print(f"✅ Generated {len(judgments)} valid judgments")
 
-    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
-    if mean_score == 0.0:
+    bootstrap_result = calculate_bootstrap_scores(judgments)
+    if not bootstrap_result:
         print("❌ No valid scores extracted")
         return rows
 
+    mean_score, lower_score, upper_score = bootstrap_result
+
     print("\n##### LLM Judge Results (90th percentile CI) #####")
     clean_model_name = model_name.split("/")[-1]
     print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
diff --git a/tests/pytest/test_dataset_preparation.py b/tests/pytest/test_dataset_preparation.py
new file mode 100644
index 00000000..cbf786a8
--- /dev/null
+++ b/tests/pytest/test_dataset_preparation.py
@@ -0,0 +1,375 @@
+from __future__ import annotations
+
+import importlib
+from importlib.machinery import ModuleSpec
+import sys
+import types
+from typing import cast
+
+import pytest
+from pydantic import BaseModel, ConfigDict
+
+
+def _install_dependency_stubs() -> None:
+    """Register lightweight stubs for optional runtime dependencies."""
+
+    def _ensure_module(name: str, **attrs) -> None:
+        if name in sys.modules:
+            return
+        module = types.ModuleType(name)
+        for key, value in attrs.items():
+            setattr(module, key, value)
+        sys.modules[name] = module
+
+    try:  # pragma: no cover - prefer real dependency when available
+        importlib.import_module("loguru")
+    except ModuleNotFoundError:
+
+        class _Logger:  # pragma: no cover - inert logging shim
+            def __getattr__(self, _name: str):
+                def _noop(*_args, **_kwargs):
+                    return None
+
+                return _noop
+
+        _ensure_module("loguru", logger=_Logger())
+
+    def _noop_loader(*_args, **_kwargs):  # pragma: no cover - placeholder loader
+        return {}
+
+    def _field_type(name: str):
+        def __init__(self, *_args, **_kwargs):
+            return None
+
+        return type(name, (), {"__init__": __init__})
+
+    class _SqliteDatabase:
+        def __init__(self, *_args, **_kwargs):
+            self.path = None
+
+        def connect(self):  # pragma: no cover - stub connection
+            return None
+
+        def close(self):  # pragma: no cover
+            return None
+
+        def atomic(self):  # pragma: no cover - context manager shim
+            class _Atomic:
+                def __enter__(self_inner):
+                    return self_inner
+
+                def __exit__(self_inner, *_exc):
+                    return False
+
+            return _Atomic()
+
+        def create_tables(self, *_args, **_kwargs):  # pragma: no cover
+            return None
+
+        def create_table(self, *_args, **_kwargs):  # pragma: no cover
+            return None
+
+        def drop_tables(self, *_args, **_kwargs):  # pragma: no cover
+            return None
+
+    optional_stub_attrs = {
+        "toml": {"loads": _noop_loader, "load": _noop_loader},
+        "datasets": {},
+        "addict": {"Dict": dict},
+        "deepdiff": {"DeepDiff": type("DeepDiff", (), {})},
+        "peewee": {
+            "Model": type("Model", (), {}),
+            "SqliteDatabase": _SqliteDatabase,
+            "CharField": _field_type("CharField"),
+            "TextField": _field_type("TextField"),
+            "IntegerField": _field_type("IntegerField"),
+            "DateTimeField": _field_type("DateTimeField"),
+            "AutoField": _field_type("AutoField"),
+            "OperationalError": Exception,
+        },
+        "backoff": {},
+        "aiohttp": {"ClientSession": type("ClientSession", (), {})},
+        "tqdm": {"tqdm": lambda iterable, *_args, **_kwargs: iterable},
+    }
+
+    for optional_module, attrs in optional_stub_attrs.items():
+        try:
+            importlib.import_module(optional_module)
+        except ModuleNotFoundError:
+            _ensure_module(optional_module, **attrs)
+
+    try:
+        importlib.import_module("litellm")
+    except ModuleNotFoundError:
+        litellm_mod = types.ModuleType("litellm")
+
+        def _acompletion(*_args, **_kwargs):  # pragma: no cover - stubbed async function
+            return None
+
+        def _completion_cost(*_args, **_kwargs):  # pragma: no cover - cost shim
+            return 0.0
+
+        litellm_mod.acompletion = _acompletion
+        litellm_mod.completion = _acompletion
+        litellm_mod.completion_cost = _completion_cost
+
+        caching_pkg = types.ModuleType("litellm.caching")
+        caching_submodule = types.ModuleType("litellm.caching.caching")
+        caching_submodule.Cache = type("Cache", (), {})
+        dual_cache_module = types.ModuleType("litellm.caching.dual_cache")
+        dual_cache_module.DualCache = type("DualCache", (), {})
+        in_memory_cache_module = types.ModuleType("litellm.caching.in_memory_cache")
+        in_memory_cache_module.InMemoryCache = type("InMemoryCache", (), {})
+        caching_pkg.caching = caching_submodule
+        caching_pkg.dual_cache = dual_cache_module
+        caching_pkg.in_memory_cache = in_memory_cache_module
+        redis_cache_module = types.ModuleType("litellm.caching.redis_cache")
+        redis_cache_module.RedisCache = type("RedisCache", (), {})
+        caching_pkg.redis_cache = redis_cache_module
+
+        litellm_mod.caching = caching_pkg
+
+        main_module = types.ModuleType("litellm.main")
+        main_module.ModelResponse = type("ModelResponse", (), {})
+        main_module.Usage = type("Usage", (), {})
+
+        cost_calculator_mod = types.ModuleType("litellm.cost_calculator")
+        cost_calculator_mod.cost_per_token = lambda *_args, **_kwargs: 0.0
+
+        sys.modules["litellm"] = litellm_mod
+        sys.modules["litellm.caching"] = caching_pkg
+        sys.modules["litellm.caching.caching"] = caching_submodule
+        sys.modules["litellm.caching.dual_cache"] = dual_cache_module
+        sys.modules["litellm.caching.in_memory_cache"] = in_memory_cache_module
+        sys.modules["litellm.caching.redis_cache"] = redis_cache_module
+        sys.modules["litellm.main"] = main_module
+        sys.modules["litellm.cost_calculator"] = cost_calculator_mod
+
+    try:
+        importlib.import_module("playhouse.sqlite_ext")
+    except ModuleNotFoundError:
+        playhouse_mod = types.ModuleType("playhouse")
+        sqlite_ext_mod = types.ModuleType("playhouse.sqlite_ext")
+        sqlite_ext_mod.JSONField = type("JSONField", (), {})
+        playhouse_mod.sqlite_ext = sqlite_ext_mod
+
+        sys.modules["playhouse"] = playhouse_mod
+        sys.modules["playhouse.sqlite_ext"] = sqlite_ext_mod
+
+    try:
+        importlib.import_module("openai")
+        return
+    except ModuleNotFoundError:
+        pass
+
+    openai_mod = types.ModuleType("openai")
+    types_mod = types.ModuleType("openai.types")
+    completion_usage_mod = types.ModuleType("openai.types.completion_usage")
+    chat_mod = types.ModuleType("openai.types.chat")
+    chat_message_mod = types.ModuleType("openai.types.chat.chat_completion_message")
+    chat_message_param_mod = types.ModuleType("openai.types.chat.chat_completion_message_param")
+    tool_call_mod = types.ModuleType("openai.types.chat.chat_completion_message_tool_call")
+
+    class CompletionUsage(BaseModel):  # pragma: no cover - simple data container
+        prompt_tokens: int | None = None
+        completion_tokens: int | None = None
+        total_tokens: int | None = None
+
+        model_config = ConfigDict(extra="allow")
+
+    class FunctionCall(BaseModel):  # pragma: no cover - simple data container
+        name: str | None = None
+        arguments: str | None = None
+
+        model_config = ConfigDict(extra="allow")
+
+    class FunctionDefinition(BaseModel):  # pragma: no cover - simple data container
+        name: str | None = None
+        description: str | None = None
+        parameters: dict[str, Any] | None = None
+
+        model_config = ConfigDict(extra="allow")
+
+    class ChatCompletionContentPartTextParam(BaseModel):  # pragma: no cover - simple data container
+        text: str | None = None
+        type: str = "text"
+
+        model_config = ConfigDict(extra="allow")
+
+    class ChatCompletionMessageToolCall(BaseModel):  # pragma: no cover - simple data container
+        id: str | None = None
+        type: str | None = None
+        function: FunctionCall | None = None
+
+        model_config = ConfigDict(extra="allow")
+
+    class ChatCompletionMessageParam(BaseModel):  # pragma: no cover - simple data container
+        content: str | None = None
+        role: str | None = None
+
+        model_config = ConfigDict(extra="allow")
+
+    class _NotGiven:  # pragma: no cover - sentinel placeholder
+        pass
+
+    types_mod.CompletionUsage = CompletionUsage
+    completion_usage_mod.CompletionUsage = CompletionUsage
+    chat_message_mod.FunctionCall = FunctionCall
+    chat_message_param_mod.ChatCompletionMessageParam = ChatCompletionMessageParam
+    tool_call_mod.ChatCompletionMessageToolCall = ChatCompletionMessageToolCall
+    chat_mod.ChatCompletionContentPartTextParam = ChatCompletionContentPartTextParam
+    types_mod.FunctionDefinition = FunctionDefinition
+
+    openai_mod.__spec__ = ModuleSpec("openai", loader=None)
+    types_mod.__spec__ = ModuleSpec("openai.types", loader=None)
+    completion_usage_mod.__spec__ = ModuleSpec("openai.types.completion_usage", loader=None)
+    chat_mod.__spec__ = ModuleSpec("openai.types.chat", loader=None)
+    chat_message_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message", loader=None)
+    chat_message_param_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_param", loader=None)
+    tool_call_mod.__spec__ = ModuleSpec("openai.types.chat.chat_completion_message_tool_call", loader=None)
+
+    openai_mod.types = types_mod
+    openai_mod.NotGiven = _NotGiven
+    openai_mod.NOT_GIVEN = _NotGiven()
+    types_mod.completion_usage = completion_usage_mod
+    types_mod.chat = chat_mod
+    chat_mod.chat_completion_message = chat_message_mod
+    chat_mod.chat_completion_message_tool_call = tool_call_mod
+    chat_mod.chat_completion_message_param = chat_message_param_mod
+
+    sys.modules["openai"] = openai_mod
+    sys.modules["openai.types"] = types_mod
+    sys.modules["openai.types.completion_usage"] = completion_usage_mod
+    sys.modules["openai.types.chat"] = chat_mod
+    sys.modules["openai.types.chat.chat_completion_message"] = chat_message_mod
+    sys.modules["openai.types.chat.chat_completion_message_tool_call"] = tool_call_mod
+    sys.modules["openai.types.chat.chat_completion_message_param"] = chat_message_param_mod
+
+
+_install_dependency_stubs()
+
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows
+from eval_protocol.pytest.generate_parameter_combinations import ParameterizedTestKwargs
+
+
+def _make_kwargs(**overrides) -> ParameterizedTestKwargs:
+    base: ParameterizedTestKwargs = {
+        "dataset_path": None,
+        "completion_params": None,
+        "input_messages": None,
+        "input_rows": None,
+        "evaluation_test_kwargs": None,
+    }
+    base.update(overrides)
+    return cast(ParameterizedTestKwargs, base)
+
+
+def test_load_and_prepare_rows_from_dataset(monkeypatch):
+    dataset_contents = {
+        "file1": [{"text": "f1a"}, {"text": "f1b"}],
+        "file2": [{"text": "f2a"}, {"text": "f2b"}],
+    }
+    load_calls: list[str] = []
+
+    def fake_load_jsonl(path: str):
+        load_calls.append(path)
+        return dataset_contents[path]
+
+    monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.load_jsonl", fake_load_jsonl)
+
+    generated_args: list[dict[str, int | None]] = []
+
+    def fake_generate_id(separator: str = "-", seed: int | None = None, index: int | None = None) -> str:
+        generated_args.append({"seed": seed, "index": index})
+        return f"id-{index}"
+
+    monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fake_generate_id)
+    monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.num_combinations", lambda: 10)
+
+    adapter_inputs: list[list[dict[str, str]]] = []
+
+    def dataset_adapter(data):
+        adapter_inputs.append(list(data))
+        rows: list[EvaluationRow] = []
+        for entry in data:
+            rows.append(EvaluationRow(messages=[Message(role="user", content=entry["text"])]))
+        return rows
+
+    preprocess_calls: list[list[EvaluationRow]] = []
+
+    def preprocess(rows: list[EvaluationRow]) -> list[EvaluationRow]:
+        preprocess_calls.append(list(rows))
+        return rows
+
+    kwargs = _make_kwargs(dataset_path=["file1", "file2"])
+
+    result = load_and_prepare_rows(
+        kwargs,
+        dataset_adapter=dataset_adapter,
+        preprocess_fn=preprocess,
+        max_dataset_rows=3,
+    )
+
+    assert load_calls == ["file1", "file2"], "Expected to load all dataset paths"
+    assert len(adapter_inputs) == 1
+    assert len(adapter_inputs[0]) == 3, "max_dataset_rows should truncate concatenated data"
+    assert preprocess_calls and preprocess_calls[0] == result
+    assert all(row.input_metadata.row_id is not None for row in result)
+    assert len(generated_args) == len(result)
+    assert all(call["seed"] == 0 for call in generated_args)
+    assert all(0 <= call["index"] < 10 for call in generated_args if call["index"] is not None)
+
+
+def test_load_and_prepare_rows_from_messages(monkeypatch):
+    generated_indices: list[int | None] = []
+
+    def fake_generate_id(separator: str = "-", seed: int | None = None, index: int | None = None) -> str:
+        generated_indices.append(index)
+        return f"row-{index}"
+
+    monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fake_generate_id)
+    monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.num_combinations", lambda: 8)
+
+    kwargs = _make_kwargs(
+        input_messages=[
+            [Message(role="system", content="system")],
+            [Message(role="user", content="question")],
+        ]
+    )
+
+    result = load_and_prepare_rows(
+        kwargs,
+        dataset_adapter=lambda data: pytest.fail("dataset_adapter should not be used"),
+        preprocess_fn=None,
+        max_dataset_rows=None,
+    )
+
+    assert [row.messages for row in result] == kwargs["input_messages"]
+    assert generated_indices and all(index is not None for index in generated_indices)
+
+
+def test_load_and_prepare_rows_deep_copies_input_rows(monkeypatch):
+    def fail_generate_id(*_args, **_kwargs):  # pragma: no cover - should never be called
+        raise AssertionError("generate_id should not be called when row_id already exists")
+
+    monkeypatch.setattr("eval_protocol.pytest.dataset_preparation.generate_id", fail_generate_id)
+
+    original = EvaluationRow(messages=[Message(role="user", content="hi")])
+    original.input_metadata.row_id = "existing-id"
+
+    kwargs = _make_kwargs(input_rows=[original])
+
+    result = load_and_prepare_rows(
+        kwargs,
+        dataset_adapter=lambda data: pytest.fail("dataset_adapter should not be used"),
+        preprocess_fn=None,
+        max_dataset_rows=None,
+    )
+
+    assert len(result) == 1
+    assert result[0] is not original
+    assert result[0].input_metadata.row_id == "existing-id"
+
+    result[0].messages[0].content = "changed"
+    assert original.messages[0].content == "hi", "Deep copy should isolate message objects"