save

Dylan Huang · Dylan Huang · commit 68eea886d36c · 2025-09-23T16:52:38.000-07:00
diff --git a/eval_protocol/data_loader/factory_data_loader.py b/eval_protocol/data_loader/factory_data_loader.py
@@ -0,0 +1,59 @@
+from collections.abc import Callable, Sequence
+
+from eval_protocol.data_loader.models import (
+    DataLoaderContext,
+    DataLoaderResult,
+    DataLoaderVariant,
+    EvaluationDataLoader,
+)
+from eval_protocol.models import EvaluationRow
+
+
+class FactoryDataLoader(EvaluationDataLoader):
+    """Data loader that builds its rows by calling zero-argument factory callables."""
+
+    description: str | None = None
+    """Optional human-readable description of this data loader. Note that ``variants()``
+    does not read this field: each variant's description comes from its factory's
+    ``__doc__``."""
+
+    factory: Sequence[Callable[[], list[EvaluationRow]]]
+    """Factory callables that generate evaluation rows dynamically. Each callable takes
+    no arguments, returns a list of EvaluationRow objects, and is exposed as its own
+    variant by ``variants()``. A factory is invoked each time its variant is loaded,
+    allowing for dynamic data generation, lazy loading, or data that changes between
+    evaluation runs."""
+
+    def variants(self) -> Sequence[DataLoaderVariant]:
+        """Return one variant per factory, in the order the factories are listed.
+
+        Each variant takes its id from the factory's ``__name__`` and its description
+        from the factory's ``__doc__``.
+        """
+        variants: list[DataLoaderVariant] = []
+        for factory in self.factory:
+
+            # Bind the current factory as a default argument. A plain closure looks up
+            # ``factory`` when the loader runs, so every variant would call the last one.
+            def _load(
+                ctx: DataLoaderContext,
+                factory: Callable[[], list[EvaluationRow]] = factory,
+            ) -> DataLoaderResult:
+                resolved_rows = factory()
+                return DataLoaderResult(
+                    rows=resolved_rows,
+                    num_rows=len(resolved_rows),
+                    type="factory",
+                    variant_id=ctx.variant_id,
+                    variant_description=ctx.variant_description,
+                )
+
+            variants.append(
+                DataLoaderVariant(
+                    id=factory.__name__,
+                    description=factory.__doc__,
+                    loader=_load,
+                )
+            )
+
+        return variants
diff --git a/eval_protocol/data_loader/inline_data_loader.py b/eval_protocol/data_loader/inline_data_loader.py
@@ -10,13 +10,33 @@
 from eval_protocol.pytest.types import InputMessagesParam
 
 
+DEFAULT_VARIANT_ID: str = "inline"
+
+
 class InlineDataLoader(EvaluationDataLoader):
     """Data loader for inline ``EvaluationRow`` or message payloads."""
 
-    rows: Sequence[EvaluationRow] | None = None
-    messages: Sequence[InputMessagesParam] | None = None
-    variant_id: str = "inline"
+    rows: list[EvaluationRow] | None = None
+    """Pre-defined evaluation rows with tools and metadata. Use this when you have complete
+    EvaluationRow objects that include tools, input_metadata, and other structured data.
+    This is the preferred option when working with tool-calling scenarios or when you need
+    to provide additional metadata like row_id, dataset information, or custom fields."""
+
+    messages: InputMessagesParam | None = None
+    """Raw chat completion message history. Use this when you only have simple
+    conversation history without tools or additional metadata. The messages will be
+    automatically converted to EvaluationRow objects. InputMessagesParam is a list of
+    Message objects representing the conversation flow (user, assistant, system messages)."""
+
+    variant_id: str = DEFAULT_VARIANT_ID
+    """Unique identifier for this data loader variant. Used to label and distinguish
+    different input data sources, versions, or configurations. This helps with tracking
+    and organizing evaluation results from different data sources."""
+
     description: str | None = None
+    """Optional human-readable description of this data loader. Provides additional
+    context about the data source, purpose, or any special characteristics. Used for
+    documentation and debugging purposes. If not provided, the variant_id will be used instead."""
 
     def __post_init__(self) -> None:
         if self.rows is None and self.messages is None:
@@ -26,7 +46,7 @@ def variants(self) -> Sequence[DataLoaderVariant]:
         def _load(ctx: DataLoaderContext) -> DataLoaderResult:
             resolved_rows: list[EvaluationRow] = []
             if self.rows is not None:
-                resolved_rows.extend(row.model_copy(deep=True) for row in self.rows)
+                resolved_rows = [row.model_copy(deep=True) for row in self.rows]
             if self.messages is not None:
                 for dataset_messages in self.messages:
                     row_messages: list[Message] = []
@@ -37,19 +57,11 @@ def _load(ctx: DataLoaderContext) -> DataLoaderResult:
                             row_messages.append(Message.model_validate(msg))
                     resolved_rows.append(EvaluationRow(messages=row_messages))
 
-            if ctx.max_rows is not None:
-                resolved_rows = resolved_rows[: ctx.max_rows]
-
-            metadata = {
-                "data_loader_variant_id": self.variant_id,
-                "data_loader_type": "inline",
-                "row_count": len(resolved_rows),
-            }
-
             return DataLoaderResult(
                 rows=resolved_rows,
-                source_id=self.variant_id,
-                source_metadata=metadata,
+                num_rows=len(resolved_rows),
+                variant_id=ctx.variant_id,
+                type=self.__class__.__name__,
             )
 
         description = self.description or self.variant_id
@@ -58,6 +70,5 @@ def _load(ctx: DataLoaderContext) -> DataLoaderResult:
                 id=self.variant_id,
                 description=description,
                 loader=_load,
-                metadata={"type": "inline"},
             )
         ]
diff --git a/eval_protocol/data_loader/models.py b/eval_protocol/data_loader/models.py
@@ -3,56 +3,129 @@
 from __future__ import annotations
 
 from collections.abc import Sequence
-from typing import Any, Callable
+from typing import Callable
 from typing_extensions import Protocol
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 from eval_protocol.models import EvaluationRow
-from eval_protocol.pytest.types import EvaluationTestMode
-from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
 
 
 class DataLoaderContext(BaseModel):
-    """Context provided to loader variants when materializing data."""
+    """Context provided to loader variants when materializing data. This is mainly used internally by eval-protocol."""
 
-    max_rows: int | None = Field(default=None, ge=1, description="Maximum number of rows to load")
     preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None = Field(
-        default=None, description="Optional preprocessing function for evaluation rows"
+        default=None,
+        description="Optional preprocessing function for evaluation rows. This function is applied "
+        "to the loaded data before it's returned, allowing for data cleaning, transformation, "
+        "filtering, or other modifications. The function receives a list of EvaluationRow objects "
+        "and should return a modified list of EvaluationRow objects.",
+    )
+    variant_id: str = Field(
+        ...,  # required: no default is supplied
+        description="Unique identifier for the data loader variant. Used to distinguish between "
+        "different variants of the same data loader and for tracking purposes in evaluation results.",
+    )
+    variant_description: str | None = Field(
+        default=None,
+        description="Human-readable description of the data loader variant. Provides context about what "
+        "this variant represents, its purpose, or any special characteristics that distinguish "
+        "it from other variants.",
     )
-    logger: DatasetLogger = Field(description="Dataset logger for tracking operations")
-    invocation_id: str = Field(description="Unique identifier for this invocation")
-    experiment_id: str = Field(description="Unique identifier for this experiment")
-    mode: EvaluationTestMode = Field(description="The evaluation test mode")
 
-    class Config:
-        arbitrary_types_allowed = True  # For Callable and DatasetLogger types
+    @field_validator("variant_id")
+    @classmethod
+    def validate_variant_id(cls, v: str) -> str:
+        if not v or not v.strip():  # rejects empty and whitespace-only ids
+            raise ValueError("variant_id must be non-empty")
+        return v  # returned as given; surrounding whitespace is not stripped
 
 
 class DataLoaderResult(BaseModel):
     """Rows and metadata returned by a loader variant."""
 
-    rows: list[EvaluationRow] = Field(description="List of evaluation rows loaded")
-    source_id: str = Field(description="Unique identifier for the data source")
-    source_metadata: dict[str, Any] = Field(
-        default_factory=dict, description="Additional metadata about the data source"
+    rows: list[EvaluationRow] = Field(
+        description="List of evaluation rows loaded from the data source. These are the "
+        "processed and ready-to-use evaluation data that will be fed into the evaluation pipeline."
+    )
+    num_rows: int = Field(  # NOTE: no validator in this class compares this with len(rows)
+        ...,
+        description="Number of rows loaded. This should match the length of the rows list "
+        "and is used for validation and reporting purposes.",
+    )
+    type: str = Field(
+        ...,
+        description="Type of the data loader that produced this result. Used for identification "
+        "and debugging purposes (e.g., 'InlineDataLoader', 'FactoryDataLoader').",
+    )
+    variant_id: str = Field(
+        ...,
+        description="Unique identifier for the data loader variant that produced this result. "
+        "Used for tracking and organizing evaluation results from different data sources.",
     )
-    raw_payload: Any | None = Field(default=None, description="Raw payload data if available")
-    preprocessed: bool = Field(default=False, description="Whether the data has been preprocessed")
 
-    class Config:
-        arbitrary_types_allowed = True  # For Any type in raw_payload
+    variant_description: str | None = Field(
+        default=None,
+        description="Human-readable description of the data loader variant that produced this result. "
+        "Provides context about what this variant represents, its purpose, or any special characteristics that distinguish "
+        "it from other variants.",
+    )
+
+    preprocessed: bool = Field(
+        default=False,
+        description="Whether the data has been preprocessed. This flag indicates if any "
+        "preprocessing functions have been applied to the data, helping to avoid duplicate "
+        "processing and track data transformation state.",
+    )
+
+    @field_validator("type")
+    @classmethod
+    def validate_type(cls, v: str) -> str:
+        if not v or not v.strip():  # rejects empty and whitespace-only values
+            raise ValueError("type must be non-empty")
+        return v
+
+    @field_validator("num_rows")
+    @classmethod
+    def validate_num_rows(cls, v: int) -> int:
+        if v <= 0:  # zero is rejected too, so a result reporting no rows fails validation
+            raise ValueError("num_rows must be greater than 0")
+        return v
+
+    @field_validator("variant_id")
+    @classmethod
+    def validate_variant_id(cls, v: str) -> str:
+        if not v or not v.strip():  # rejects empty and whitespace-only ids
+            raise ValueError("variant_id must be non-empty")
+        return v
 
 
 class DataLoaderVariant(BaseModel):
     """Single parameterizable variant from a data loader."""
 
-    id: str = Field(description="Unique identifier for this variant")
-    description: str = Field(description="Human-readable description of this variant")
+    id: str = Field(
+        description="Unique identifier for this variant. Used to distinguish between different "
+        "variants of the same data loader and for tracking purposes in evaluation results."
+    )
+    description: str | None = Field(
+        default=None,  # the description is optional after this change
+        description="Human-readable description of this variant. Provides context about what "
+        "this variant represents, its purpose, or any special characteristics that distinguish "
+        "it from other variants.",
+    )
     loader: Callable[[DataLoaderContext], DataLoaderResult] = Field(
-        description="Function that loads data for this variant"
+        description="Function that loads data for this variant. This callable is invoked with "
+        "a DataLoaderContext and should return a DataLoaderResult containing the loaded "
+        "evaluation rows and associated metadata. The loader function is responsible for "
+        "the actual data retrieval and any necessary processing."
     )
-    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata for this variant")
+
+    @field_validator("id")
+    @classmethod
+    def validate_id(cls, v: str) -> str:
+        if not v or not v.strip():  # rejects empty and whitespace-only ids
+            raise ValueError("DataLoaderVariant.id must be non-empty")
+        return v  # returned as given; surrounding whitespace is not stripped
 
     class Config:
         arbitrary_types_allowed = True  # For Callable type
@@ -69,3 +142,10 @@ class EvaluationDataLoader(Protocol):
     def variants(self) -> Sequence[DataLoaderVariant]:
         """Return parameterizable variants emitted by this loader."""
         ...
+
+    def load(self, ctx: DataLoaderContext) -> list[DataLoaderResult]:
+        """
+        Load every variant of this data loader and return one DataLoaderResult per variant.
+        """
+        # DataLoaderVariant keeps its callable in the `loader` field; it declares no `load` method.
+        return [variant.loader(ctx) for variant in self.variants()]
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -72,7 +72,7 @@ def evaluation_test(
     input_messages: Sequence[list[InputMessagesParam] | None] | None = None,
     input_dataset: Sequence[DatasetPathParam] | None = None,
     input_rows: Sequence[list[EvaluationRow]] | None = None,
-    input_data_loaders: Sequence[EvaluationDataLoader] | EvaluationDataLoader | None = None,
+    data_loaders: Sequence[EvaluationDataLoader] | EvaluationDataLoader | None = None,
     dataset_adapter: Callable[[list[dict[str, Any]]], Dataset] = default_dataset_adapter,  # pyright: ignore[reportExplicitAny]
     rollout_processor: RolloutProcessor | None = None,
     evaluation_test_kwargs: Sequence[EvaluationInputParam | None] | None = None,
@@ -176,7 +176,7 @@ def evaluation_test(
 
     active_logger: DatasetLogger = logger if logger else default_logger
 
-    if input_data_loaders is not None and (
+    if data_loaders is not None and (
         input_dataset is not None or input_messages is not None or input_rows is not None
     ):
         raise ValueError("data_loaders cannot be combined with input_dataset, input_messages, or input_rows.")