Skip to content

Commit e446e98

Browse files
authored
Merge branch 'main' into pytest-plugin-replacement
2 parents 6295fd9 + 5d7e5cb commit e446e98

File tree

14 files changed

+574
-55
lines changed

14 files changed

+574
-55
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def evaluation_test(
8181
aggregation_method: AggregationMethod = "mean",
8282
passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None,
8383
num_runs: int = 1,
84+
filtered_row_ids: Sequence[str] | None = None,
8485
max_dataset_rows: int | None = None,
8586
mcp_config_path: str | None = None,
8687
max_concurrent_rollouts: int = 8,
@@ -148,6 +149,7 @@ def evaluation_test(
148149
Success rate must be above success, and if set, standard error must be below standard_error.
149150
Success rate +/- one standard_error is equivalent to 68% confidence interval.
150151
num_runs: Number of times to repeat the rollout and evaluations.
152+
filtered_row_ids: List of row_ids to filter for the evaluation. If provided, only the rows with the given row_ids will be evaluated.
151153
max_dataset_rows: Limit dataset to the first N rows.
152154
mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
153155
max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel.
@@ -272,6 +274,9 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
272274
results = data_loader.load()
273275
for result in results:
274276
data.extend(result.rows)
277+
# Apply max_dataset_rows limit to data from data loaders
278+
if max_dataset_rows is not None:
279+
data = data[:max_dataset_rows]
275280
elif "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
276281
ds_arg: list[str] = kwargs["dataset_path"]
277282
# Support either a single path or a list of paths; if a list is provided,
@@ -293,6 +298,9 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
293298
else:
294299
raise ValueError("No input dataset, input messages, or input rows provided")
295300

301+
if filtered_row_ids is not None:
302+
data = [row for row in data if row.input_metadata.row_id in filtered_row_ids]
303+
296304
"""
297305
data_loaders handles preprocess_fn internally so we want
298306
to specially handle data_loaders here so we don't double

eval_protocol/pytest/handle_persist_flow.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import re
88
from typing import Any
99

10+
from eval_protocol.directory_utils import find_eval_protocol_dir
1011
from eval_protocol.models import EvaluationRow
1112
from eval_protocol.pytest.store_experiment_link import store_experiment_link
1213
import requests
@@ -25,7 +26,8 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
2526
if row.execution_metadata and row.execution_metadata.experiment_id:
2627
experiments[row.execution_metadata.experiment_id].append(row)
2728

28-
exp_dir = pathlib.Path("experiment_results")
29+
eval_protocol_dir = find_eval_protocol_dir()
30+
exp_dir = pathlib.Path(eval_protocol_dir) / "experiment_results"
2931
exp_dir.mkdir(parents=True, exist_ok=True)
3032

3133
# Create one JSONL file per experiment_id

eval_protocol/pytest/parameterize.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ def _is_pytest_parametrize_with_completion_params(decorator: ast.expr) -> bool:
7373
and decorator.func.value.attr == "mark"
7474
and decorator.func.attr == "parametrize"
7575
):
76+
# Validate argvalues if present
77+
_validate_parametrize_argvalues(decorator)
78+
7679
# Check positional arguments first (argnames is typically the first positional arg)
7780
if len(decorator.args) > 0:
7881
argnames_arg = decorator.args[0]
@@ -88,6 +91,90 @@ def _is_pytest_parametrize_with_completion_params(decorator: ast.expr) -> bool:
8891
return False
8992

9093

94+
def _ast_dict_to_string(dict_node: ast.Dict) -> str:
95+
"""
96+
Convert an AST Dict node to its string representation.
97+
98+
Args:
99+
dict_node: AST node representing a dictionary
100+
101+
Returns:
102+
String representation of the dictionary
103+
"""
104+
if not dict_node.keys:
105+
return "{}"
106+
107+
pairs = []
108+
for key, value in zip(dict_node.keys, dict_node.values):
109+
if key is not None:
110+
key_str = _ast_node_to_string(key)
111+
value_str = _ast_node_to_string(value)
112+
pairs.append(f"{key_str}: {value_str}")
113+
114+
return "{" + ", ".join(pairs) + "}"
115+
116+
117+
def _ast_node_to_string(node: ast.expr) -> str:
118+
"""
119+
Convert an AST node to its string representation.
120+
121+
Args:
122+
node: AST node to convert
123+
124+
Returns:
125+
String representation of the node
126+
"""
127+
if isinstance(node, ast.Constant):
128+
if isinstance(node.value, str):
129+
return repr(node.value)
130+
else:
131+
return str(node.value)
132+
elif isinstance(node, ast.Name):
133+
return node.id
134+
elif isinstance(node, ast.Dict):
135+
return _ast_dict_to_string(node)
136+
elif isinstance(node, ast.List):
137+
elements = [_ast_node_to_string(elt) for elt in node.elts]
138+
return "[" + ", ".join(elements) + "]"
139+
elif isinstance(node, ast.Tuple):
140+
elements = [_ast_node_to_string(elt) for elt in node.elts]
141+
return "(" + ", ".join(elements) + ")"
142+
else:
143+
# For complex expressions, return a simplified representation
144+
return "<complex expression>"
145+
146+
147+
def _validate_parametrize_argvalues(decorator: ast.Call) -> None:
148+
"""
149+
Validate that pytest.mark.parametrize argvalues is a list/tuple, not a dict.
150+
151+
Args:
152+
decorator: AST node representing the pytest.mark.parametrize decorator call
153+
154+
Raises:
155+
ValueError: If argvalues is a dict instead of a list/tuple
156+
"""
157+
# Check positional arguments (argvalues is typically the second positional arg)
158+
if len(decorator.args) > 1:
159+
argvalues_arg = decorator.args[1]
160+
if isinstance(argvalues_arg, ast.Dict):
161+
dict_repr = _ast_dict_to_string(argvalues_arg)
162+
raise ValueError(
163+
f"For evaluation_test with completion_params, pytest.mark.parametrize argvalues must be a list or tuple, not a dict. "
164+
f"Use [{dict_repr}] instead of {dict_repr}."
165+
)
166+
167+
# Check keyword arguments for argvalues
168+
for keyword in decorator.keywords:
169+
if keyword.arg == "argvalues":
170+
if isinstance(keyword.value, ast.Dict):
171+
dict_repr = _ast_dict_to_string(keyword.value)
172+
raise ValueError(
173+
f"For evaluation_test with completion_params, pytest.mark.parametrize argvalues must be a list or tuple, not a dict. "
174+
f"Use [{dict_repr}] instead of {dict_repr}."
175+
)
176+
177+
91178
def _check_argnames_for_completion_params(argnames_node: ast.expr) -> bool:
92179
"""
93180
Check if an argnames AST node contains "completion_params".

eval_protocol/pytest/remote_rollout_processor.py

Lines changed: 67 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from eval_protocol.models import EvaluationRow, Status
88
from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
9+
from eval_protocol.types.remote_rollout_processor import InitRequest, RolloutMetadata
910
from .rollout_processor import RolloutProcessor
1011
from .types import RolloutProcessorConfig
1112
import os
@@ -15,31 +16,14 @@ class RemoteRolloutProcessor(RolloutProcessor):
1516
"""
1617
Rollout processor that triggers a remote HTTP server to perform the rollout.
1718
18-
Expected remote API:
19-
- POST {remote_base_url}/init
20-
Body: {
21-
"rollout_id": str,
22-
"model": str,
23-
"messages": list[dict],
24-
"tools": list[dict] | null,
25-
"metadata": {
26-
"invocation_id": str,
27-
"experiment_id": str,
28-
"rollout_id": str,
29-
"run_id": str | null,
30-
"row_id": str | null
31-
},
32-
}
33-
Returns: {"ok": true}
34-
35-
- GET {remote_base_url}/status?rollout_id=...
36-
Returns: {"terminated": bool, "info": {...}?}
19+
See https://evalprotocol.io/tutorial/remote-rollout-processor for documentation.
3720
"""
3821

3922
def __init__(
4023
self,
4124
*,
4225
remote_base_url: Optional[str] = None,
26+
model_base_url: Optional[str] = None,
4327
poll_interval: float = 1.0,
4428
timeout_seconds: float = 120.0,
4529
output_data_loader: Callable[[str], DynamicDataLoader],
@@ -58,6 +42,7 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
5842

5943
# Start with constructor values
6044
remote_base_url: Optional[str] = self._remote_base_url
45+
model_base_url: Optional[str] = self._model_base_url
6146
poll_interval: float = self._poll_interval
6247
timeout_seconds: float = self._timeout_seconds
6348

@@ -74,14 +59,25 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
7459
async def _process_row(row: EvaluationRow) -> EvaluationRow:
7560
start_time = time.perf_counter()
7661

62+
if row.execution_metadata.invocation_id is None:
63+
raise ValueError("Invocation ID is required in RemoteRolloutProcessor")
64+
if row.execution_metadata.experiment_id is None:
65+
raise ValueError("Experiment ID is required in RemoteRolloutProcessor")
66+
if row.execution_metadata.rollout_id is None:
67+
raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
68+
if row.execution_metadata.run_id is None:
69+
raise ValueError("Run ID is required in RemoteRolloutProcessor")
70+
if row.input_metadata.row_id is None:
71+
raise ValueError("Row ID is required in RemoteRolloutProcessor")
72+
7773
# Build request metadata and payload
78-
meta: Dict[str, Any] = {
79-
"invocation_id": row.execution_metadata.invocation_id,
80-
"experiment_id": row.execution_metadata.experiment_id,
81-
"rollout_id": row.execution_metadata.rollout_id,
82-
"run_id": row.execution_metadata.run_id,
83-
"row_id": row.input_metadata.row_id,
84-
}
74+
meta: RolloutMetadata = RolloutMetadata(
75+
invocation_id=row.execution_metadata.invocation_id,
76+
experiment_id=row.execution_metadata.experiment_id,
77+
rollout_id=row.execution_metadata.rollout_id,
78+
run_id=row.execution_metadata.run_id,
79+
row_id=row.input_metadata.row_id,
80+
)
8581

8682
model: Optional[str] = None
8783
if row.input_metadata and row.input_metadata.completion_params:
@@ -113,19 +109,33 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
113109
}
114110
clean_messages.append({k: v for k, v in md.items() if k in allowed_message_fields and v is not None})
115111

116-
init_payload: Dict[str, Any] = {
117-
"rollout_id": row.execution_metadata.rollout_id,
118-
"model": model,
119-
"messages": clean_messages,
120-
"tools": row.tools,
121-
"metadata": meta,
122-
}
112+
if row.execution_metadata.rollout_id is None:
113+
raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
114+
115+
init_payload: InitRequest = InitRequest(
116+
model=model,
117+
messages=clean_messages,
118+
tools=row.tools,
119+
metadata=meta,
120+
model_base_url=model_base_url,
121+
)
123122

124123
# Fire-and-poll
125124
def _post_init() -> None:
126125
url = f"{remote_base_url}/init"
127-
r = requests.post(url, json=init_payload, timeout=30)
128-
r.raise_for_status()
126+
try:
127+
r = requests.post(url, json=init_payload.model_dump(), timeout=30)
128+
r.raise_for_status()
129+
except requests.exceptions.Timeout:
130+
raise TimeoutError(
131+
"The /init endpoint timed out after 30 seconds. "
132+
"CRITICAL: The /init endpoint must return immediately (within 30s) and NOT block on rollout execution. "
133+
"Your remote server should:\n"
134+
"1. Accept the /init request and return a 200 response immediately\n"
135+
"2. Process the actual rollout asynchronously in the background\n"
136+
"3. Use the /status endpoint to report progress\n"
137+
"For Python/Node.js: Start a separate process per rollout to avoid blocking the /init response."
138+
)
129139

130140
await asyncio.to_thread(_post_init)
131141

@@ -147,7 +157,13 @@ def _get_status() -> Dict[str, Any]:
147157
except Exception:
148158
# transient errors; continue polling
149159
pass
160+
150161
await asyncio.sleep(poll_interval)
162+
else:
163+
# Loop completed without breaking, which means we timed out
164+
row.rollout_status = Status.rollout_error(
165+
f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds"
166+
)
151167

152168
# Update duration, regardless of termination
153169
row.execution_metadata.duration_seconds = time.perf_counter() - start_time
@@ -170,14 +186,28 @@ def _load_data():
170186
elif len(output_rows) == 1: # Return the Langfuse row
171187
langfuse_row = output_rows[0]
172188
langfuse_row.input_metadata.completion_params = row.input_metadata.completion_params
189+
# merge dataset_info dicts on input_metadata
190+
if langfuse_row.input_metadata.dataset_info and row.input_metadata.dataset_info:
191+
langfuse_row.input_metadata.dataset_info = {
192+
**row.input_metadata.dataset_info,
193+
**langfuse_row.input_metadata.dataset_info,
194+
}
195+
elif row.input_metadata.dataset_info:
196+
langfuse_row.input_metadata.dataset_info = row.input_metadata.dataset_info
173197
langfuse_row.eval_metadata = row.eval_metadata
198+
langfuse_row.ground_truth = row.ground_truth
174199
return langfuse_row
175200
else:
176201
raise ValueError("RemoteRolloutProcessor's output_data_loader should return exactly one row.")
177202

178-
for r in rows:
179-
tasks.append(asyncio.create_task(_process_row(r)))
203+
semaphore = config.semaphore
204+
205+
async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
206+
async with semaphore:
207+
result = await _process_row(r)
208+
return result
180209

210+
tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
181211
return tasks
182212

183213
def cleanup(self) -> None:

eval_protocol/types/remote_rollout_processor.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from typing import Any, Dict, List, Optional
66
from pydantic import BaseModel, Field
7-
from eval_protocol.models import Message
7+
from eval_protocol.models import Message, Status
88

99

1010
class RolloutMetadata(BaseModel):
@@ -20,10 +20,17 @@ class RolloutMetadata(BaseModel):
2020
class InitRequest(BaseModel):
2121
"""Request model for POST /init endpoint."""
2222

23-
rollout_id: str
2423
model: str
25-
messages: List[Message] = Field(min_length=1)
24+
messages: Optional[List[Message]] = None
2625
tools: Optional[List[Dict[str, Any]]] = None
26+
27+
model_base_url: Optional[str] = None
28+
"""
29+
A Base URL that the remote server can use to make LLM calls. This is useful
30+
to configure on the eval-protocol side for flexibility in
31+
development/training.
32+
"""
33+
2734
metadata: RolloutMetadata
2835

2936

@@ -33,6 +40,12 @@ class StatusResponse(BaseModel):
3340
terminated: bool
3441
info: Optional[Dict[str, Any]] = None
3542

43+
status: Optional[Status] = None
44+
"""
45+
Optional status indicator for the rollout to be used by eval-protocol. This
46+
is useful to distinguish between successful and failed rollouts.
47+
"""
48+
3649

3750
def create_langfuse_config_tags(init_request: InitRequest) -> List[str]:
3851
"""Create Langfuse tags from InitRequest metadata."""

tests/chinook/langgraph/test_langgraph_chinook_tools.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def build_graph_kwargs(cp: CompletionParams) -> Dict[str, Any]:
1919

2020

2121
@pytest.mark.asyncio
22+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Only run this test locally since its not stable")
2223
@pytest.mark.skipif(os.getenv("FIREWORKS_API_KEY") in (None, ""), reason="FIREWORKS_API_KEY not set")
2324
@evaluation_test(
2425
input_messages=[[[Message(role="user", content="Use tools to count total tracks in the database.")]]],

0 commit comments

Comments
 (0)