Skip to content

Commit 80794bd

Browse files
committed
working for my own chinook trace, changing adapter now
1 parent da4023d commit 80794bd

File tree

5 files changed

+188
-159
lines changed

5 files changed

+188
-159
lines changed

eval_protocol/adapters/langfuse.py

Lines changed: 23 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55
"""
66

77
import logging
8-
from datetime import datetime
8+
from datetime import datetime, timedelta
99
from typing import Any, Dict, Iterator, List, Optional, cast
1010

1111
from eval_protocol.models import EvaluationRow, InputMetadata, Message
1212

1313
logger = logging.getLogger(__name__)
1414

1515
try:
16-
from langfuse import Langfuse # pyright: ignore[reportPrivateImportUsage]
16+
from langfuse import get_client # pyright: ignore[reportPrivateImportUsage]
17+
from langfuse.api.resources.trace.types.traces import Traces
18+
from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails
1719

1820
LANGFUSE_AVAILABLE = True
1921
except ImportError:
@@ -45,35 +47,20 @@ class LangfuseAdapter:
4547
... ))
4648
"""
4749

48-
def __init__(
49-
self,
50-
public_key: str,
51-
secret_key: str,
52-
host: str = "https://cloud.langfuse.com",
53-
project_id: Optional[str] = None,
54-
):
55-
"""Initialize the Langfuse adapter.
56-
57-
Args:
58-
public_key: Langfuse public key
59-
secret_key: Langfuse secret key
60-
host: Langfuse host URL (default: https://cloud.langfuse.com)
61-
project_id: Optional project ID to filter traces
62-
"""
50+
def __init__(self):
51+
"""Initialize the Langfuse adapter."""
6352
if not LANGFUSE_AVAILABLE:
6453
raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")
6554

66-
self.client = cast(Any, Langfuse)(public_key=public_key, secret_key=secret_key, host=host)
67-
self.project_id = project_id
55+
self.client = get_client()
6856

6957
def get_evaluation_rows(
7058
self,
7159
limit: int = 100,
7260
tags: Optional[List[str]] = None,
7361
user_id: Optional[str] = None,
7462
session_id: Optional[str] = None,
75-
from_timestamp: Optional[datetime] = None,
76-
to_timestamp: Optional[datetime] = None,
63+
hours_back: Optional[int] = None,
7764
include_tool_calls: bool = True,
7865
) -> List[EvaluationRow]:
7966
"""Pull traces from Langfuse and convert to EvaluationRow format.
@@ -83,16 +70,23 @@ def get_evaluation_rows(
8370
tags: Filter by specific tags
8471
user_id: Filter by user ID
8572
session_id: Filter by session ID
86-
from_timestamp: Filter traces after this timestamp
87-
to_timestamp: Filter traces before this timestamp
73+
hours_back: Filter traces from this many hours ago
8874
include_tool_calls: Whether to include tool calling traces
8975
9076
Yields:
9177
EvaluationRow: Converted evaluation rows
9278
"""
9379
# Get traces from Langfuse using new API
80+
81+
if hours_back:
82+
to_timestamp = datetime.now()
83+
from_timestamp = to_timestamp - timedelta(hours=hours_back)
84+
else:
85+
to_timestamp = None
86+
from_timestamp = None
87+
9488
eval_rows = []
95-
traces = self.client.api.trace.list(
89+
traces: Traces = self.client.api.trace.list(
9690
limit=limit,
9791
tags=tags,
9892
user_id=user_id,
@@ -128,7 +122,7 @@ def get_evaluation_rows_by_ids(
128122
eval_rows = []
129123
for trace_id in trace_ids:
130124
try:
131-
trace = self.client.api.trace.get(trace_id)
125+
trace: TraceWithFullDetails = self.client.api.trace.get(trace_id)
132126
eval_row = self._convert_trace_to_evaluation_row(trace, include_tool_calls)
133127
if eval_row:
134128
eval_rows.append(eval_row)
@@ -147,10 +141,10 @@ def _convert_trace_to_evaluation_row(self, trace: Any, include_tool_calls: bool
147141
Returns:
148142
EvaluationRow or None if conversion fails
149143
"""
150-
# TODO: move this logic into an adapter in llm_judge.py. langfuse.py should just return traces
151144
try:
152145
# Get observations (generations, spans) from the trace
153146
observations_response = self.client.api.observations.get_many(trace_id=trace.id, limit=100)
147+
# print(observations_response)
154148
observations = (
155149
observations_response.data if hasattr(observations_response, "data") else list(observations_response)
156150
)
@@ -406,7 +400,6 @@ def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMe
406400
"trace_id": trace.id,
407401
"trace_name": getattr(trace, "name", None),
408402
"trace_tags": getattr(trace, "tags", []),
409-
"langfuse_project_id": self.project_id,
410403
}
411404

412405
# Add trace metadata if available
@@ -418,9 +411,6 @@ def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMe
418411
"session_id": getattr(trace, "session_id", None),
419412
"user_id": getattr(trace, "user_id", None),
420413
"timestamp": getattr(trace, "timestamp", None),
421-
"langfuse_trace_url": (
422-
f"{self.client.host}/project/{self.project_id}/traces/{trace.id}" if self.project_id else None
423-
),
424414
}
425415

426416
return InputMetadata(
@@ -497,26 +487,7 @@ def _extract_tools(self, observations: List[Any], trace: Any = None) -> Optional
497487
return tools if tools else None
498488

499489

500-
def create_langfuse_adapter(
501-
public_key: str,
502-
secret_key: str,
503-
host: str = "https://cloud.langfuse.com",
504-
project_id: Optional[str] = None,
505-
) -> LangfuseAdapter:
506-
"""Factory function to create a Langfuse adapter.
490+
def create_langfuse_adapter() -> LangfuseAdapter:
491+
"""Factory function to create a Langfuse adapter."""
507492

508-
Args:
509-
public_key: Langfuse public key
510-
secret_key: Langfuse secret key
511-
host: Langfuse host URL
512-
project_id: Optional project ID
513-
514-
Returns:
515-
LangfuseAdapter instance
516-
"""
517-
return LangfuseAdapter(
518-
public_key=public_key,
519-
secret_key=secret_key,
520-
host=host,
521-
project_id=project_id,
522-
)
493+
return LangfuseAdapter()

eval_protocol/pytest/evaluation_test.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@
5858
parse_ep_num_runs,
5959
parse_ep_passed_threshold,
6060
rollout_processor_with_retry,
61-
split_multi_turn_rows,
6261
)
6362

6463
from ..common_utils import load_jsonl
@@ -85,7 +84,7 @@ def evaluation_test(
8584
steps: int = 30,
8685
mode: EvaluationTestMode = "pointwise",
8786
combine_datasets: bool = True,
88-
split_multi_turn: bool = False,
87+
preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None = None,
8988
logger: DatasetLogger | None = None,
9089
exception_handler_config: ExceptionHandlerConfig | None = None,
9190
) -> Callable[[TestFunction], TestFunction]:
@@ -152,9 +151,9 @@ def evaluation_test(
152151
mode: Evaluation mode. "pointwise" (default) applies test function to each row (rollout result).
153152
"groupwise" applies test function to a group of rollout results from the same original row (for use cases such as dpo/grpo).
154153
"all" applies test function to the whole dataset.
155-
split_multi_turn: If True, splits multi-turn conversations into individual evaluation rows
156-
for each assistant response. Each row will contain the conversation context up to that point
157-
and the assistant's response as ground truth. Useful for Arena-Hard-Auto style evaluations.
154+
preprocess_fn: Optional preprocessing function that takes a list of EvaluationRow objects
155+
and returns a modified list. Useful for transformations like splitting multi-turn conversations,
156+
filtering data, or other preprocessing steps before rollout execution.
158157
logger: DatasetLogger to use for logging. If not provided, a default logger will be used.
159158
exception_handler_config: Configuration for exception handling and backoff retry logic.
160159
If not provided, a default configuration will be used with common retryable exceptions.
@@ -249,8 +248,8 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
249248
else:
250249
raise ValueError("No input dataset, input messages, or input rows provided")
251250

252-
if split_multi_turn:
253-
data = split_multi_turn_rows(data)
251+
if preprocess_fn:
252+
data = preprocess_fn(data)
254253

255254
for row in data:
256255
# generate a stable row_id for each row

eval_protocol/pytest/utils.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -352,42 +352,3 @@ def add_cost_metrics(row: EvaluationRow) -> None:
352352
output_cost=output_cost,
353353
total_cost=total_cost,
354354
)
355-
356-
357-
def split_multi_turn_rows(data: list[EvaluationRow]) -> list[EvaluationRow]:
358-
"""
359-
Split multi-turn conversation rows into individual evaluation rows for each assistant message.
360-
361-
Args:
362-
data: List of EvaluationRow objects
363-
364-
Returns:
365-
List of expanded EvaluationRow objects, one for each assistant message
366-
"""
367-
expanded_rows = []
368-
369-
for row in data:
370-
messages = row.messages
371-
tools = row.tools
372-
input_metadata = row.input_metadata
373-
374-
assistant_positions = []
375-
for i, message in enumerate(messages):
376-
if message.role == "assistant":
377-
assistant_positions.append(i)
378-
379-
# Create separate evaluation rows on each assistant message (where the comparison model will respond)
380-
for assistant_pos in assistant_positions:
381-
messages_before_assistant = messages[:assistant_pos]
382-
ground_truth_message = messages[assistant_pos].content
383-
384-
expanded_rows.append(
385-
EvaluationRow(
386-
messages=messages_before_assistant,
387-
tools=tools,
388-
input_metadata=input_metadata,
389-
ground_truth=ground_truth_message,
390-
)
391-
)
392-
393-
return expanded_rows

0 commit comments

Comments (0)