Skip to content

Commit 903584b

Browse files
committed
pipelined
1 parent 0a9a9a4 commit 903584b

File tree

5 files changed

+75
-84
lines changed

5 files changed

+75
-84
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 11 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -402,33 +402,15 @@ async def _execute_groupwise_eval_with_semaphore(
402402
return results
403403

404404
if mode == "pointwise":
405+
# Pointwise mode, rollouts will return as they complete so we can pipeline evaluation_test execution
405406
pointwise_tasks: list[asyncio.Task[EvaluationRow]] = []
406-
407-
if rollout_processor.supports_pipelining:
408-
# Pointwise mode, rollouts will return as they complete so we can pipeline evaluation_test execution
409-
# Use wrapper that handles retry logic internally
410-
async for row in rollout_processor_with_retry(
411-
rollout_processor, fresh_dataset, config, run_idx
412-
):
413-
pointwise_tasks.append(
414-
asyncio.create_task(_execute_pointwise_eval_with_semaphore(row=row))
415-
)
416-
else:
417-
# Non-pipelined mode: collect all rollout results first, then postprocess, then evaluate
418-
collected_rollout_rows: list[EvaluationRow] = []
419-
async for row in rollout_processor_with_retry(
420-
rollout_processor, fresh_dataset, config, run_idx
421-
):
422-
collected_rollout_rows.append(row)
423-
424-
# Post-process rollout results to get evaluation inputs
425-
eval_input_rows = rollout_processor.postprocess(collected_rollout_rows)
426-
427-
# Now evaluate all the post-processed rows
428-
for row in eval_input_rows:
429-
pointwise_tasks.append(
430-
asyncio.create_task(_execute_pointwise_eval_with_semaphore(row=row))
431-
)
407+
# Use wrapper that handles retry logic internally
408+
async for row in rollout_processor_with_retry(
409+
rollout_processor, fresh_dataset, config, run_idx
410+
):
411+
pointwise_tasks.append(
412+
asyncio.create_task(_execute_pointwise_eval_with_semaphore(row=row))
413+
)
432414

433415
# Run evaluation tasks with progress bar
434416
results = await run_tasks_with_eval_progress(pointwise_tasks, run_idx)
@@ -471,13 +453,9 @@ async def _collect_result(config, lst): # pyright: ignore[reportUnknownParamete
471453
lst.append(copied_row) # pyright: ignore[reportUnknownMemberType]
472454
tasks.append(asyncio.create_task(_collect_result(config, lst))) # pyright: ignore[reportUnknownArgumentType]
473455
rollout_results = await asyncio.gather(*tasks)
474-
475-
# Flatten and postprocess all rollout results
476-
all_rollout_rows = [row for result in rollout_results for row in result]
477-
processed_rows = rollout_processor.postprocess(all_rollout_rows)
478-
479-
for row in processed_rows:
480-
row_groups[row.input_metadata.row_id].append(row)
456+
for result in rollout_results:
457+
for row in result:
458+
row_groups[row.input_metadata.row_id].append(row) # pyright: ignore[reportUnknownMemberType]
481459
tasks = []
482460
for _, rows in row_groups.items(): # pyright: ignore[reportUnknownVariableType]
483461
tasks.append(asyncio.create_task(_execute_groupwise_eval_with_semaphore(rows=rows))) # pyright: ignore[reportUnknownArgumentType]
@@ -494,8 +472,6 @@ async def _collect_result(config, lst): # pyright: ignore[reportUnknownParamete
494472
):
495473
input_dataset.append(row) # pyright: ignore[reportUnknownMemberType]
496474

497-
input_dataset = rollout_processor.postprocess(input_dataset)
498-
499475
# NOTE: we will still evaluate errored rows (give users control over this)
500476
# i.e., they can choose to give EvaluateResult.score = 0 for errored rows in their test_func
501477
results = await execute_pytest(

eval_protocol/pytest/remote_rollout_processor.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,6 @@ class RemoteRolloutProcessor(RolloutProcessor):
3636
Returns: {"terminated": bool, "info": {...}?}
3737
"""
3838

39-
supports_pipelining: bool = False # Remote rollout processor cannot pipeline - must wait for all rollouts to complete before fetching results.
40-
4139
def __init__(
4240
self,
4341
*,
@@ -156,27 +154,30 @@ def _get_status() -> Dict[str, Any]:
156154

157155
# Update duration, regardless of termination
158156
row.execution_metadata.duration_seconds = time.perf_counter() - start_time
159-
return row
160157

161-
for r in rows:
162-
tasks.append(asyncio.create_task(_process_row(r)))
158+
if row.execution_metadata.rollout_id is None:
159+
raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
163160

164-
return tasks
161+
data_loader = self._output_data_loader(row.execution_metadata.rollout_id)
162+
163+
def _load_data():
164+
return data_loader.load()
165+
166+
results = await asyncio.to_thread(_load_data)
165167

166-
def postprocess(self, finished_rollout_rows: List[EvaluationRow]) -> List[EvaluationRow]:
167-
"""Fetch actual evaluation rows from Langfuse using the output_data_loader."""
168-
invocation_id = finished_rollout_rows[0].execution_metadata.invocation_id
169-
if not invocation_id:
170-
raise ValueError("Invocation ID is required in RemoteRolloutProcessor")
168+
output_rows: List[EvaluationRow] = [row for result in results for row in result.rows]
171169

172-
data_loader = self._output_data_loader(invocation_id)
170+
assert len(output_rows) == 1, "Dataloader used for RemoteRolloutProcessor should have exactly one row"
173171

174-
results = data_loader.load()
175-
output_rows: List[EvaluationRow] = []
176-
for result in results:
177-
output_rows.extend(result.rows)
172+
langfuse_row = output_rows[0]
173+
langfuse_row.input_metadata.completion_params = row.input_metadata.completion_params
178174

179-
return output_rows
175+
return langfuse_row
176+
177+
for r in rows:
178+
tasks.append(asyncio.create_task(_process_row(r)))
179+
180+
return tasks
180181

181182
def cleanup(self) -> None:
182183
return None

eval_protocol/pytest/rollout_processor.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,11 @@ class RolloutProcessor(ABC):
1010
Abstract base class for all rollout processor strategies.
1111
"""
1212

13-
supports_pipelining: bool = (
14-
True # Whether this processor supports pipelined evaluation (evaluate rows as rollouts complete)
15-
)
16-
1713
@abstractmethod
1814
def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> list[asyncio.Task[EvaluationRow]]:
1915
"""Process evaluation rows and return async tasks. Must be implemented by subclasses."""
2016
pass
2117

22-
def postprocess(self, finished_rollout_rows: list[EvaluationRow]) -> list[EvaluationRow]:
23-
"""Post-process rollout results to produce evaluation inputs. Only available for processors that return False from supports_pipelining."""
24-
return finished_rollout_rows
25-
2618
def cleanup(self) -> None:
2719
"""Cleanup resources. Override in subclasses if cleanup is needed."""
2820
pass

eval_protocol/quickstart/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,28 @@ def assistant_to_ground_truth(data: list[EvaluationRow]) -> list[EvaluationRow]:
186186
return processed_rows
187187

188188

189+
def filter_longest_conversation(data: list[EvaluationRow]) -> list[EvaluationRow]:
190+
"""
191+
Select only the longest conversation from a list of evaluation rows that share the same rollout_id.
192+
193+
Args:
194+
data: List of EvaluationRow objects that share the same rollout_id
195+
196+
Returns:
197+
List containing only the EvaluationRow with the most messages (longest conversation)
198+
"""
199+
if not data:
200+
return data
201+
202+
if len(data) == 1:
203+
return data
204+
205+
# Find the row with the most messages (longest conversation)
206+
longest_row = max(data, key=lambda row: len(row.messages))
207+
208+
return [longest_row]
209+
210+
189211
async def run_single_judgment(
190212
question_text: str, answer_a: str, answer_b: str, tools, judge_config, client
191213
) -> Optional[Dict[str, Any]]:

tests/chinook/langfuse/test_remote_langfuse_chinook.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,37 @@
1313
from eval_protocol.pytest import evaluation_test
1414
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
1515
from eval_protocol.adapters.langfuse import create_langfuse_adapter
16+
from eval_protocol.quickstart.utils import filter_longest_conversation
1617

17-
INVOCATION_ID = ""
18-
ASSERTION_EXECUTED = False
18+
ROLLOUT_IDS = set()
1919

2020

2121
@pytest.fixture(autouse=True)
22-
def check_assertion_executed():
23-
"""Ensure the test actually executed the Langfuse validation"""
24-
global ASSERTION_EXECUTED
25-
ASSERTION_EXECUTED = False # Reset before test
22+
def check_rollout_coverage():
23+
"""Ensure we processed all expected rollout_ids"""
24+
global ROLLOUT_IDS
25+
ROLLOUT_IDS.clear()
2626
yield
27-
# After test completes, verify the assertion was executed
28-
assert ASSERTION_EXECUTED, (
29-
"Test passed but never validated Langfuse data - check if output_data_loader returned empty results"
27+
28+
# Verify we've seen the expected number of rollout_ids after test is done
29+
expected_rollout_count = 3
30+
assert len(ROLLOUT_IDS) == expected_rollout_count, (
31+
f"Expected to see {expected_rollout_count} rollout_ids, but only saw {len(ROLLOUT_IDS)}: {ROLLOUT_IDS}"
3032
)
3133

3234

33-
def fetch_trajectories(invocation_id: str) -> List[EvaluationRow]:
34-
global INVOCATION_ID # This is just to verify the invocation_id is set correctly in the test
35-
INVOCATION_ID = invocation_id
35+
def fetch_langfuse_traces(rollout_id: str) -> List[EvaluationRow]:
36+
global ROLLOUT_IDS # Track all rollout_ids we've seen
37+
ROLLOUT_IDS.add(rollout_id)
3638

3739
adapter = create_langfuse_adapter()
38-
return adapter.get_evaluation_rows(tags=[f"invocation_id:{invocation_id}"])
40+
return adapter.get_evaluation_rows(tags=[f"rollout_id:{rollout_id}"])
3941

4042

41-
def create_output_data_loader(invocation_id: str) -> DynamicDataLoader:
42-
return DynamicDataLoader(generators=[lambda: fetch_trajectories(invocation_id)])
43+
def langfuse_output_data_loader(rollout_id: str) -> DynamicDataLoader:
44+
return DynamicDataLoader(
45+
generators=[lambda: fetch_langfuse_traces(rollout_id)], preprocess_fn=filter_longest_conversation
46+
)
4347

4448

4549
def _start_remote_server():
@@ -87,7 +91,7 @@ def remote_langfuse_data_generator() -> List[EvaluationRow]:
8791

8892
# Minimal single-user-turn message to trigger a response
8993
row = EvaluationRow(messages=[Message(role="user", content="Hello there! Please say hi back.")])
90-
return [row]
94+
return [row, row, row]
9195

9296

9397
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Only run this test locally (skipped in CI)")
@@ -100,7 +104,7 @@ def remote_langfuse_data_generator() -> List[EvaluationRow]:
100104
remote_base_url="http://127.0.0.1:7077",
101105
num_turns=2,
102106
timeout_seconds=30,
103-
output_data_loader=create_output_data_loader,
107+
output_data_loader=langfuse_output_data_loader,
104108
),
105109
)
106110
async def test_remote_rollout_and_fetch_langfuse(row: EvaluationRow) -> EvaluationRow:
@@ -110,13 +114,9 @@ async def test_remote_rollout_and_fetch_langfuse(row: EvaluationRow) -> Evaluati
110114
- trigger remote rollout via RemoteRolloutProcessor (calls init/status)
111115
- fetch traces from Langfuse filtered by metadata via output_data_loader; FAIL if none found
112116
"""
113-
global ASSERTION_EXECUTED
114-
115-
# Sanity check: row should have an invocation_id since it came from Langfuse via output_data_loader
116117
assert row.messages[0].content == "Hello there! Please say hi back.", "Row should have correct message content"
117-
assert row.execution_metadata.invocation_id == INVOCATION_ID, "Row should have correct invocation_id set"
118-
119-
ASSERTION_EXECUTED = True
120-
print(f"✅ Successfully received row from Langfuse with invocation_id: {row.execution_metadata.invocation_id}")
118+
assert row.execution_metadata.rollout_id in ROLLOUT_IDS, (
119+
f"Row rollout_id {row.execution_metadata.rollout_id} should be in tracked rollout_ids: {ROLLOUT_IDS}"
120+
)
121121

122122
return row

0 commit comments

Comments
 (0)