groupwise

mayinghan · mayinghan · commit 37e0210f54ed · 2025-12-04T11:34:23.000-08:00
diff --git a/eval_protocol/pytest/priority_scheduler.py b/eval_protocol/pytest/priority_scheduler.py
@@ -44,15 +44,17 @@ def __init__(
         max_concurrent_rollouts: int,
         active_logger: DatasetLogger,
         eval_executor: Callable[[Union[EvaluationRow, List[EvaluationRow]]], Awaitable[Union[EvaluationRow, List[EvaluationRow]]]], # Callback to run evaluation
-        mini_batch_data_buffer: Optional[MiniBatchDataBuffer] = None,
+        output_buffer: Optional[MiniBatchDataBuffer] = None,
         max_concurrent_evaluations: Optional[int] = None,
+        mode: str = "pointwise",
     ):
         self.rollout_processor = rollout_processor
         self.max_concurrent_rollouts = max_concurrent_rollouts
         self.max_concurrent_evaluations = max_concurrent_evaluations
         self.active_logger = active_logger
         self.eval_executor = eval_executor
-        self.mini_batch_data_buffer = mini_batch_data_buffer
+        self.output_buffer = output_buffer 
+        self.mode = mode
         
         # Priority Queue: Stores RolloutTask
         self.queue: asyncio.PriorityQueue[RolloutTask] = asyncio.PriorityQueue()
@@ -61,6 +63,10 @@ def __init__(
         self.rollout_sem = asyncio.Semaphore(max_concurrent_rollouts)
         self.eval_sem = asyncio.Semaphore(max_concurrent_evaluations) if max_concurrent_evaluations else None
         
+        # Results storage
+        self.results: List[EvaluationRow] = [] # for backward compatibility reason, we save all results here to return
+        self.groups_buffer: Dict[int, List[EvaluationRow]] = defaultdict(list) # buffer for group results. only flush to output buffer when a whole group is ready
+
         self.num_runs = 0
         self.micro_batch_size = 0
 
@@ -155,24 +161,85 @@ async def _process_task(self, task: RolloutTask):
         # 3. Evaluate and Collect History
         current_batch_history_updates = []
         
-        async def _run_eval():
-            for res in batch_results:
-                # Run Evaluation
-                eval_res = await self.eval_executor(res)
-                
-                # Depending on the execution mode, eval_executor might return a single row or a list
-                # For pointwise, it's a single row. For groupwise, it's a list.
-                # Since PriorityScheduler processes a batch of single-turn rollouts, we expect single rows back
-                # But to be safe and type-correct, we handle both.
+        if self.mode == "groupwise":
+            # Groupwise: buffer this micro-batch under its row index; scoring waits for the full group.
+            for res in batch_results:
+                self.groups_buffer[task.row_index].append(res)
                 
-                if isinstance(eval_res, list):
-                    # Should not happen in pointwise mode which is typically used with this scheduler
-                    # But if it does, we process each result
-                    for r in eval_res:
+                # Update history from the raw rollout output: evaluation is deferred until the
+                # group is complete, so there is no scored row to read at this point.
+                last_msg = res.last_assistant_message()
+                if last_msg and last_msg.content:
+                    content = last_msg.content
+                    if isinstance(content, list):
+                        text_parts = [p["text"] for p in content if p["type"] == "text"]
+                        current_batch_history_updates.append("".join(text_parts))
+                    else:
+                        current_batch_history_updates.append(str(content))
+                else:
+                    current_batch_history_updates.append("")
+
+            # Check if this is the last batch for this sample
+            last_run_idx = task.run_indices[-1]
+            if last_run_idx + 1 >= self.num_runs:
+                # Last batch: every run for this row is buffered, so score the group once.
+                full_group = self.groups_buffer[task.row_index]
+
+                async def _run_group_eval():
+                    """Evaluate the full group and publish each scored row."""
+                    eval_res = await self.eval_executor(full_group)
+                    # Groupwise evaluators usually return a list of scored rows; wrap a
+                    # lone row so both shapes share one code path.
+                    scored_rows = eval_res if isinstance(eval_res, list) else [eval_res]
+                    self.results.extend(scored_rows)
+                    # Stream each scored row to the output buffer when the caller supplied one;
+                    # self.results is always filled so run() can return the rows.
+                    if self.output_buffer:
+                        for r in scored_rows:
+                            await self.output_buffer.add_result(r)
+
+                # Respect the evaluation concurrency cap when one is configured.
+                if self.eval_sem:
+                    async with self.eval_sem:
+                        await _run_group_eval()
+                else:
+                    await _run_group_eval()
+
+                # Clear buffer to free memory
+                del self.groups_buffer[task.row_index]
+
+        else:
+            # Pointwise: Process each result individually
+            async def _run_eval():
+                for res in batch_results:
+                    # Run Evaluation
+                    eval_res = await self.eval_executor(res)
+                    
+                    if isinstance(eval_res, list):
+                        # Should not happen in pointwise mode which is typically used with this scheduler
+                        # But if it does, we process each result
+                        self.results.extend(eval_res)
+                        for r in eval_res:
+                            if self.mini_batch_data_buffer:
+                                await self.mini_batch_data_buffer.add_result(r)
+                            
+                            last_msg = r.last_assistant_message()
+                            if last_msg and last_msg.content:
+                                content = last_msg.content
+                                if isinstance(content, list):
+                                    text_parts = [p["text"] for p in content if p["type"] == "text"]
+                                    current_batch_history_updates.append("".join(text_parts))
+                                else:
+                                    current_batch_history_updates.append(str(content))
+                            else:
+                                current_batch_history_updates.append("")
+                    else:
+                        self.results.append(eval_res)
                         if self.mini_batch_data_buffer:
-                            await self.mini_batch_data_buffer.add_result(r)
-                        
-                        last_msg = r.last_assistant_message()
+                            await self.mini_batch_data_buffer.add_result(eval_res)
+
+                        # Extract prediction for history
+                        last_msg = eval_res.last_assistant_message()
                         if last_msg and last_msg.content:
                             content = last_msg.content
                             if isinstance(content, list):
@@ -181,28 +248,13 @@ async def _run_eval():
                             else:
                                 current_batch_history_updates.append(str(content))
                         else:
-                            current_batch_history_updates.append("")
-                else:
-                    if self.mini_batch_data_buffer:
-                        await self.mini_batch_data_buffer.add_result(eval_res)
+                            current_batch_history_updates.append("") # Empty string for failed turns
 
-                    # Extract prediction for history
-                    last_msg = eval_res.last_assistant_message()
-                    if last_msg and last_msg.content:
-                        content = last_msg.content
-                        if isinstance(content, list):
-                            text_parts = [p["text"] for p in content if p["type"] == "text"]
-                            current_batch_history_updates.append("".join(text_parts))
-                        else:
-                            current_batch_history_updates.append(str(content))
-                    else:
-                        current_batch_history_updates.append("") # Empty string for failed turns
-
-        if self.eval_sem:
-            async with self.eval_sem:
+            if self.eval_sem:
+                async with self.eval_sem:
+                    await _run_eval()
+            else:
                 await _run_eval()
-        else:
-            await _run_eval()
 
         # 4. Schedule Next Micro-batch (High Priority)
         last_run_idx = task.run_indices[-1]
@@ -248,12 +300,11 @@ async def run(self, dataset: List[EvaluationRow], num_runs: int, micro_batch_siz
         for w in workers:
             w.cancel()
         
-        # Ensure cancellation is complete
         if workers:
             await asyncio.gather(*workers, return_exceptions=True)
             
-        # Return empty dict as we rely on side effects (streaming buffer)
-        return {}
+        # Return collected results
+        return self.results
 
 async def execute_priority_rollouts(
     dataset: List[EvaluationRow],
diff --git a/tests/test_priority_scheduler.py b/tests/test_priority_scheduler.py
@@ -279,4 +279,54 @@ async def schedule_dataset(self, *args):
     
     assert worker_start_count == expected_workers
 
+@pytest.mark.asyncio
+async def test_groupwise_mode(mock_logger, mock_eval_executor, base_config):
+    """Groupwise mode must hold back evaluation until every run of a row is collected.
+
+    One row with 4 runs and a micro-batch size of 2 yields two micro-batches:
+      * batch 1 (runs 0, 1) is buffered and updates history, without calling eval;
+      * batch 2 (runs 2, 3) is buffered, updates history, and triggers a single
+        eval call that receives all 4 runs.
+    """
+    total_runs = 4
+    batch_size = 2
+    rows_under_test = [create_mock_row("row-0")]
+
+    # One entry per evaluator invocation, holding the argument it received.
+    recorded_eval_inputs = []
+
+    async def passthrough_eval(rows):
+        # Record the call, then hand the rows back unchanged.
+        recorded_eval_inputs.append(rows)
+        return rows
+
+    async def fake_rollout(processor, rows, config, run_idx):
+        # Stand-in rollout generator: yields each input row as-is.
+        for item in rows:
+            yield item
+
+    mock_eval_executor.side_effect = passthrough_eval
+    rollout_patch = patch(
+        'eval_protocol.pytest.priority_scheduler.rollout_processor_with_retry',
+        side_effect=fake_rollout,
+    )
+
+    with rollout_patch:
+        scheduler = PriorityRolloutScheduler(
+            rollout_processor=MagicMock(),
+            max_concurrent_rollouts=1,
+            active_logger=mock_logger,
+            eval_executor=mock_eval_executor,
+            mode="groupwise",
+        )
+        results = await scheduler.run(rows_under_test, total_runs, batch_size, base_config)
+
+        # Verify evaluation was called EXACTLY ONCE
+        assert len(recorded_eval_inputs) == 1, f"Expected 1 eval call, got {len(recorded_eval_inputs)}"
+
+        # Verify that single call received ALL 4 rows
+        group = recorded_eval_inputs[0]
+        assert len(group) == 4, f"Expected 4 rows in group eval, got {len(group)}"
+        assert len(results) == 4  # and the scheduler returns all 4 rows
+