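Resolve review comments and fix bugs: close the output buffer in a `finally` block, run postprocess once after the result loop, and log worker errors with tracebacks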

mayinghan · mayinghan · commit 81fbc701646c · 2025-12-08T15:38:25.000-08:00
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -418,37 +418,42 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                         else:
                             output_buffer = None
                         
-                        priority_results = await execute_priority_rollouts(
-                            dataset=data,
-                            num_runs=num_runs,
-                            rollout_processor=rollout_processor,
-                            config=config,
-                            max_concurrent_rollouts=max_concurrent_rollouts,
-                            active_logger=active_logger,
-                            eval_executor=test_func,
-                            max_concurrent_evaluations=max_concurrent_evaluations,
-                            mode=mode,
-                            micro_batch_data_buffer=output_buffer,
-                            evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
-                        )
+                        try:
+                            priority_results = await execute_priority_rollouts(
+                                dataset=data,
+                                num_runs=num_runs,
+                                rollout_processor=rollout_processor,
+                                config=config,
+                                max_concurrent_rollouts=max_concurrent_rollouts,
+                                active_logger=active_logger,
+                                eval_executor=test_func,
+                                max_concurrent_evaluations=max_concurrent_evaluations,
+                                mode=mode,
+                                micro_batch_data_buffer=output_buffer,
+                                evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
+                            )
+                        finally:
+                            if output_buffer:
+                                await output_buffer.close()
                         
                         for res in priority_results:
                             run_idx = (res.execution_metadata.extra or {}).get("run_index", 0)
                             if run_idx < len(all_results):
                                 all_results[run_idx].append(res)
                             
                             processed_rows_in_run.append(res)
-                            postprocess(
-                                all_results,
-                                aggregation_method,
-                                passed_threshold,
-                                active_logger,
-                                mode,
-                                completion_params,  # pyright: ignore[reportArgumentType]
-                                test_func.__name__,
-                                num_runs,
-                                time.perf_counter() - experiment_start_time,
-                            )
+
+                        postprocess(
+                            all_results,
+                            aggregation_method,
+                            passed_threshold,
+                            active_logger,
+                            mode,
+                            completion_params,  # pyright: ignore[reportArgumentType]
+                            test_func.__name__,
+                            num_runs,
+                            time.perf_counter() - experiment_start_time,
+                        )
 
                     else:
                         async def execute_run(run_idx: int, config: RolloutProcessorConfig):
diff --git a/eval_protocol/pytest/priority_scheduler.py b/eval_protocol/pytest/priority_scheduler.py
@@ -1,4 +1,5 @@
 import asyncio
+import logging
 import os
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -67,7 +68,6 @@ def __init__(
         self.queue: asyncio.PriorityQueue[RolloutTask] = asyncio.PriorityQueue()
         
         # Concurrency Control
-        self.rollout_sem = asyncio.Semaphore(max_concurrent_rollouts)
         self.eval_sem = asyncio.Semaphore(max_concurrent_evaluations)
         
         # Results storage
@@ -112,16 +112,13 @@ async def worker(self):
         Worker loop: fetch task -> execute micro-batch -> schedule next batch (if any).
         """
         while True:
-            try:
-                # Get a task from the priority queue
-                task: RolloutTask = await self.queue.get()
-            except asyncio.QueueEmpty:
-                break
+            # Get a task from the priority queue    
+            task: RolloutTask = await self.queue.get()
 
             try:
                 await self._process_task(task)
             except Exception as e:
-                print(f"Error processing task for row {task.row.input_metadata.row_id}: {e}")
+                logging.error(f"Error processing task for row {task.row.input_metadata.row_id}: {e}", exc_info=True)
             finally:
                 self.queue.task_done()