Merge branch 'increase-sql-retries' into dhuang/dxe-478-implement-evaluator-versions

Dylan Huang · Dylan Huang · commit b3adfee941a7 · 2026-01-15T16:02:41.000-08:00
diff --git a/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py b/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py
@@ -42,9 +42,10 @@ class EvaluationRow(BaseModel):  # type: ignore
 
         self._EvaluationRow = EvaluationRow
 
-        self._db.connect()
+        # Wrap connect() in retry logic since setting pragmas can fail with "database is locked"
+        execute_with_sqlite_retry(lambda: self._db.connect(reuse_if_open=True))
         # Use safe=True to avoid errors when tables/indexes already exist
-        self._db.create_tables([EvaluationRow], safe=True)
+        execute_with_sqlite_retry(lambda: self._db.create_tables([EvaluationRow], safe=True))
 
     @property
     def db_path(self) -> str:
diff --git a/eval_protocol/event_bus/sqlite_event_bus_database.py b/eval_protocol/event_bus/sqlite_event_bus_database.py
@@ -181,9 +181,10 @@ class Event(BaseModel):  # type: ignore
             processed = BooleanField(default=False)  # Track if event has been processed
 
         self._Event = Event
-        self._db.connect()
+        # Wrap connect() in retry logic since setting pragmas can fail with "database is locked"
+        execute_with_sqlite_retry(lambda: self._db.connect(reuse_if_open=True))
         # Use safe=True to avoid errors when tables already exist
-        self._db.create_tables([Event], safe=True)
+        execute_with_sqlite_retry(lambda: self._db.create_tables([Event], safe=True))
 
     def publish_event(self, event_type: str, data: Any, process_id: str) -> None:
         """Publish an event to the database."""
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -57,6 +57,7 @@
     log_eval_status_and_rows,
     parse_ep_completion_params,
     parse_ep_completion_params_overwrite,
+    parse_ep_max_concurrent_evaluations,
     parse_ep_max_concurrent_rollouts,
     parse_ep_max_rows,
     parse_ep_num_runs,
@@ -201,6 +202,7 @@ def evaluation_test(
     # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}').
     num_runs = parse_ep_num_runs(num_runs)
     max_concurrent_rollouts = parse_ep_max_concurrent_rollouts(max_concurrent_rollouts)
+    max_concurrent_evaluations = parse_ep_max_concurrent_evaluations(max_concurrent_evaluations)
     max_dataset_rows = parse_ep_max_rows(max_dataset_rows)
     completion_params = parse_ep_completion_params(completion_params)
     completion_params = parse_ep_completion_params_overwrite(completion_params)
diff --git a/eval_protocol/pytest/evaluation_test_utils.py b/eval_protocol/pytest/evaluation_test_utils.py
@@ -226,6 +226,15 @@ def parse_ep_max_concurrent_rollouts(default_value: int) -> int:
     return int(raw) if raw is not None else default_value
 
 
+def parse_ep_max_concurrent_evaluations(default_value: int) -> int:
+    """Return EP_MAX_CONCURRENT_EVALUATIONS as an int, or default_value when it is unset.
+
+    Assumes plugin.py already validated the variable; the value is passed to int() unchecked.
+    """
+    raw = os.getenv("EP_MAX_CONCURRENT_EVALUATIONS")
+    return int(raw) if raw is not None else default_value
+
+
 def parse_ep_completion_params(
     completion_params: Sequence[CompletionParams | None] | None,
 ) -> Sequence[CompletionParams | None]:
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
@@ -45,6 +45,12 @@ def pytest_addoption(parser) -> None:
         default=None,
         help=("Override the maximum number of concurrent rollouts. Pass an integer (e.g., 8, 50, 100)."),
     )
+    group.addoption(
+        "--ep-max-concurrent-evaluations",
+        action="store",
+        default=None,
+        help=("Override the maximum number of concurrent evaluations. Pass an integer (e.g., 8, 50, 100)."),
+    )
     group.addoption(
         "--ep-print-summary",
         action="store_true",
@@ -242,10 +248,15 @@ def pytest_configure(config) -> None:
     if norm_runs is not None:
         os.environ["EP_NUM_RUNS"] = norm_runs
 
-    max_concurrent_val = config.getoption("--ep-max-concurrent-rollouts")
-    norm_concurrent = _normalize_number(max_concurrent_val)
-    if norm_concurrent is not None:
-        os.environ["EP_MAX_CONCURRENT_ROLLOUTS"] = norm_concurrent
+    max_concurrent_rollouts_val = config.getoption("--ep-max-concurrent-rollouts")
+    norm_concurrent_rollouts = _normalize_number(max_concurrent_rollouts_val)
+    if norm_concurrent_rollouts is not None:
+        os.environ["EP_MAX_CONCURRENT_ROLLOUTS"] = norm_concurrent_rollouts
+
+    max_concurrent_evals_val = config.getoption("--ep-max-concurrent-evaluations")
+    norm_concurrent_evals = _normalize_number(max_concurrent_evals_val)
+    if norm_concurrent_evals is not None:
+        os.environ["EP_MAX_CONCURRENT_EVALUATIONS"] = norm_concurrent_evals
 
     if config.getoption("--ep-print-summary"):
         os.environ["EP_PRINT_SUMMARY"] = "1"