Skip to content

Commit dbab765

Browse files
author
Dylan Huang
committed
try
1 parent ae7a2d3 commit dbab765

File tree

4 files changed

+156
-56
lines changed

4 files changed

+156
-56
lines changed

eval_protocol/dataset_logger/tinydb_evaluation_row_store.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1+
import json
2+
import logging
13
import os
4+
import time
25
from typing import List, Optional
36

47
from tinydb import Query, TinyDB
58
from tinyrecord.transaction import transaction
69

710
from eval_protocol.dataset_logger.evaluation_row_store import EvaluationRowStore
811

12+
logger = logging.getLogger(__name__)
13+
914

1015
class TinyDBEvaluationRowStore(EvaluationRowStore):
1116
"""
@@ -24,9 +29,30 @@ def __init__(self, db_path: str):
2429
if db_dir:
2530
os.makedirs(db_dir, exist_ok=True)
2631
self._db_path = db_path
27-
self._db = TinyDB(db_path)
32+
self._db = self._open_db_with_retry()
2833
self._table = self._db.table("evaluation_rows")
2934

35+
def _open_db_with_retry(self, max_retries: int = 3) -> TinyDB:
36+
"""Open TinyDB with retry logic to handle transient JSON decode errors."""
37+
last_error: Exception | None = None
38+
for attempt in range(max_retries):
39+
try:
40+
return TinyDB(self._db_path)
41+
except json.JSONDecodeError as e:
42+
last_error = e
43+
logger.warning(f"TinyDB JSON decode error on attempt {attempt + 1}: {e}")
44+
# Wait a bit and retry - the file might be mid-write
45+
time.sleep(0.1 * (attempt + 1))
46+
# Try to recover by removing the corrupted file
47+
if attempt == max_retries - 1 and os.path.exists(self._db_path):
48+
try:
49+
logger.warning(f"Removing corrupted TinyDB file: {self._db_path}")
50+
os.remove(self._db_path)
51+
return TinyDB(self._db_path)
52+
except Exception:
53+
pass
54+
raise last_error if last_error else RuntimeError("Failed to open TinyDB")
55+
3056
@property
3157
def db_path(self) -> str:
3258
return self._db_path
@@ -54,12 +80,25 @@ def upsert_row(self, data: dict) -> None:
5480
tr.insert(data)
5581

5682
def read_rows(self, rollout_id: Optional[str] = None) -> List[dict]:
57-
# Clear cache to ensure fresh read in multi-process scenarios
58-
self._table.clear_cache()
59-
if rollout_id is not None:
60-
Row = Query()
61-
return list(self._table.search(Row.execution_metadata.rollout_id == rollout_id))
62-
return list(self._table.all())
83+
"""Read rows with retry logic for transient JSON decode errors."""
84+
max_retries = 3
85+
for attempt in range(max_retries):
86+
try:
87+
# Clear cache to ensure fresh read in multi-process scenarios
88+
self._table.clear_cache()
89+
if rollout_id is not None:
90+
Row = Query()
91+
return list(self._table.search(Row.execution_metadata.rollout_id == rollout_id))
92+
return list(self._table.all())
93+
except json.JSONDecodeError as e:
94+
logger.warning(f"TinyDB JSON decode error on read attempt {attempt + 1}: {e}")
95+
if attempt < max_retries - 1:
96+
time.sleep(0.1 * (attempt + 1))
97+
else:
98+
# Return empty list on final failure rather than crash
99+
logger.warning("Failed to read TinyDB after retries, returning empty list")
100+
return []
101+
return []
63102

64103
def delete_row(self, rollout_id: str) -> int:
65104
Row = Query()

eval_protocol/event_bus/tinydb_event_bus_database.py

Lines changed: 63 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import os
23
import time
34
from typing import Any, List
@@ -27,9 +28,30 @@ def __init__(self, db_path: str):
2728
if db_dir:
2829
os.makedirs(db_dir, exist_ok=True)
2930
self._db_path = db_path
30-
self._db = TinyDB(db_path)
31+
self._db = self._open_db_with_retry()
3132
self._table = self._db.table("events")
3233

34+
def _open_db_with_retry(self, max_retries: int = 3) -> TinyDB:
35+
"""Open TinyDB with retry logic to handle transient JSON decode errors."""
36+
last_error: Exception | None = None
37+
for attempt in range(max_retries):
38+
try:
39+
return TinyDB(self._db_path)
40+
except json.JSONDecodeError as e:
41+
last_error = e
42+
logger.warning(f"TinyDB JSON decode error on attempt {attempt + 1}: {e}")
43+
# Wait a bit and retry - the file might be mid-write
44+
time.sleep(0.1 * (attempt + 1))
45+
# Try to recover by removing the corrupted file
46+
if attempt == max_retries - 1 and os.path.exists(self._db_path):
47+
try:
48+
logger.warning(f"Removing corrupted TinyDB file: {self._db_path}")
49+
os.remove(self._db_path)
50+
return TinyDB(self._db_path)
51+
except Exception:
52+
pass
53+
raise last_error if last_error else RuntimeError("Failed to open TinyDB")
54+
3355
def publish_event(self, event_type: str, data: Any, process_id: str) -> None:
3456
"""Publish an event to the database using atomic transaction."""
3557
try:
@@ -55,38 +77,48 @@ def publish_event(self, event_type: str, data: Any, process_id: str) -> None:
5577
logger.warning(f"Failed to publish event to database: {e}")
5678

5779
def get_unprocessed_events(self, process_id: str) -> List[dict]:
58-
"""Get unprocessed events from other processes."""
59-
try:
60-
# Clear query cache to force fresh read from disk
61-
# TinyDB caches query results, so we need to clear cache to see
62-
# events written by other processes. The search() method will
63-
# automatically call _read_table() on a cache miss.
64-
self._table.clear_cache()
65-
66-
Event = Query()
67-
results = self._table.search((Event.process_id != process_id) & (Event.processed == False)) # noqa: E712
68-
69-
logger.debug(
70-
f"TinyDBEventBusDatabase: Found {len(results)} unprocessed events for process_id: {process_id} in database: {self._db_path}"
71-
)
72-
73-
events = []
74-
# Sort by timestamp
75-
for event in sorted(results, key=lambda x: x.get("timestamp", 0)):
76-
events.append(
77-
{
78-
"event_id": event["event_id"],
79-
"event_type": event["event_type"],
80-
"data": event["data"],
81-
"timestamp": event["timestamp"],
82-
"process_id": event["process_id"],
83-
}
80+
"""Get unprocessed events from other processes with retry logic."""
81+
max_retries = 3
82+
for attempt in range(max_retries):
83+
try:
84+
# Clear query cache to force fresh read from disk
85+
# TinyDB caches query results, so we need to clear cache to see
86+
# events written by other processes. The search() method will
87+
# automatically call _read_table() on a cache miss.
88+
self._table.clear_cache()
89+
90+
Event = Query()
91+
results = self._table.search((Event.process_id != process_id) & (Event.processed == False)) # noqa: E712
92+
93+
logger.debug(
94+
f"TinyDBEventBusDatabase: Found {len(results)} unprocessed events for process_id: {process_id} in database: {self._db_path}"
8495
)
8596

86-
return events
87-
except Exception as e:
88-
logger.warning(f"Failed to get unprocessed events: {e}")
89-
return []
97+
events = []
98+
# Sort by timestamp
99+
for event in sorted(results, key=lambda x: x.get("timestamp", 0)):
100+
events.append(
101+
{
102+
"event_id": event["event_id"],
103+
"event_type": event["event_type"],
104+
"data": event["data"],
105+
"timestamp": event["timestamp"],
106+
"process_id": event["process_id"],
107+
}
108+
)
109+
110+
return events
111+
except json.JSONDecodeError as e:
112+
logger.warning(f"TinyDB JSON decode error on get_unprocessed_events attempt {attempt + 1}: {e}")
113+
if attempt < max_retries - 1:
114+
time.sleep(0.1 * (attempt + 1))
115+
else:
116+
logger.warning("Failed to read events after retries, returning empty list")
117+
return []
118+
except Exception as e:
119+
logger.warning(f"Failed to get unprocessed events: {e}")
120+
return []
121+
return []
90122

91123
def mark_event_processed(self, event_id: str) -> None:
92124
"""Mark an event as processed using atomic transaction."""

tests/conftest.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
22
import sys
3-
import tempfile
43
from pathlib import Path
54

65
import pytest
@@ -26,23 +25,26 @@
2625
# concurrent test workers from corrupting the shared logs.json file.
2726
# This is especially important in CI where pytest-xdist runs tests in parallel.
2827

28+
# Store the original function before any patching
29+
import eval_protocol.directory_utils as dir_utils
30+
31+
_original_find_eval_protocol_dir = dir_utils.find_eval_protocol_dir
32+
2933

3034
@pytest.fixture(scope="session", autouse=True)
31-
def isolated_eval_protocol_dir(tmp_path_factory):
35+
def isolated_eval_protocol_dir(tmp_path_factory, request):
3236
"""
3337
Create an isolated .eval_protocol directory for the test session.
3438
3539
This prevents concurrent test workers from corrupting the shared
3640
~/.eval_protocol/logs.json file when using TinyDB storage.
41+
42+
Note: Tests in test_directory_utils.py are excluded from this fixture
43+
as they need to test the actual find_eval_protocol_dir behavior.
3744
"""
3845
# Create a unique temp directory for this test session/worker
3946
isolated_dir = tmp_path_factory.mktemp("eval_protocol")
4047

41-
# Monkeypatch the find_eval_protocol_dir function to return our isolated dir
42-
import eval_protocol.directory_utils as dir_utils
43-
44-
original_find_eval_protocol_dir = dir_utils.find_eval_protocol_dir
45-
4648
def isolated_find_eval_protocol_dir() -> str:
4749
os.makedirs(str(isolated_dir), exist_ok=True)
4850
return str(isolated_dir)
@@ -52,4 +54,18 @@ def isolated_find_eval_protocol_dir() -> str:
5254
yield isolated_dir
5355

5456
# Restore original function after tests
55-
dir_utils.find_eval_protocol_dir = original_find_eval_protocol_dir
57+
dir_utils.find_eval_protocol_dir = _original_find_eval_protocol_dir
58+
59+
60+
@pytest.fixture
61+
def restore_original_find_eval_protocol_dir():
62+
"""
63+
Fixture to restore the original find_eval_protocol_dir for tests that
64+
need to test the actual implementation (e.g., test_directory_utils.py).
65+
66+
Use this fixture in tests that need to test the real directory behavior.
67+
"""
68+
# Temporarily restore the original function
69+
dir_utils.find_eval_protocol_dir = _original_find_eval_protocol_dir
70+
yield _original_find_eval_protocol_dir
71+
# The session fixture will clean up when tests complete

tests/test_directory_utils.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,22 @@
11
import os
22
import tempfile
33
from unittest.mock import patch
4+
45
import pytest
56

6-
from eval_protocol.directory_utils import find_eval_protocol_dir, find_eval_protocol_datasets_dir
7+
import eval_protocol.directory_utils as dir_utils
8+
9+
10+
@pytest.fixture(autouse=True)
def use_real_directory_utils(restore_original_find_eval_protocol_dir):
    """
    Ensure every test in this module sees the real find_eval_protocol_dir.

    The session-scoped isolated_eval_protocol_dir fixture replaces
    find_eval_protocol_dir globally; by depending on
    restore_original_find_eval_protocol_dir, this autouse fixture puts the
    genuine implementation back in place for the duration of each test here,
    which is required because these tests exercise the actual directory
    behavior.
    """
    yield None
720

821

922
class TestDirectoryUtils:
@@ -13,7 +26,7 @@ def test_find_eval_protocol_dir_uses_home_folder(self):
1326
"""Test that find_eval_protocol_dir always maps to home folder."""
1427
with tempfile.TemporaryDirectory() as temp_dir:
1528
with patch.dict(os.environ, {"HOME": temp_dir}):
16-
result = find_eval_protocol_dir()
29+
result = dir_utils.find_eval_protocol_dir()
1730
expected = os.path.expanduser("~/.eval_protocol")
1831
assert result == expected
1932
assert result.endswith(".eval_protocol")
@@ -29,7 +42,7 @@ def test_find_eval_protocol_dir_creates_directory(self):
2942
os.rmdir(eval_protocol_dir)
3043

3144
# Call the function
32-
result = find_eval_protocol_dir()
45+
result = dir_utils.find_eval_protocol_dir()
3346

3447
# Verify the directory was created
3548
assert result == eval_protocol_dir
@@ -40,7 +53,7 @@ def test_find_eval_protocol_dir_handles_tilde_expansion(self):
4053
"""Test that find_eval_protocol_dir properly handles tilde expansion."""
4154
with tempfile.TemporaryDirectory() as temp_dir:
4255
with patch.dict(os.environ, {"HOME": temp_dir}):
43-
result = find_eval_protocol_dir()
56+
result = dir_utils.find_eval_protocol_dir()
4457
expected = os.path.expanduser("~/.eval_protocol")
4558
assert result == expected
4659
assert result.startswith(temp_dir)
@@ -49,7 +62,7 @@ def test_find_eval_protocol_datasets_dir_uses_home_folder(self):
4962
"""Test that find_eval_protocol_datasets_dir also uses home folder."""
5063
with tempfile.TemporaryDirectory() as temp_dir:
5164
with patch.dict(os.environ, {"HOME": temp_dir}):
52-
result = find_eval_protocol_datasets_dir()
65+
result = dir_utils.find_eval_protocol_datasets_dir()
5366
expected = os.path.expanduser("~/.eval_protocol/datasets")
5467
assert result == expected
5568
assert result.endswith(".eval_protocol/datasets")
@@ -69,7 +82,7 @@ def test_find_eval_protocol_datasets_dir_creates_directory(self):
6982
os.rmdir(eval_protocol_dir)
7083

7184
# Call the function
72-
result = find_eval_protocol_datasets_dir()
85+
result = dir_utils.find_eval_protocol_datasets_dir()
7386

7487
# Verify both directories were created
7588
assert result == datasets_dir
@@ -82,14 +95,14 @@ def test_find_eval_protocol_dir_consistency(self):
8295
"""Test that multiple calls to find_eval_protocol_dir return the same path."""
8396
with tempfile.TemporaryDirectory() as temp_dir:
8497
with patch.dict(os.environ, {"HOME": temp_dir}):
85-
result1 = find_eval_protocol_dir()
86-
result2 = find_eval_protocol_dir()
98+
result1 = dir_utils.find_eval_protocol_dir()
99+
result2 = dir_utils.find_eval_protocol_dir()
87100
assert result1 == result2
88101

89102
def test_find_eval_protocol_datasets_dir_consistency(self):
90103
"""Test that multiple calls to find_eval_protocol_datasets_dir return the same path."""
91104
with tempfile.TemporaryDirectory() as temp_dir:
92105
with patch.dict(os.environ, {"HOME": temp_dir}):
93-
result1 = find_eval_protocol_datasets_dir()
94-
result2 = find_eval_protocol_datasets_dir()
106+
result1 = dir_utils.find_eval_protocol_datasets_dir()
107+
result2 = dir_utils.find_eval_protocol_datasets_dir()
95108
assert result1 == result2

0 commit comments

Comments
 (0)