python-sdk/tests/pytest/test_pytest_ensure_logging.py at 58d9cf2ced22207ada280c0f2a91e0b5bd7a0268 · eval-protocol/python-sdk · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
from unittest.mock import Mock, patch


def test_ensure_logging(monkeypatch):
    """
    Check that an evaluation test run without an explicit logger writes its rows through the SQLite store.

    The ``SqliteEvaluationRowStore`` name inside the SQLite dataset logger adapter module is
    replaced with a factory returning a mock, a small pointwise evaluation is executed, and
    every recorded ``upsert_row`` call is then inspected for the expected payload shape.
    """
    store_target = "eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore"
    dataset_file = "tests/pytest/data/markdown_dataset.jsonl"
    base_params = {"temperature": 0.0, "model": "dummy/local-model"}

    # Stand-in store: records writes, reports no existing rows
    fake_store = Mock()
    fake_store.db_path = "/tmp/test.db"
    fake_store.read_rows = Mock(return_value=[])
    fake_store.upsert_row = Mock()

    # The patch must be active before the imports and the run below, so that any store the
    # adapter constructs in that window is our mock rather than a real one
    with patch(store_target, return_value=fake_store):
        from eval_protocol.models import EvaluationRow
        from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
        from eval_protocol.pytest.evaluation_test import evaluation_test
        from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row

        # No logger argument is given on purpose: the default logger is the thing under test
        @evaluation_test(
            input_dataset=[dataset_file],
            completion_params=[dict(base_params)],
            dataset_adapter=markdown_dataset_to_evaluation_row,
            rollout_processor=NoOpRolloutProcessor(),
            mode="pointwise",
            combine_datasets=False,
            num_runs=2,
        )
        def eval_fn(row: EvaluationRow) -> EvaluationRow:
            return row

        # Fresh list/dict objects are passed here so nothing is shared with the decorator arguments
        eval_fn(dataset_path=[dataset_file], completion_params=dict(base_params))

        # The store must have been written to at least once
        upsert = fake_store.upsert_row
        assert upsert.called, "SqliteEvaluationRowStore.upsert_row should have been called"

        call_count = upsert.call_count
        assert call_count > 0, f"Expected upsert_row to be called at least once, but it was called {call_count} times"

        # Each write must carry a dict payload (positional, or via the "data" keyword)
        # that includes a rollout_id under execution_metadata
        for recorded in upsert.call_args_list:
            positional, keyword = recorded
            payload = positional[0] if positional else keyword.get("data")
            assert payload is not None, "upsert_row should be called with data parameter"
            assert isinstance(payload, dict), "data should be a dictionary"
            assert "execution_metadata" in payload, "data should contain execution_metadata"
            assert "rollout_id" in payload["execution_metadata"], "data should contain rollout_id in execution_metadata"