|
| 1 | +from eval_protocol.models import EvaluationRow, Message |
| 2 | +from eval_protocol.pytest import evaluation_test |
| 3 | +from eval_protocol.pytest.parameterize import DefaultParameterIdGenerator, pytest_parametrize |
| 4 | +from eval_protocol.pytest.generate_parameter_combinations import generate_parameter_combinations |
| 5 | + |
| 6 | + |
def test_parameterized_ids():
    """Check that ``evaluation_test`` attaches a parametrize mark carrying model-based IDs.

    The decorated function is never called directly by this test; the
    assertions only inspect the ``pytestmark`` attribute found on the object
    the decorator returns.
    """
    collected_ids = []

    @evaluation_test(
        input_messages=[[[Message(role="user", content="Hello, how are you?")]]],
        completion_params=[
            {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
            {"model": "gpt-4"},
            {"temperature": 0.5},  # No model - should not generate ID
        ],
    )
    def test_parameterized_ids(row: EvaluationRow) -> EvaluationRow:
        # Record the row ID so that an execution of the body would be observable.
        collected_ids.append(row.input_metadata.row_id)
        return row

    # The decorator must hand back something callable.
    assert test_parameterized_ids is not None
    assert callable(test_parameterized_ids)

    # The decorator is expected to leave pytest marks on the function.
    marks = getattr(test_parameterized_ids, "pytestmark", [])
    assert len(marks) > 0, "Function should have pytest marks from evaluation_test decorator"

    # Keep only the parametrize marks; objects without a ``name`` are ignored.
    parametrize_marks = [mark for mark in marks if getattr(mark, "name", None) == "parametrize"]
    assert len(parametrize_marks) > 0, "Should have parametrize mark"

    # The first parametrize mark must carry an ``ids`` keyword argument.
    parametrize_mark = parametrize_marks[0]
    assert hasattr(parametrize_mark, "kwargs"), "Parametrize mark should have kwargs"
    assert "ids" in parametrize_mark.kwargs, "Should have ids in kwargs"

    # ``ids`` may legitimately be None; the contents are checked only when present.
    ids = parametrize_mark.kwargs.get("ids")
    if ids is not None:
        # Should have IDs for models but not for temperature-only params.
        # NOTE(review): three completion-param sets but only two expected IDs --
        # pytest presumably rejects an ids list whose length differs from the
        # number of parameter sets at collection time; confirm this is intended.
        expected_ids = ["model-gpt-oss-120b", "model-gpt-4"]
        assert list(ids) == expected_ids, f"Expected {expected_ids}, got {list(ids)}"
| 49 | + |
| 50 | + |
def test_default_id_generator():
    """Exercise ``DefaultParameterIdGenerator.generate_id`` on several combinations.

    Covers a full provider model path, a bare model name, completion params
    without a ``model`` key, and ``None`` completion params.
    """
    id_generator = DefaultParameterIdGenerator()

    def make_combo(completion_params):
        # Only the second slot (completion params) varies; all other slots are None.
        return (None, completion_params, None, None, None)

    # Model name -> ID the generator is expected to produce for it.
    named_cases = [
        ("fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "model-gpt-oss-120b"),
        ("gpt-4", "model-gpt-4"),
    ]
    for model_name, expected_id in named_cases:
        assert id_generator.generate_id(make_combo({"model": model_name})) == expected_id

    # Without a model (or without completion params at all) no ID is expected.
    for completion_params in ({"temperature": 0.5}, None):
        assert id_generator.generate_id(make_combo(completion_params)) is None
| 74 | + |
| 75 | + |
def test_pytest_parametrize_with_custom_id_generator():
    """Check the mapping returned by ``pytest_parametrize`` for three completion-param sets.

    NOTE(review): despite the test name, no ID generator argument is passed in
    this call, so whatever generator ``pytest_parametrize`` uses by default is
    what gets exercised -- confirm whether a custom-generator case was intended.
    """
    param_sets = [{"model": "gpt-4"}, {"model": "claude-3"}, {"temperature": 0.5}]

    # Each combination is a 5-tuple whose second slot holds the completion
    # params; copies keep the combinations independent of ``param_sets``.
    combos = [(None, dict(params), None, None, None) for params in param_sets]

    parametrize_args = pytest_parametrize(
        combinations=combos,
        input_dataset=None,
        completion_params=param_sets,
        input_messages=None,
        input_rows=None,
        evaluation_test_kwargs=None,
    )

    assert parametrize_args["argnames"] == ["completion_params"]

    argvalue_count = len(list(parametrize_args["argvalues"]))
    assert argvalue_count == 3

    # Only the two model-bearing sets are expected to contribute an ID; the
    # temperature-only set has no entry in the expected list.
    assert parametrize_args["ids"] == ["model-gpt-4", "model-claude-3"]
| 99 | + |
| 100 | + |
def test_id_generator_max_length():
    """Check that a generated ID is shortened to fit the configured ``max_length``."""
    limit = 10
    truncating_generator = DefaultParameterIdGenerator(max_length=limit)

    # A model name far longer than the limit.
    long_params = {"model": "very-long-model-name-that-exceeds-max-length"}
    generated = truncating_generator.generate_id((None, long_params, None, None, None))

    # The expected ID ends in an ellipsis and is exactly ``limit`` characters long.
    assert generated == "model-v..."
    assert len(generated) <= limit
0 commit comments