vetting-python/test_real_api.py at main · hichipli/vetting-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python3
"""
Real API test script for VETTING framework.

This script tests the actual functionality with OpenAI API.
"""

import asyncio
import os
import sys
import traceback
from pathlib import Path

# Put the directory containing this script at the very front of the import
# search path, so it is consulted first by the imports that follow.
project_root = Path(__file__).parent
sys.path[:0] = [str(project_root)]

from vetting_python import VettingFramework, VettingConfig, ChatMessage, ModelConfig, OpenAIProvider
from vetting_python.core.models import ContextItem


async def test_chat_mode():
    """Exercise plain chat mode with one real API round-trip.

    Builds an ``OpenAIProvider`` and a ``VettingFramework``, sends a single
    user message with a ``mode="chat"`` configuration, and prints the reply,
    total cost, token usage (when present) and the ``requires_attention``
    flag of the response.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable.
    If that variable is unset or empty, the placeholder string that was
    previously hard-coded here is passed through unchanged.

    Returns:
        bool: True when the whole sequence ran without raising; False when
        any exception occurred (the message and traceback are printed
        instead of being propagated).
    """
    print("=" * 50)
    print("Testing Chat Mode")
    print("=" * 50)

    try:
        # Read the key from the environment so that no real credential has
        # to be written into this file in order to run it.
        api_key = os.environ.get("OPENAI_API_KEY") or "INSERT_OPENAI_API_KEY_HERE"
        provider = OpenAIProvider(api_key=api_key)
        framework = VettingFramework(chat_provider=provider)

        # Chat-only configuration: a single model, no verification model.
        config = VettingConfig(
            mode="chat",
            chat_model=ModelConfig(model_id="gpt-4o-mini", temperature=0.7, max_tokens=100)
        )

        # One simple user turn.
        messages = [ChatMessage("user", "Hello! Can you tell me what 2+2 equals?")]
        response = await framework.process(messages, config)

        print(f"✅ Chat Response: {response.content}")
        print(f"✅ Total Cost: ${response.total_cost:.6f}")
        if response.total_usage:
            print(f"✅ Token Usage: {response.total_usage.total_tokens} tokens")
        else:
            print("✅ Token Usage: No usage data available")
        print(f"✅ Requires Attention: {response.requires_attention}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report the failure and let the
        # caller continue with the remaining checks.
        print(f"❌ Chat Mode Test Failed: {e}")
        traceback.print_exc()
        return False


async def test_vetting_mode():
    """Exercise vetting mode with one real API round-trip.

    Builds a ``ContextItem`` describing a single arithmetic question and its
    answer key, configures ``mode="vetting"`` with separate chat and
    verification model settings (at most two attempts), sends one user
    message, and prints the response, verification result, attempt count,
    stop reason, cost, token usage and a per-attempt pass/fail summary.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable.
    If that variable is unset or empty, the placeholder string that was
    previously hard-coded here is passed through unchanged.

    Note:
        The verification outcome is only printed. A response whose
        ``verification_passed`` is false still makes this function return
        True, as long as no exception was raised.

    Returns:
        bool: True when the whole sequence ran without raising; False when
        any exception occurred (the message and traceback are printed
        instead of being propagated).
    """
    print("\n" + "=" * 50)
    print("Testing Vetting Mode")
    print("=" * 50)

    try:
        # Read the key from the environment so that no real credential has
        # to be written into this file in order to run it.
        api_key = os.environ.get("OPENAI_API_KEY") or "INSERT_OPENAI_API_KEY_HERE"
        provider = OpenAIProvider(api_key=api_key)
        framework = VettingFramework(chat_provider=provider)

        # Question plus answer key handed to the framework as context.
        context_item = ContextItem(
            question={
                "text": "What is 2 + 2?",
                "id": "math_basic_001",
                "subject": "Elementary Math"
            },
            answer_key={
                "correctAnswer": "4",
                "keyConcepts": ["addition", "arithmetic"],
                "explanation": "Adding 2 and 2 gives us 4."
            }
        )

        # Vetting configuration: chat model and verification model are
        # configured separately.
        config = VettingConfig(
            mode="vetting",
            chat_model=ModelConfig(model_id="gpt-4o-mini", temperature=0.7, max_tokens=150),
            verification_model=ModelConfig(model_id="gpt-4o-mini", temperature=0.1, max_tokens=100),
            context_items=[context_item],
            max_attempts=2,
            enable_educational_rules=True,
            enable_safety_prefix=True
        )

        # The user message asks the same question that the context item holds.
        messages = [ChatMessage("user", "I need help with my math homework. What is 2 + 2?")]
        response = await framework.process(messages, config)

        print(f"✅ Vetting Response: {response.content}")
        print(f"✅ Verification Passed: {response.verification_passed}")
        print(f"✅ Attempt Count: {response.attempt_count}")
        print(f"✅ Stop Reason: {response.stop_reason}")
        print(f"✅ Total Cost: ${response.total_cost:.6f}")
        if response.total_usage:
            print(f"✅ Total Token Usage: {response.total_usage.total_tokens} tokens")
        else:
            print("✅ Total Token Usage: No usage data available")

        if response.attempts:
            print("✅ Detailed Attempts:")
            for i, attempt in enumerate(response.attempts, 1):
                print(f"   Attempt {i}: Verification {'PASSED' if attempt.verification_passed else 'FAILED'}")
                # Show the verifier's output only for attempts that failed.
                if not attempt.verification_passed:
                    print(f"   Reason: {attempt.verification_output}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report the failure and let the
        # caller continue with the remaining checks.
        print(f"❌ Vetting Mode Test Failed: {e}")
        traceback.print_exc()
        return False


async def test_cost_calculation():
    """Print the provider's supported models and model aliases.

    Despite its name and banner, this check computes no cost figure: it
    constructs an ``OpenAIProvider`` and prints the first five entries of
    ``get_supported_models()`` followed by the result of
    ``get_model_aliases()``.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable.
    If that variable is unset or empty, the placeholder string that was
    previously hard-coded here is passed through unchanged.

    Returns:
        bool: True when both provider calls ran without raising; False when
        any exception occurred (the message and traceback are printed
        instead of being propagated).
    """
    print("\n" + "=" * 50)
    print("Testing Cost Calculation")
    print("=" * 50)

    try:
        # Read the key from the environment so that no real credential has
        # to be written into this file in order to run it.
        api_key = os.environ.get("OPENAI_API_KEY") or "INSERT_OPENAI_API_KEY_HERE"
        provider = OpenAIProvider(api_key=api_key)

        # Only a short prefix is shown to keep the output readable.
        supported_models = provider.get_supported_models()
        print(f"✅ Supported Models: {supported_models[:5]}...")  # Show first 5

        aliases = provider.get_model_aliases()
        print(f"✅ Model Aliases: {aliases}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report the failure and let the
        # caller continue with the remaining checks.
        print(f"❌ Cost Calculation Test Failed: {e}")
        traceback.print_exc()
        return False


async def main():
    """Run every check in sequence and print a pass/fail summary.

    Returns:
        bool: True only if each check returned a truthy result.
    """
    rule = "=" * 50

    for banner_line in (
        "VETTING Framework Real API Test",
        "Testing with OpenAI gpt-4o-mini model",
        "Date: 2025-07-31",
    ):
        print(banner_line)

    # The checks are awaited one after another, in this fixed order.
    outcomes = []
    for check in (test_chat_mode, test_vetting_mode, test_cost_calculation):
        outcomes.append(await check())

    print(f"\n{rule}")
    print("TEST SUMMARY")
    print(rule)

    total = len(outcomes)
    passed = sum(outcomes)
    everything_ok = passed == total

    print(f"Tests Passed: {passed}/{total}")
    print(
        "🎉 ALL TESTS PASSED! The VETTING framework is working correctly."
        if everything_ok
        else "⚠️  Some tests failed. Please check the output above."
    )

    return everything_ok


if __name__ == "__main__":
    # Script entry point: drive the async runner to completion and turn its
    # boolean result into a process exit status (0 = all passed, 1 = not).
    exit_code = 0 if asyncio.run(main()) else 1
    sys.exit(exit_code)