tactic-annot/theorem_validator.py at main · padieul/tactic-annot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
"""Theorem validator - orchestrates validation of theorems using multiple strategies."""

from typing import Dict, List, Set
from context.theorem_analyzer import TheoremAnalyzer
from context.context_builder import ContextBuilder
from validators.kimina_validator import KiminaValidator
from strategies.aesop_naive_strategy import AesopNaiveStrategy
from strategies.aesop_llm_strategy import AesopLLMStrategy


class TheoremValidator:
    """Orchestrates validation of theorems using original proof, naive aesop, and LLM strategies."""

    def __init__(
        self,
        validator: KiminaValidator,
        naive_strategy: AesopNaiveStrategy,
        llm_strategy: AesopLLMStrategy,
        cleaned_content: str
    ):
        """Initialize the validator.

        Args:
            validator: KiminaValidator instance
            naive_strategy: AesopNaiveStrategy instance
            llm_strategy: AesopLLMStrategy instance
            cleaned_content: Full cleaned Lean file content
        """
        self.validator = validator
        self.naive_strategy = naive_strategy
        self.llm_strategy = llm_strategy
        self.cleaned_content = cleaned_content
        self.analyzer = TheoremAnalyzer()
        self.context_builder = ContextBuilder()

    def validate_theorems(
        self,
        theorems: List[Dict],
        skip_list: Set[str],
        num_to_prove: int = None
    ) -> Dict:
        """Validate a list of theorems.

        Args:
            theorems: List of theorem dictionaries
            skip_list: Set of theorem names to skip
            num_to_prove: Number of theorems to validate (None = all)

        Returns:
            Statistics dictionary with results
        """
        # Initialize statistics
        stats = self._init_stats()

        # Determine how many to prove
        if num_to_prove is None:
            num_to_prove = len(theorems)

        theorems_to_prove = theorems[:num_to_prove]

        # Process theorems with simple output
        total = len(theorems_to_prove)
        for idx, theorem in enumerate(theorems_to_prove):
            result = self._validate_single_theorem(theorem, idx, skip_list, stats)

            # Print simple one-line status
            status = "✓ SOLVED" if result else "✗ NOT SOLVED"
            print(f"[{idx+1}/{total}] {theorem['name']}: {status}")

        return stats

    def _validate_single_theorem(
        self,
        theorem: Dict,
        idx: int,
        skip_list: Set[str],
        stats: Dict
    ) -> bool:
        """Validate a single theorem through all strategies.

        Args:
            theorem: Theorem dictionary
            idx: Index in the list
            skip_list: Set of theorem names to skip
            stats: Statistics dictionary to update

        Returns:
            True if successful (original or aesop), False otherwise
        """
        theorem_name = theorem['name']
        line_number = theorem.get('line_number', 0)
        has_simp = self.analyzer.has_simp_annotation(theorem['full_text'])

        # Track @[simp] annotations
        if has_simp:
            stats['simp_total'] += 1

        # Skip if in skip list
        if theorem_name in skip_list:
            stats['skipped_theorems'] += 1
            return True

        # Detect context
        context_key = self.context_builder.detect_context(theorem, line_number)
        imports = self.context_builder.get_imports(context_key)

        # Step 1: Validate with original proof
        original_valid = self._validate_original(theorem, imports, stats)

        if not original_valid:
            return False

        # Step 2: Try naive aesop variants
        stats['total_attempted'] += 1

        naive_result = self.naive_strategy.try_prove(theorem, imports, stats)

        if naive_result:
            # Success with naive aesop!
            stats['aesop_solved_theorems'].append(naive_result)
            stats['all_aesop_successes'].append({
                **naive_result,
                'context': context_key
            })
            return True

        # Naive aesop failed
        stats['aesop_failed'] += 1

        # Get error details
        error_details, failure_class = self.naive_strategy.get_failure_info(theorem, imports)

        # Track @[simp] failures
        if has_simp:
            stats['simp_aesop_failed'] += 1
            stats['simp_not_solved_by_aesop'].append({
                'name': theorem_name,
                'line': line_number,
                'reason': 'all_variants_failed'
            })

        # Step 3: Try LLM-assisted proof
        llm_result = self.llm_strategy.try_prove(
            theorem,
            imports,
            self.cleaned_content,
            stats
        )

        if llm_result and not llm_result.get('is_failure', False):
            # Success with LLM!
            stats['llm_solved_theorems'].append(llm_result)
            stats['all_aesop_successes'].append({
                **llm_result,
                'context': context_key
            })
            return True

        # Complete failure - add to failures list
        if llm_result and llm_result.get('is_failure', False):
            failure_entry = {
                **llm_result,
                'context': context_key
            }
            # Add original aesop error context
            if 'error_details' in failure_entry:
                failure_entry['error_details'] = f"Original aesop error:\n{error_details.strip()}\n\n{failure_entry['error_details']}"
            else:
                failure_entry['error_details'] = f"Original aesop error:\n{error_details.strip()}"

            stats['aesop_failed_theorems'].append(failure_entry)

            # Track in classifications
            class_key = failure_entry['failure_class'].split(':')[0]
            if class_key in stats['failure_classifications']:
                stats['failure_classifications'][class_key].append(failure_entry)
            else:
                stats['failure_classifications']['unknown'].append(failure_entry)

        return False

    def _validate_original(
        self,
        theorem: Dict,
        imports: str,
        stats: Dict
    ) -> bool:
        """Validate theorem with original proof.

        Args:
            theorem: Theorem dictionary
            imports: Import context
            stats: Statistics to update

        Returns:
            True if original proof is valid
        """
        full_code = imports + theorem['full_text_renamed']
        full_code = self.validator.apply_code_fixes(full_code)

        is_valid, error_msg, time_taken = self.validator.validate_proof(full_code)

        return is_valid

    def _init_stats(self) -> Dict:
        """Initialize statistics dictionary.

        Returns:
            Empty stats dictionary
        """
        return {
            'skipped_theorems': 0,
            'total_attempted': 0,
            'aesop_solved': 0,
            'aesop_failed': 0,
            'llm_attempted': 0,
            'llm_solved': 0,
            'llm_failed': 0,
            'simp_total': 0,
            'simp_aesop_solved': 0,
            'simp_aesop_failed': 0,
            'non_simp_aesop_solved': 0,
            'aesop_solved_theorems': [],
            'llm_solved_theorems': [],
            'all_aesop_successes': [],
            'aesop_failed_theorems': [],
            'simp_not_solved_by_aesop': [],
            'failure_classifications': {
                'syntax_error': [],
                'no_progress': [],
                'partial_progress': [],
                'fundamental': [],
                'unknown': []
            }
        }