-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_extraction.py
More file actions
140 lines (118 loc) · 4.87 KB
/
debug_extraction.py
File metadata and controls
140 lines (118 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
"""
Debug the extraction issues step by step.
"""
import re
def clean_answer(answer):
    """Normalize a raw answer into a canonical string for comparison.

    The input is converted to ``str`` and passed through a fixed sequence of
    clean-up steps (answer prefixes, dollar signs and whitespace, ``\\boxed{}``
    wrappers, surrounding brackets, trailing punctuation, thousands commas).
    If the cleaned text contains a number, only the first number is returned:
    integer-valued numbers are rendered without a decimal part (``"3.0"`` ->
    ``"3"``), other values via ``str(float(...))``.  A minus sign directly in
    front of the digits is kept, so negative answers survive normalization.

    Every intermediate step is printed, because this is a debugging aid.

    Args:
        answer: The raw answer; ``None`` and blank input yield ``""``.  Any
            other object is converted with ``str()`` (so the integer ``0``
            is treated as the answer ``"0"``, not as "no answer").

    Returns:
        The normalized number as a string when one is found, otherwise the
        cleaned text (possibly empty).
    """
    if answer is None:
        return ""

    answer = str(answer).strip()
    if not answer:
        return ""
    print(f" clean_answer input: '{answer}'")

    # Remove common prefixes
    answer = re.sub(
        r'^(The answer is|Answer:|Final answer:?)\s*', '', answer,
        flags=re.IGNORECASE,
    )
    print(f" after prefix removal: '{answer}'")

    # Remove dollar signs and ALL whitespace (also inside the text), so
    # "$ 70 000" collapses to "70000".
    answer = re.sub(r'[\$\s]+', '', answer)
    print(f" after dollar removal: '{answer}'")

    # Remove boxed formatting (handle nested cases like $\boxed{$70,000}$).
    # The dollar signs were stripped above, so the content is brace-free here.
    answer = re.sub(r'\\boxed\{([^}]+)\}', r'\1', answer)
    print(f" after boxed removal: '{answer}'")

    # Remove brackets, parentheses at either end
    answer = re.sub(r'^[\[\](){}]+|[\[\](){}]+$', '', answer)
    print(f" after bracket removal: '{answer}'")

    # Remove trailing punctuation (periods, commas, etc.)
    answer = re.sub(r'[.,;:!?]+$', '', answer)
    print(f" after punctuation removal: '{answer}'")

    # Remove commas from numbers (e.g., "70,000" -> "70000")
    answer = answer.replace(',', '')
    print(f" after comma removal: '{answer}'")

    # Try to extract just the (first) number, keeping a leading minus sign.
    number_match = re.search(r'-?[0-9]+(?:\.[0-9]+)?', answer)
    if number_match:
        num_str = number_match.group(0)
        print(f" extracted number: '{num_str}'")

        whole, _, fraction = num_str.partition('.')
        try:
            if not fraction.strip('0'):
                # Integer-valued ("70000", "3.0"): stay in integer arithmetic
                # so large values are not rounded by a float round-trip.
                result = str(int(whole))
            else:
                num = float(num_str)
                # int() raises OverflowError if the float overflowed to inf.
                result = str(int(num)) if num == int(num) else str(num)
        except (ValueError, OverflowError) as exc:
            # Not representable; fall back to the matched text unchanged.
            print(f" {type(exc).__name__}, returning: '{num_str}'")
            return num_str

        print(f" final result: '{result}'")
        return result

    print(f" no number found, returning: '{answer.strip()}'")
    return answer.strip()
def extract_answer_improved(text):
    """Extract a final numeric answer from free-form solution text.

    Two strategies are tried in order, printing a debug trace along the way:

    1. Search each of the last five lines for an explicit phrase such as
       "The final answer is ..." / "Answer: ..." (case-insensitive).
    2. Search every line for ``\\boxed{...}``, ``$$...$$`` or ``$...$``.

    Each candidate is normalized with ``clean_answer`` and accepted only if
    ``is_valid_answer`` approves it; the first accepted candidate wins.

    Args:
        text: The full text to search.

    Returns:
        The cleaned answer string, or ``""`` when the text is empty/blank or
        no candidate passes validation.
    """
    if not text or not text.strip():
        return ""

    print(f"Extracting from text: {text[:100]}...")

    # Split into lines for analysis
    lines = text.strip().split('\n')
    print(f"Split into {len(lines)} lines")

    # Strategy 1: Look for explicit answer patterns in the last few lines
    explicit_patterns = [
        r"(?:The )?final answer is:?\s*([^\n]+)",
        r"(?:The )?answer is:?\s*([^\n]+)",
        r"Answer:?\s*([^\n]+)",
        r"Therefore,?\s*(?:the )?answer is:?\s*([^\n]+)",
        r"Thus,?\s*(?:the )?answer is:?\s*([^\n]+)",
        r"Hence,?\s*(?:the )?answer is:?\s*([^\n]+)",
        r"So,?\s*(?:the )?answer is:?\s*([^\n]+)",
    ]
    print("Strategy 1: Looking for explicit answer patterns in last 5 lines")
    # Clamp the start index so the reported line numbers stay correct (and
    # non-negative) when the text has fewer than five lines.
    tail_start = max(0, len(lines) - 5)
    for line_no, line in enumerate(lines[tail_start:], start=tail_start):
        print(f" Line {line_no}: '{line}'")
        for pattern in explicit_patterns:
            match = re.search(pattern, line, re.IGNORECASE)
            if not match:
                continue
            answer = match.group(1).strip()
            print(f" Found match with pattern '{pattern}': '{answer}'")
            cleaned = clean_answer(answer)
            if cleaned and is_valid_answer(cleaned):
                print(f" Valid answer: '{cleaned}'")
                return cleaned
            # Keep trying the remaining patterns on this line.
            print(f" Invalid answer: '{cleaned}'")

    # Strategy 2: Look for boxed answers (check all lines, not just last few)
    boxed_patterns = [
        r"\\boxed\{([^}]+)\}",
        r"\$\$([^$]+)\$\$",
        r"\$([^$]+)\$",
    ]
    print("Strategy 2: Looking for boxed answers in all lines")
    for i, line in enumerate(lines):
        print(f" Line {i}: '{line}'")
        for pattern in boxed_patterns:
            match = re.search(pattern, line)
            if not match:
                continue
            answer = match.group(1).strip()
            print(f" Found boxed match with pattern '{pattern}': '{answer}'")
            cleaned = clean_answer(answer)
            if cleaned and is_valid_answer(cleaned):
                print(f" Valid boxed answer: '{cleaned}'")
                return cleaned
            print(f" Invalid boxed answer: '{cleaned}'")

    print("No valid answer found")
    return ""
def is_valid_answer(answer, min_value=-100000, max_value=1000000):
    """Check whether an answer parses as a number within an accepted range.

    Args:
        answer: Candidate answer, normally a string.  Non-string values are
            converted with ``str()`` first, so numeric input such as ``0`` or
            ``42`` is judged by value instead of being rejected or crashing.
        min_value: Smallest accepted value (inclusive).
        max_value: Largest accepted value (inclusive).  The default range was
            widened to handle larger numbers like 70,000.

    Returns:
        ``True`` if the answer converts to a float inside
        ``[min_value, max_value]``; ``False`` for ``None``, blank text,
        non-numeric text, NaN, or out-of-range values.
    """
    if answer is None:
        return False

    text = str(answer).strip()
    if not text:
        return False

    try:
        num = float(text)
    except ValueError:
        return False

    # NaN compares False against both bounds and is therefore rejected.
    return min_value <= num <= max_value
# Reproduce the problematic case: a dollar amount wrapped in \boxed{} that is
# itself wrapped in inline-math dollar signs.
test_text = "The final answer is: $\\boxed{$70,000}$"
print("Testing problematic case:", f"Input: {test_text}", sep="\n")

# Run the extraction (which prints its own step-by-step trace), then show
# the value it settled on.
result = extract_answer_improved(test_text)
print(f"Result: '{result}'")