Commit e4d0925

Improve answer extraction and logging in majority voting
Enhanced the extract_answer function to better handle LaTeX boxed answers and multiple-choice patterns. The majority voting summary is now emitted through the logger instead of being appended to the response, keeping the returned output clean.
1 parent 0016daa commit e4d0925

2 files changed: +91 -42 lines changed


optillm/plugins/majority_voting_plugin.py

Lines changed: 33 additions & 27 deletions
@@ -45,7 +45,14 @@ def extract_answer(text: str) -> Optional[str]:
     # Remove any trailing whitespace
     text = text.strip()
 
-    # Pattern 1: Look for "Answer:" or "Final Answer:" patterns
+    # Pattern 1: Look for LaTeX boxed format first (handle both \boxed and \\boxed)
+    boxed_match = re.search(r'\\{1,2}boxed\{([^}]+)\}', text)
+    if boxed_match:
+        answer = boxed_match.group(1).strip()
+        logger.debug(f"Extracted boxed answer: {answer}")
+        return answer
+
+    # Pattern 2: Look for "Answer:" or "Final Answer:" patterns
     answer_patterns = [
         r'(?:final\s+)?answer\s*[:=]\s*(.+?)(?:\n|$)',
         r'(?:the\s+)?(?:final\s+)?answer\s+is\s*[:=]?\s*(.+?)(?:\n|$)',
@@ -62,13 +69,6 @@ def extract_answer(text: str) -> Optional[str]:
             logger.debug(f"Extracted answer using pattern: {answer}")
             return answer
 
-    # Pattern 2: Look for LaTeX boxed format
-    boxed_match = re.search(r'\\boxed\{([^}]+)\}', text)
-    if boxed_match:
-        answer = boxed_match.group(1).strip()
-        logger.debug(f"Extracted boxed answer: {answer}")
-        return answer
-
     # Pattern 3: Look for standalone numbers (useful for math problems)
     # Check the last few lines for a number
     lines = text.split('\n')
@@ -80,20 +80,29 @@ def extract_answer(text: str) -> Optional[str]:
             logger.debug(f"Extracted number answer: {line}")
             return line
 
-    # Pattern 4: If the last line is short (< 50 chars), it might be the answer
+    # Pattern 4: For multiple choice, look for single letter answers
+    # Check this before the generic last line check
+    mc_patterns = [
+        r'(?:the\s+)?(?:correct\s+)?(?:answer|option)\s+is\s+([A-E])(?:\b|$)',
+        r'(?:choose|select|pick)\s+(?:option\s+)?([A-E])(?:\b|$)',
+        r'\b([A-E])\s*\)\s*[A-Za-z]+.*is\s+(?:the\s+)?(?:correct|right)',
+        r'^([A-E])$', # Just a letter on its own line
+    ]
+
+    for pattern in mc_patterns:
+        mc_match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+        if mc_match:
+            answer = mc_match.group(1).upper()
+            logger.debug(f"Extracted multiple choice answer: {answer}")
+            return answer
+
+    # Pattern 5: If the last line is short (< 50 chars), it might be the answer
     if lines:
         last_line = lines[-1].strip()
         if last_line and len(last_line) < 50 and not last_line.endswith(':'):
             logger.debug(f"Using last line as answer: {last_line}")
             return last_line
 
-    # Pattern 5: For multiple choice, look for single letter answers
-    mc_match = re.search(r'\b([A-E])\b(?:\s*\))?$', text)
-    if mc_match:
-        answer = mc_match.group(1)
-        logger.debug(f"Extracted multiple choice answer: {answer}")
-        return answer
-
     logger.warning("Could not extract a clear answer from the response")
     return None

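For reference, a minimal sketch (not part of the commit) of what the new extraction order picks up; the two regexes are copied verbatim from the hunks above, and everything else is standard library:

import re

BOXED = r'\\{1,2}boxed\{([^}]+)\}'
MC = r'(?:the\s+)?(?:correct\s+)?(?:answer|option)\s+is\s+([A-E])(?:\b|$)'

# Pattern 1 now fires before the generic "Answer:" patterns, so boxed answers win.
print(re.search(BOXED, r"Therefore the result is \boxed{42}.").group(1))          # -> 42
# The multiple-choice patterns are tried before the "short last line" fallback.
print(re.search(MC, "The correct answer is B.", re.IGNORECASE).group(1).upper())  # -> B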
@@ -240,21 +249,18 @@ def run(
         # Get the full response corresponding to the most common answer
         winning_response = answer_to_response.get(most_common_answer, candidates[0])
 
-        # Add voting summary to the response
-        voting_summary = f"\n\n**Majority Voting Result**:\n"
-        voting_summary += f"- Generated {k} candidates\n"
-        voting_summary += f"- Most common answer: {most_common_answer}\n"
-        voting_summary += f"- Votes: {count}/{len(answers)} ({confidence:.1%} confidence)\n"
+        # Log voting summary to console instead of adding to response
+        logger.info("Majority Voting Summary:")
+        logger.info(f" - Generated {k} candidates")
+        logger.info(f" - Most common answer: {most_common_answer}")
+        logger.info(f" - Votes: {count}/{len(answers)} ({confidence:.1%} confidence)")
 
         if len(answer_counts) > 1:
-            voting_summary += f"- Other answers: "
             other_answers = [f"{ans} ({cnt} votes)" for ans, cnt in answer_counts.items() if ans != most_common_answer]
-            voting_summary += ", ".join(other_answers)
-
-        # Return the full response from the winning answer with voting summary
-        final_response = winning_response + voting_summary
+            logger.info(f" - Other answers: {', '.join(other_answers)}")
 
-        return final_response, total_tokens
+        # Return only the full response from the winning answer
+        return winning_response, total_tokens
 
     except Exception as e:
         logger.error(f"Error in majority voting: {str(e)}")

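Because the summary now goes through logger.info rather than the returned string, callers only see it when logging is configured; a minimal example using only the standard library:

import logging

logging.basicConfig(level=logging.INFO)
# The plugin's "Majority Voting Summary:" lines now appear in the log output,
# while the value returned by run() contains only the winning candidate's text.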
scripts/eval_optillmbench.py

Lines changed: 58 additions & 15 deletions
@@ -21,19 +21,44 @@
 logger = logging.getLogger(__name__)
 
 # Define the approaches to test
-# Each approach is (name, description)
+# Each approach is (name, description, extra_body_params)
 APPROACHES = [
-    ("none", "Baseline without any optimization"),
-    ("leap", "LEAP Approach"),
-    ("rto", "Round Trip Optimization"),
-    ("cot_reflection", "Chain of Thought with Reflection"),
-    ("self_consistency", "Self Consistency Check"),
-    ("plansearch", "Planning with Search"),
-    ("re2", "ReRead Approach"),
-    ("z3", "Z3 Solver for Mathematical Problems"),
-    ("coc", "Chain of Code"),
-    ("executecode" , "Execute Code"),
-    ("spl", "System Prompt Learning")
+    ("none", "Baseline without any optimization", {}),
+    ("leap", "LEAP Approach", {}),
+    ("rto", "Round Trip Optimization", {}),
+    ("cot_reflection", "Chain of Thought with Reflection", {}),
+    ("self_consistency", "Self Consistency Check", {}),
+    ("plansearch", "Planning with Search", {}),
+    ("re2", "ReRead Approach", {}),
+    ("z3", "Z3 Solver for Mathematical Problems", {}),
+    ("coc", "Chain of Code", {}),
+    ("executecode" , "Execute Code", {}),
+    ("spl", "System Prompt Learning", {})
+]
+
+# Define test-time compute approaches for sequential and parallel scaling
+TEST_TIME_COMPUTE_APPROACHES = [
+    # Baseline
+    ("none", "Baseline without any optimization", {}),
+
+    # Sequential test-time compute using thinkdeeper with different thinking budgets
+    ("thinkdeeper_8k", "ThinkDeeper with 8K thinking tokens", {
+        "decoding": "thinkdeeper",
+        "max_thinking_tokens": 8000
+    }),
+    ("thinkdeeper_16k", "ThinkDeeper with 16K thinking tokens", {
+        "decoding": "thinkdeeper",
+        "max_thinking_tokens": 16000
+    }),
+    ("thinkdeeper_32k", "ThinkDeeper with 32K thinking tokens", {
+        "decoding": "thinkdeeper",
+        "max_thinking_tokens": 32000
+    }),
+
+    # Parallel test-time compute using majority voting with different k values
+    ("majority_voting_6", "Majority Voting with k=6", {"k": 6}),
+    ("majority_voting_36", "Majority Voting with k=36", {"k": 36}),
+    ("majority_voting_60", "Majority Voting with k=60", {"k": 60}),
 ]
 
 def load_optillm_bench() -> datasets.Dataset:
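Each entry now carries the extra request parameters its approach needs; a quick illustration of how one tuple from the list above unpacks (values copied from the diff):

name, description, extra_body_params = (
    "thinkdeeper_8k",
    "ThinkDeeper with 8K thinking tokens",
    {"decoding": "thinkdeeper", "max_thinking_tokens": 8000},
)
# extra_body_params is later merged into the request's extra_body in evaluate_model (see below)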
@@ -265,6 +290,7 @@ def evaluate_model(
     model: str,
     dataset: datasets.Dataset,
     approach: str,
+    approach_extra_body: Dict[str, Any] = None,
     max_samples: int = None
 ) -> Tuple[Dict[str, float], List[Dict[str, Any]]]:
     """
@@ -286,8 +312,18 @@ def evaluate_model(
     # Prepare the dataset
     examples = dataset if max_samples is None else dataset.select(range(max_samples))
 
-    # Create model name with approach
-    full_model_name = f"{approach}-{model}" if approach != "none" else model
+    # Create model name with approach - handle special cases
+    if approach == "none":
+        full_model_name = model
+    elif approach.startswith("thinkdeeper_"):
+        # For thinkdeeper, use base model name (decoding is passed in extra_body)
+        full_model_name = model
+    elif approach.startswith("majority_voting_"):
+        # For majority voting, use majority_voting prefix
+        full_model_name = f"majority_voting-{model}"
+    else:
+        # Standard approach prefix
+        full_model_name = f"{approach}-{model}"
 
     for example in tqdm(examples, desc=f"Evaluating {approach}"):
         try:
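A worked illustration of the naming logic above, using a placeholder base model name:

# approach              -> model string sent to the API
# "none"                -> "gpt-4o-mini"
# "thinkdeeper_8k"      -> "gpt-4o-mini"                   (decoding is selected via extra_body)
# "majority_voting_6"   -> "majority_voting-gpt-4o-mini"
# "leap"                -> "leap-gpt-4o-mini"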
@@ -297,6 +333,11 @@ def evaluate_model(
             # Record start time
             start_time = time.time()
 
+            # Prepare extra_body parameters
+            extra_body = {"spl_learning": False}
+            if approach_extra_body:
+                extra_body.update(approach_extra_body)
+
             # Make API call
             response = client.chat.completions.create(
                 model=full_model_name,
@@ -306,7 +347,7 @@ def evaluate_model(
                 ],
                 temperature=0.2,
                 max_tokens=4096,
-                extra_body= {"spl_learning": False},
+                extra_body=extra_body,
             )
 
             # Calculate time taken
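A minimal sketch of how the merged extra_body reaches the OpenAI-compatible endpoint; the base URL, API key, and model name below are placeholders rather than values from this script:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")  # assumed local optillm proxy
response = client.chat.completions.create(
    model="majority_voting-gpt-4o-mini",  # placeholder base model with the majority_voting prefix
    messages=[{"role": "user", "content": "What is 17 * 3?"}],
    temperature=0.2,
    max_tokens=4096,
    extra_body={"spl_learning": False, "k": 6},  # "k" is consumed by the majority_voting plugin
)
print(response.choices[0].message.content)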
@@ -469,6 +510,8 @@ def main():
                         help="Directory to save results")
     parser.add_argument("--approaches", nargs="+",
                         help="Specific approaches to evaluate (default: all)")
+    parser.add_argument("--test-time-compute", action="store_true",
+                        help="Evaluate test-time compute approaches (sequential and parallel scaling)")
     parser.add_argument("--debug", action="store_true", help="Enable debug logging")
     args = parser.parse_args()

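A possible invocation of the updated script; only the flags visible in this diff are shown, and any other required arguments (such as the model to evaluate) are omitted here:

python scripts/eval_optillmbench.py --test-time-compute --debug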