Fix edge cases in SPL strategy selection: filter by minimum success rate and drop cross-type fallback

codelion · codelion · commit 7e2ebf9649b4 · 2025-05-16T09:57:13.000+08:00
diff --git a/optillm/plugins/spl/config.py b/optillm/plugins/spl/config.py
@@ -20,9 +20,10 @@
 # How often to perform maintenance operations (merge, prune)
 MAINTENANCE_INTERVAL = 40
 
-# Similarity thresholds
+# Strategy selection thresholds
 STRATEGY_CREATION_THRESHOLD = 0.7  # Higher threshold to avoid creating similar strategies
 STRATEGY_MERGING_THRESHOLD = 0.6   # Lower threshold to merge more similar strategies
+MIN_SUCCESS_RATE_FOR_INFERENCE = 0.4  # Minimum success rate for a strategy to be used during inference
 
 # Limits for strategy management
 MAX_STRATEGIES_PER_TYPE = 10  # Maximum strategies to store in DB per problem type
diff --git a/optillm/plugins/spl/evaluation.py b/optillm/plugins/spl/evaluation.py
@@ -14,7 +14,8 @@
 )
 from optillm.plugins.spl.config import (
     DEFAULT_MAX_TOKENS,
-    MAX_STRATEGIES_FOR_INFERENCE
+    MAX_STRATEGIES_FOR_INFERENCE,
+    MIN_SUCCESS_RATE_FOR_INFERENCE
 )
 
 # Setup logging
@@ -25,23 +26,40 @@ def select_relevant_strategies(query: str, problem_type: str, db: Any, max_strat
     Select the most relevant strategies for a given problem to be used during inference.
     This controls how many strategies are included in the system prompt augmentation.
     
+    Only selects strategies of the matching problem type whose success rate is >= MIN_SUCCESS_RATE_FOR_INFERENCE; strategies with fewer than 5 total attempts are kept regardless of success rate.
+    
     Args:
         query: The problem/query text
         problem_type: The type of problem
         db: Strategy database
         max_strategies: Maximum number of strategies to return
     
     Returns:
-        List[Strategy]: The selected strategies
+        List[Strategy]: The selected strategies (may be empty if none meet criteria)
     """
     # First, get strategies specifically for this problem type
     type_specific = db.get_strategies_for_problem(problem_type)
+    logger.info(f"Found {len(type_specific)} strategies for problem type '{problem_type}'")
+    
+    # Filter strategies by minimum success rate; strategies with fewer than 5 attempts pass regardless
+    qualified_strategies = []
+    for strategy in type_specific:
+        if strategy.success_rate >= MIN_SUCCESS_RATE_FOR_INFERENCE or strategy.total_attempts < 5:
+            qualified_strategies.append(strategy)
+        else:
+            logger.info(f"Strategy {strategy.strategy_id} skipped - success rate {strategy.success_rate:.2f} < minimum {MIN_SUCCESS_RATE_FOR_INFERENCE:.2f}")
+    
+    if not qualified_strategies:
+        logger.info(f"No strategies meet the minimum success rate threshold ({MIN_SUCCESS_RATE_FOR_INFERENCE:.2f}) for problem type '{problem_type}'")
+        return []
     
-    # If we have more type-specific strategies than needed, sort and select the best ones
-    if len(type_specific) > max_strategies:
+    logger.info(f"Found {len(qualified_strategies)} strategies that meet minimum success rate requirement")
+    
+    # If we have more qualified strategies than needed, sort and select the best ones
+    if len(qualified_strategies) > max_strategies:
         # Score each strategy based on success rate and recency
         scored_strategies = []
-        for strategy in type_specific:
+        for strategy in qualified_strategies:
             recency_score = 0
             if strategy.last_used:
                 # Calculate days since last use
@@ -55,40 +73,19 @@ def select_relevant_strategies(query: str, problem_type: str, db: Any, max_strat
         
         # Sort by score (descending) and take top strategies
         scored_strategies.sort(key=lambda x: x[1], reverse=True)
-        return [s[0] for s in scored_strategies[:max_strategies]]
-    
-    # If we don't have enough type-specific strategies, get similar strategies from other types
-    if len(type_specific) < max_strategies:
-        # Calculate how many more strategies we need
-        needed = max_strategies - len(type_specific)
-        
-        # Get similar strategies from other problem types
-        type_specific_ids = {s.strategy_id for s in type_specific}
-        similar_strategies = []
-        
-        for s, score in db.get_similar_strategies(query, n=max_strategies*2):  # Get more than needed to filter
-            # Only include strategies from other problem types and not already selected
-            if s.strategy_id not in type_specific_ids and s.problem_type != problem_type:
-                similar_strategies.append(s)
-                if len(similar_strategies) >= needed:
-                    break
-        
-        # Combine type-specific strategies with similar strategies
-        combined = type_specific + similar_strategies[:needed]  # Only add as many as needed
+        selected = [s[0] for s in scored_strategies[:max_strategies]]
         
         # Log which strategies we're using
-        for i, strategy in enumerate(combined, 1):
-            problem_type_str = "(same type)" if strategy.problem_type == problem_type else f"(from {strategy.problem_type})"
-            logger.info(f"Selected strategy {i}/{max_strategies} for inference: {strategy.strategy_id} {problem_type_str} (success rate: {strategy.success_rate:.2f})")
-            
-        return combined
-    
-    # If we have exactly the right number, just return them
-    # Log which strategies we're using
-    for i, strategy in enumerate(type_specific, 1):
-        logger.info(f"Selected strategy {i}/{max_strategies} for inference: {strategy.strategy_id} (same type) (success rate: {strategy.success_rate:.2f})")
+        for i, strategy in enumerate(selected, 1):
+            logger.info(f"Selected strategy {i}/{max_strategies} for inference: {strategy.strategy_id} (success rate: {strategy.success_rate:.2f})")
         
-    return type_specific[:max_strategies]
+        return selected
+    
+    # If we have no more than the maximum, use all qualified strategies
+    for i, strategy in enumerate(qualified_strategies, 1):
+        logger.info(f"Selected strategy {i}/{len(qualified_strategies)} for inference: {strategy.strategy_id} (success rate: {strategy.success_rate:.2f})")
+    
+    return qualified_strategies
 
 def evaluate_strategy_effectiveness(response: str, thinking: Optional[str], selected_strategies: List[Strategy], client, model: str) -> Dict[str, bool]:
     """
diff --git a/optillm/plugins/spl/main.py b/optillm/plugins/spl/main.py
@@ -26,7 +26,8 @@
     MAINTENANCE_INTERVAL,
     STRATEGY_MERGING_THRESHOLD,
     MAX_STRATEGIES_PER_TYPE,
-    MAX_STRATEGIES_FOR_INFERENCE
+    MAX_STRATEGIES_FOR_INFERENCE,
+    MIN_SUCCESS_RATE_FOR_INFERENCE
 )
 
 # Setup logging
@@ -124,15 +125,22 @@ def run_spl(system_prompt: str, initial_query: str, client, model: str, request_
     for i, strategy in enumerate(selected_strategies, 1):
         logger.info(f"Selected strategy {i}/{MAX_STRATEGIES_FOR_INFERENCE} for inference: {strategy.strategy_id} (success rate: {strategy.success_rate:.2f})")
     
-    # 7. Handle strategies for the problem type
+    # 7. Handle situation when no strategies are selected
     if not selected_strategies:
-        logger.info(f"No existing strategies found for problem type: {problem_type}")
-        logger.info(f"Running without strategy augmentation - using base system prompt only")
+        if not existing_strategies:
+            # No strategies exist for this problem type
+            logger.info(f"No strategies exist for problem type '{problem_type}'. Enable learning mode with 'spl_learning=True' to create strategies.")
+        else:
+            # Strategies exist but don't meet the minimum success rate
+            logger.info(f"Strategies exist for problem type '{problem_type}' but none meet the minimum success rate threshold of {MIN_SUCCESS_RATE_FOR_INFERENCE:.2f}.")
+            logger.info(f"Enable learning mode with 'spl_learning=True' to improve strategies.")
         
-        # Just use the original system prompt with no augmentation
+        # Use the original system prompt without augmentation
+        logger.info("Running without strategy augmentation - using base system prompt only.")
         augmented_prompt = system_prompt
     else:
-        # 8. Augment the system prompt with the selected strategies
+        # Normal case - strategies were selected
+        # Augment the system prompt with the selected strategies
         augmented_prompt = augment_system_prompt(system_prompt, selected_strategies)
         logger.info(f"Augmented system prompt with {len(selected_strategies)} strategies (inference limit: {MAX_STRATEGIES_FOR_INFERENCE})")
     
@@ -148,19 +156,12 @@ def run_spl(system_prompt: str, initial_query: str, client, model: str, request_
             request_params['max_tokens'] = DEFAULT_MAX_TOKENS
         elif request_params['max_tokens'] < DEFAULT_MAX_TOKENS:
             request_params['max_tokens'] = DEFAULT_MAX_TOKENS
-        
-        # Log a suggestion if no strategies found in inference mode
-        if not learning_mode and not existing_strategies:
-            logger.info(f"No strategies exist for problem type '{problem_type}'. To learn strategies for this specific problem type, enable learning mode by setting 'spl_learning=True' in the request config.")
-        
-        # Use unmodified query - no need to add fallback message to the actual query
-        initial_query_with_suggestion = initial_query
             
         response = client.chat.completions.create(
             model=model,
             messages=[
                 {"role": "system", "content": augmented_prompt},
-                {"role": "user", "content": initial_query_with_suggestion}
+                {"role": "user", "content": initial_query}
             ],
             **request_params
         )
@@ -177,39 +178,40 @@ def run_spl(system_prompt: str, initial_query: str, client, model: str, request_
             logger.debug(f"Main response - final answer after removing thinking: '{final_response}'")
         
         # Only perform learning operations if in learning mode and we have strategies
-        if learning_mode and selected_strategies:
-            # 10. Evaluate the effectiveness of the strategies
-            strategy_effectiveness = evaluate_strategy_effectiveness(
-                final_response,
-                thinking,
-                selected_strategies,
-                client,
-                model
-            )
-            
-            # 11. Update strategy metrics based on effectiveness
-            for strategy_id, effective in strategy_effectiveness.items():
-                # Skip temporary fallback strategies
-                if strategy_id != "fallback_temporary":
-                    db.update_strategy_performance(strategy_id, effective)
-                    logger.info(f"Strategy {strategy_id} effectiveness: {effective}")
-                    
-                    # If the strategy was effective and thinking was used, add the thinking as a reasoning example
-                    if effective and thinking and strategy_id != "fallback_temporary":
-                        db.add_reasoning_example(strategy_id, thinking)
-                        logger.info(f"Added reasoning example to strategy {strategy_id}")
-            
-            # 12. Periodically refine strategies (after every 10 uses)
-            for strategy in selected_strategies:
-                # Skip temporary fallback strategies
-                if (strategy.strategy_id != "fallback_temporary" and 
-                    strategy.total_attempts % 10 == 0 and 
-                    strategy.total_attempts > 0):
-                    logger.info(f"Refining strategy {strategy.strategy_id} after {strategy.total_attempts} attempts")
-                    refined_strategy = refine_strategy(strategy, initial_query, final_response, thinking, client, model)
-                    db.refine_strategy(strategy.strategy_id, refined_strategy.strategy_text)
-        elif learning_mode:
-            logger.info("No strategies to evaluate")
+        if learning_mode:
+            if selected_strategies:
+                # 10. Evaluate the effectiveness of the strategies
+                strategy_effectiveness = evaluate_strategy_effectiveness(
+                    final_response,
+                    thinking,
+                    selected_strategies,
+                    client,
+                    model
+                )
+                
+                # 11. Update strategy metrics based on effectiveness
+                for strategy_id, effective in strategy_effectiveness.items():
+                    # Skip temporary fallback strategies
+                    if strategy_id != "fallback_temporary":
+                        db.update_strategy_performance(strategy_id, effective)
+                        logger.info(f"Strategy {strategy_id} effectiveness: {effective}")
+                        
+                        # If the strategy was effective and thinking was used, add the thinking as a reasoning example
+                        if effective and thinking and strategy_id != "fallback_temporary":
+                            db.add_reasoning_example(strategy_id, thinking)
+                            logger.info(f"Added reasoning example to strategy {strategy_id}")
+                
+                # 12. Periodically refine strategies (after every 10 uses)
+                for strategy in selected_strategies:
+                    # Skip temporary fallback strategies
+                    if (strategy.strategy_id != "fallback_temporary" and 
+                        strategy.total_attempts % 10 == 0 and 
+                        strategy.total_attempts > 0):
+                        logger.info(f"Refining strategy {strategy.strategy_id} after {strategy.total_attempts} attempts")
+                        refined_strategy = refine_strategy(strategy, initial_query, final_response, thinking, client, model)
+                        db.refine_strategy(strategy.strategy_id, refined_strategy.strategy_text)
+            else:
+                logger.info("No strategies to evaluate or refine - consider adding strategies for this problem type")
         else:
             logger.info("Strategy evaluation and refinement skipped (not in learning mode)")