@@ -96,26 +96,35 @@ def reward_hybrid_binary(new_best_y, old_best_y, initial_range, is_final=False,
9696
9797# Probably the best
def reward_hybrid_sign(new_best_y, old_best_y, initial_range, is_final=False, optimum=None):
    """Hybrid B: dense progress signal + full-magnitude terminal reward.

    A terminal step returns ``_terminal_reward(new_best_y, initial_range,
    optimum)`` unchanged. Every other step is scored by its *gain* over the
    previous best, measured in one of two ways:

    - ``optimum`` given: gain is ``_log_gap_orders(old, new, optimum)`` and the
      step threshold is 0.05.
    - ``optimum`` is ``None``: gain is
      ``_improvement_ratio(new, old, initial_range)`` and the step threshold
      is 5e-3.

    Outcomes for a non-terminal step:

    - gain > threshold: reward ``0.1 + clip(gain, 0, 1)``, so at most 1.1.
    - ``optimum`` given and ``new_best_y - optimum <= _GAP_FLOOR``: 0.0.
    - otherwise: a penalty in ``[-0.15, 0]`` that shrinks quadratically as
      the gain approaches the threshold and is further damped by the
      cumulative progress measured from ``initial_range[0]``.

    Args:
        new_best_y: Best objective value after this step.
        old_best_y: Best objective value before this step.
        initial_range: Indexable pair forwarded to the helper functions;
            ``initial_range[0]`` is used as the reference point for
            cumulative progress.
        is_final: When true, only the terminal reward is computed.
        optimum: Known optimum value, or ``None`` if unknown.
            NOTE(review): the ``new_best_y - optimum`` gap test assumes
            minimisation -- confirm against the callers.

    Returns:
        The terminal reward as returned by ``_terminal_reward`` when
        ``is_final`` is true, otherwise a float step reward.
    """
    if is_final:
        return _terminal_reward(new_best_y, initial_range, optimum)

    base, slope, penalty = 0.1, 1.0, 0.15

    # Choose the gain metric and its matching threshold together so both
    # always refer to the same scale.
    if optimum is not None:
        step_threshold = 0.05

        def gain(y_from, y_to):
            return _log_gap_orders(y_from, y_to, optimum)
    else:
        step_threshold = 5e-3

        def gain(y_from, y_to):
            return _improvement_ratio(y_to, y_from, initial_range)

    step_gain = gain(old_best_y, new_best_y)
    if step_gain > step_threshold:
        return float(base + slope * np.clip(step_gain, 0.0, 1.0))

    # Already at the precision target: a stalled step is the goal state, not
    # stagnation, so don't penalise it (otherwise solving early is discouraged).
    if optimum is not None and (new_best_y - optimum) <= _GAP_FLOOR:
        return 0.0

    # Cumulative progress since the start damps the penalty; negative values
    # are floored at zero so the denominator never drops below 1.
    progress = max(gain(initial_range[0], new_best_y), 0.0)
    # shortfall is 1 for no gain (or a regression) and falls to 0 as the gain
    # reaches the threshold; squaring it softens the penalty for near-misses.
    shortfall = 1.0 - np.clip(step_gain / step_threshold, 0.0, 1.0)
    return float(-penalty * shortfall ** 2 / (1.0 + progress))
119128
120129
121130REWARD_FNS = {
0 commit comments