Skip to content

Commit 4c82268

Browse files
committed
Improvements to the reward_hybrid_sign function
1 parent 9fbef34 commit 4c82268

1 file changed

Lines changed: 23 additions & 14 deletions

File tree

das/env/reward.py

Lines changed: 23 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -96,26 +96,35 @@ def reward_hybrid_binary(new_best_y, old_best_y, initial_range, is_final=False,
96 96

97 97
# Probably the best
98 98
def reward_hybrid_sign(new_best_y, old_best_y, initial_range, is_final=False, optimum=None):
99-
"""Hybrid B: dense progress signal + full-magnitude terminal reward.
100-
101-
Aggressive variant: higher base rewards, tighter threshold, stronger penalty.
102-
- Threshold 0.5% (step_ratio > scale * 5e-3) filters micro-steps
103-
- Reward range [0.1, 1.1]: large steps worth significantly more
104-
- Penalty -0.15 decayed: stronger motivation to keep searching
105-
"""
99+
"""Hybrid B: dense progress signal + full-magnitude terminal reward."""
106 100
if is_final:
107 101
return _terminal_reward(new_best_y, initial_range, optimum)
108 102

109-
scale = initial_range[1] - initial_range[0]
110-
step_ratio = _improvement_ratio(new_best_y, old_best_y, initial_range)
111-
if step_ratio > scale * 5e-3:
112-
return float(0.1 + 1.0 * np.clip(step_ratio, 0.0, 1.0))
103+
base, slope, penalty = 0.1, 1.0, 0.15
113 104

114 105
if optimum is not None:
115-
progress = max(_log_gap_orders(initial_range[0], new_best_y, optimum), 0.0)
106+
step_threshold = 0.05
107+
108+
def gain(y_from, y_to):
109+
return _log_gap_orders(y_from, y_to, optimum)
116 110
else:
117-
progress = max(_improvement_ratio(new_best_y, initial_range[0], initial_range), 0.0)
118-
return float(-0.15 / (1.0 + progress))
111+
step_threshold = 5e-3
112+
113+
def gain(y_from, y_to):
114+
return _improvement_ratio(y_to, y_from, initial_range)
115+
116+
step_gain = gain(old_best_y, new_best_y)
117+
if step_gain > step_threshold:
118+
return float(base + slope * np.clip(step_gain, 0.0, 1.0))
119+
120+
# Already at the precision target: a stalled step is the goal state, not
121+
# stagnation, so don't penalise it (otherwise solving early is discouraged).
122+
if optimum is not None and (new_best_y - optimum) <= _GAP_FLOOR:
123+
return 0.0
124+
125+
progress = max(gain(initial_range[0], new_best_y), 0.0)
126+
shortfall = 1.0 - np.clip(step_gain / step_threshold, 0.0, 1.0)
127+
return float(-penalty * shortfall**2 / (1.0 + progress))
119 128

120 129

121 130
REWARD_FNS = {

0 commit comments

Comments
 (0)