From 7b7efa922b47f3cb2b3d1e1b6c549ae6d7aa640e Mon Sep 17 00:00:00 2001 From: David <12414531+DavidBellamy@users.noreply.github.com> Date: Thu, 16 Apr 2026 07:13:14 -0700 Subject: [PATCH] fix(rollout): guard round(None) in zero-std metric aggregation _compute_zero_std_metrics crashes with TypeError when any zero-std group's leading sample has a None reward (typical for Status.ABORTED trials): File "miles/ray/rollout.py", line 1266, in _compute_zero_std_metrics interesting_rewards = [str(round(g[0].get_reward_value(args), 1)) ...] TypeError: type NoneType doesn't define __round__ method This crash fires on RolloutManager.generate() inside _log_rollout_data, after the rollout collection + dynamic sampling filter have already accepted the batch. With agentic tasks where some trials routinely abort (Daytona sandbox timeout, tool-invocation loops, etc.), the trainer never receives the batch and optimizer.step() never fires, so async RL training silently stalls. Fix: extract a _reward_label helper that buckets None-reward samples under a dedicated 'none' label instead of passing None to round(). This keeps the metric informative (zero_std/count_none shows the aborted-group count) and preserves the existing behavior for numeric rewards. Observed on LLM360/RL360 #76 FAST_ITER smoke runs (job 1559799) with GLM-4.7-Flash on agentic terminal-bench tasks. --- miles/ray/rollout.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/miles/ray/rollout.py b/miles/ray/rollout.py index 2a75d492b9..af11accfc3 100644 --- a/miles/ray/rollout.py +++ b/miles/ray/rollout.py @@ -1260,10 +1260,18 @@ def _is_zero_std(samples: list[Sample]): rewards = [sample.get_reward_value(args) for sample in samples] return len(rewards) == 0 or all(rewards[0] == r for r in rewards) + def _reward_label(sample: Sample) -> str: + # Aborted / None-reward samples have no numeric reward to round; bucket + # them under a dedicated label so downstream round() never sees None. + reward = sample.get_reward_value(args) + if reward is None: + return "none" + return str(round(reward, 1)) + all_sample_groups = group_by(all_samples, lambda s: s.group_index) interesting_sample_groups = [g for g in all_sample_groups.values() if _is_zero_std(g)] - interesting_rewards = [str(round(g[0].get_reward_value(args), 1)) for g in interesting_sample_groups] + interesting_rewards = [_reward_label(g[0]) for g in interesting_sample_groups] return {f"zero_std/count_{reward}": len(items) for reward, items in group_by(interesting_rewards).items()}