From 7b7efa922b47f3cb2b3d1e1b6c549ae6d7aa640e Mon Sep 17 00:00:00 2001
From: David <12414531+DavidBellamy@users.noreply.github.com>
Date: Thu, 16 Apr 2026 07:13:14 -0700
Subject: [PATCH] fix(rollout): guard round(None) in zero-std metric
 aggregation

_compute_zero_std_metrics crashes with TypeError when any zero-std group's
leading sample has a None reward (typical for Status.ABORTED trials):

  File "miles/ray/rollout.py", line 1266, in _compute_zero_std_metrics
    interesting_rewards = [str(round(g[0].get_reward_value(args), 1)) ...]
  TypeError: type NoneType doesn't define __round__ method

The crash is raised from _log_rollout_data during
RolloutManager.generate(), after rollout collection and the dynamic
sampling filter have already accepted the batch. For agentic tasks where
some trials routinely abort (Daytona sandbox timeout, tool-invocation
loops, etc.), the trainer never receives the batch and optimizer.step()
never runs, so async RL training silently stalls.

Fix: extract a _reward_label helper that buckets None-reward samples under
a dedicated 'none' label instead of passing None to round(). This keeps
the metric informative (zero_std/count_none counts the zero-std groups
whose rewards are all None, which are typically the aborted ones) and
preserves the existing behavior for numeric rewards.

Observed on LLM360/RL360 #76 FAST_ITER smoke runs (job 1559799) with
GLM-4.7-Flash on agentic terminal-bench tasks.
---
 miles/ray/rollout.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/miles/ray/rollout.py b/miles/ray/rollout.py
index 2a75d492b9..af11accfc3 100644
--- a/miles/ray/rollout.py
+++ b/miles/ray/rollout.py
@@ -1260,10 +1260,18 @@ def _is_zero_std(samples: list[Sample]):
         rewards = [sample.get_reward_value(args) for sample in samples]
         return len(rewards) == 0 or all(rewards[0] == r for r in rewards)
 
+    def _reward_label(sample: Sample) -> str:
+        # Aborted / None-reward samples have no numeric reward to round; bucket
+        # them under a dedicated label so downstream round() never sees None.
+        reward = sample.get_reward_value(args)
+        if reward is None:
+            return "none"
+        return str(round(reward, 1))
+
     all_sample_groups = group_by(all_samples, lambda s: s.group_index)
     interesting_sample_groups = [g for g in all_sample_groups.values() if _is_zero_std(g)]
 
-    interesting_rewards = [str(round(g[0].get_reward_value(args), 1)) for g in interesting_sample_groups]
+    interesting_rewards = [_reward_label(g[0]) for g in interesting_sample_groups]
 
     return {f"zero_std/count_{reward}": len(items) for reward, items in group_by(interesting_rewards).items()}