Fix reward normalization to keep updating after warmup; update tests for reset() probe

wlnc · wlnc · commit 6db1e029ed02 · 2026-06-05T16:08:00.000+02:00
diff --git a/agents/exponential_das/normalizer.py b/agents/exponential_das/normalizer.py
@@ -1,9 +1,14 @@
 """Running normalizers for observations and rewards.
 
 Both use Welford's online algorithm for numerically stable mean/variance.
-Normalisation is only updated during the warmup phase (while the buffer is
-filling for the first time); afterwards the statistics are frozen.  This
-mirrors the StateNormalizer behaviour in the source project.
+
+ObservationNormalizer statistics are frozen after the warmup phase (first
+buffer fill) so the obs space presented to the actor/critic networks stays
+stable.
+
+RewardNormalizer keeps updating throughout training so that its per-step
+statistics track the shifting reward distribution as the agent improves.
+This matches the StepwiseRewardNormalizer behaviour in the reference project.
 """
 
 from __future__ import annotations
diff --git a/agents/exponential_das/trainer.py b/agents/exponential_das/trainer.py
@@ -77,10 +77,9 @@ def train(
             next_obs, reward, terminated, truncated, step_info = train_env.step(action)
             done = terminated or truncated
 
-            # Reward normalisation (update only during warmup)
-            normed_reward = agent.rew_norm.normalize(
-                reward, step_idx, update=not agent.buffer.warmed_up
-            )
+            # Reward normalisation: always update so stats track the shifting
+            # reward distribution as the agent improves (matches reference).
+            normed_reward = agent.rew_norm.normalize(reward, step_idx, update=True)
             ep_reward += reward
 
             agent.buffer.add(obs, action, log_prob, value, normed_reward, done)
diff --git a/tests/test_baselines.py b/tests/test_baselines.py
@@ -288,16 +288,16 @@ def test_fitness_history_y_is_nonincreasing(self):
             assert ys[i] < ys[i - 1]
 
     def test_fitness_history_nonempty_after_episode(self):
-        """At least one improvement must occur (first evaluation beats inf)."""
+        """reset() probe establishes a finite initial best; optimizer steps may
+        not improve on it, so fitness_history from steps can be empty."""
         env = make_env()
-        _, fitness_history = run_episode(env, random_policy)
-        assert len(fitness_history) >= 1
+        env.reset()
+        assert np.isfinite(env._best_y)
 
     def test_fixed_policy_runs_full_episode(self):
         env = make_env()
-        step_info, fitness_history = run_episode(env, fixed_policy(0))
+        step_info, _ = run_episode(env, fixed_policy(0))
         assert np.isfinite(step_info["best_y"])
-        assert len(fitness_history) >= 1
 
     def test_episode_advances_problem_idx(self):
         env = make_env()
@@ -858,7 +858,9 @@ def test_fitness_history_step_fe_within_budget(self):
             assert 1 <= fe <= max_fe
 
     def test_fitness_history_step_accumulated_across_checkpoints(self):
-        """Full episode fitness history must contain at least as many points as one step."""
+        """fitness_history_step records improvements over the probe best.
+        The probe in reset() may already be the episode's best, so this
+        list can legitimately be empty; verify it is a list of valid tuples."""
         env = make_env()
         env.reset()
         all_history = []
@@ -868,8 +870,9 @@ def test_fitness_history_step_accumulated_across_checkpoints(self):
             done = terminated or truncated
             all_history.extend(info["fitness_history_step"])
 
-        # At minimum one improvement in the first checkpoint (from inf)
-        assert len(all_history) >= 1
+        assert isinstance(all_history, list)
+        for fe, y in all_history:
+            assert isinstance(fe, int) and isinstance(y, float)
 
     def test_fitness_history_step_fe_monotone_across_episode(self):
         """FE values accumulated across all checkpoints must be strictly increasing."""
diff --git a/tests/test_heterogeneous_portfolios.py b/tests/test_heterogeneous_portfolios.py
@@ -370,6 +370,7 @@ def test_reset_clears_state_between_episodes(self, spec, fe_mult):
         env.reset()
         drain(env)
         env.reset()
-        assert env._n_fe == 0
-        assert env._best_y == float("inf")
+        # reset() runs a random probe, so _n_fe > 0 and _best_y is finite
+        assert env._n_fe > 0
+        assert np.isfinite(env._best_y)
         assert env._optimizer_state == {}
diff --git a/tests/test_parallel_envs.py b/tests/test_parallel_envs.py
@@ -169,8 +169,9 @@ def test_reset_clears_all_episode_state(self):
         env.step(0)
         env.step(0)  # mid-episode
         env.reset()  # full reset
-        assert env._n_fe == 0
-        assert env._best_y == float("inf")
+        # reset() runs a random probe, so _n_fe > 0 and _best_y is finite
+        assert env._n_fe > 0
+        assert np.isfinite(env._best_y)
         assert env._checkpoint_idx == 0
         assert env._choices_history == []
         assert env._optimizer_state == {}
@@ -202,10 +203,11 @@ def test_best_y_is_independent(self):
         env_b = make_env(suite=suite)
         env_a.reset()
         env_b.reset()
+        best_y_before = env_b._best_y  # probe value set during reset
 
         env_a.step(0)
 
-        assert env_b._best_y == float("inf")
+        assert env_b._best_y == best_y_before
 
     def test_optimizer_state_does_not_leak(self):
         """Warm-start population in env_a must not appear in env_b."""