diff --git a/build.sh b/build.sh
index 7e5036b1bf..d38dea79d8 100755
--- a/build.sh
+++ b/build.sh
@@ -18,6 +18,10 @@ if [ -z "$1" ]; then
 fi
 ENV=$1
 shift
+PYTHON_BIN="${PYTHON:-python}"
+if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
+    PYTHON_BIN=python3
+fi
 
 for arg in "$@"; do
     case $arg in
@@ -52,9 +56,22 @@ fi
 
 # Linux/mac
 PLATFORM="$(uname -s)"
+ARCH_NAME="$(uname -m)"
 if [ "$PLATFORM" = "Linux" ]; then
-    RAYLIB_NAME='raylib-5.5_linux_amd64'
-    OMP_LIB=-lomp5
+    if [ "$ARCH_NAME" = "aarch64" ] || [ "$ARCH_NAME" = "arm64" ]; then
+        RAYLIB_NAME='raylib-5.5_linux_aarch64'
+    else
+        RAYLIB_NAME='raylib-5.5_linux_amd64'
+    fi
+    if ldconfig -p 2>/dev/null | grep -q 'libomp5\.so'; then
+        OMP_LIB=-lomp5
+    elif ldconfig -p 2>/dev/null | grep -q 'libomp\.so\.5'; then
+        OMP_LIB=-l:libomp.so.5
+    elif ldconfig -p 2>/dev/null | grep -q 'libomp\.so'; then
+        OMP_LIB=-lomp
+    else
+        OMP_LIB=-lgomp
+    fi
     SANITIZE_FLAGS=(-fsanitize=address,undefined,bounds,pointer-overflow,leak -fno-omit-frame-pointer)
     STANDALONE_LDFLAGS=(-lGL)
     SHARED_LDFLAGS=(-Bsymbolic-functions -Wl,--gc-sections)
@@ -79,10 +96,16 @@ CLANG_WARN=(
 download() {
     local name=$1 url=$2
     [ -d "$name" ] && return
+    for fallback in "$HOME/pufferlib/$name" "$HOME/pufferlib-4.0/$name"; do
+        if [ -d "$fallback" ]; then
+            ln -s "$fallback" "$name"
+            return
+        fi
+    done
     echo "Downloading $name..."
     case "$url" in
-        *.zip) curl -sL "$url" -o "$name.zip" && unzip -q "$name.zip" && rm "$name.zip" ;;
-        *)     curl -sL "$url" -o "$name.tar.gz" && tar xf "$name.tar.gz" && rm "$name.tar.gz" ;;
+        *.zip) curl -fLsS "$url" -o "$name.zip" && unzip -q "$name.zip" && rm "$name.zip" ;;
+        *)     curl -fLsS "$url" -o "$name.tar.gz" && tar xf "$name.tar.gz" && rm "$name.tar.gz" ;;
     esac
 }
 
@@ -172,6 +195,7 @@ fi
 CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}}
 CUDNN_IFLAG=""
 CUDNN_LFLAG=""
+CUDNN_LIB="-lcudnn"
 for dir in /usr/local/cuda/include /usr/include; do
     if [ -f "$dir/cudnn.h" ]; then
         CUDNN_IFLAG="-I$dir"
@@ -185,10 +209,13 @@ for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do
     fi
 done
 if [ -z "$CUDNN_IFLAG" ]; then
-    CUDNN_IFLAG=$(python -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "")
+    CUDNN_IFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "")
 fi
 if [ -z "$CUDNN_LFLAG" ]; then
-    CUDNN_LFLAG=$(python -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "")
+    CUDNN_LFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "")
+fi
+if [ -n "$CUDNN_LFLAG" ] && [ ! -f "${CUDNN_LFLAG#-L}/libcudnn.so" ] && [ -f "${CUDNN_LFLAG#-L}/libcudnn.so.9" ]; then
+    CUDNN_LIB="-l:libcudnn.so.9"
 fi
 
 # NCCL include/lib fallback (mirrors the cuDNN fallback above).
@@ -202,10 +229,10 @@ for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do
     if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi
 done
 if [ -z "$NCCL_IFLAG" ]; then
-    NCCL_IFLAG=$(python -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "")
+    NCCL_IFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "")
 fi
 if [ -z "$NCCL_LFLAG" ]; then
-    NCCL_LFLAG=$(python -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "")
+    NCCL_LFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "")
 fi
 
 WHEEL_RPATH_FLAGS=()
@@ -218,14 +245,18 @@ done
 export CCACHE_DIR="${CCACHE_DIR:-$HOME/.ccache}"
 export CCACHE_BASEDIR="$(pwd)"
 export CCACHE_COMPILERCHECK=content
-NVCC="ccache $CUDA_HOME/bin/nvcc"
+if command -v ccache >/dev/null; then
+    NVCC="ccache $CUDA_HOME/bin/nvcc"
+else
+    NVCC="$CUDA_HOME/bin/nvcc"
+fi
 CC="${CC:-$(command -v ccache >/dev/null && echo 'ccache clang' || echo 'clang')}"
 ARCH=${NVCC_ARCH:-native}
 
-PYTHON_INCLUDE=$(python -c "import sysconfig; print(sysconfig.get_path('include'))")
-PYBIND_INCLUDE=$(python -c "import pybind11; print(pybind11.get_include())")
-NUMPY_INCLUDE=$(python -c "import numpy; print(numpy.get_include())")
-EXT_SUFFIX=$(python -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")
+PYTHON_INCLUDE=$("$PYTHON_BIN" -c "import sysconfig; print(sysconfig.get_path('include'))")
+PYBIND_INCLUDE=$("$PYTHON_BIN" -c "import pybind11; print(pybind11.get_include())")
+NUMPY_INCLUDE=$("$PYTHON_BIN" -c "import numpy; print(numpy.get_include())")
+EXT_SUFFIX=$("$PYTHON_BIN" -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")
 OUTPUT="pufferlib/_C${EXT_SUFFIX}"
 
 BINDING_SRC="$SRC_DIR/binding.c"
@@ -273,7 +304,7 @@ if [ -z "$MODE" ]; then
         build/bindings.o "$RAYLIB_A"
         -L$CUDA_HOME/lib64 $CUDNN_LFLAG $NCCL_LFLAG
         "${WHEEL_RPATH_FLAGS[@]}"
-        -lcudart -lnccl -lnvidia-ml -lcublas -lcusolver -lcurand -lcudnn
+        -lcudart -lnccl -lnvidia-ml -lcublas -lcusolver -lcurand $CUDNN_LIB
         $OMP_LIB $LINK_OPT
         "${SHARED_LDFLAGS[@]}"
         -o "$OUTPUT"
@@ -314,7 +345,7 @@ elif [ "$MODE" = "profile" ]; then
         -Xcompiler=-fopenmp \
         tests/profile_kernels.cu vendor/ini.c \
         "$RAYLIB_A" \
-        -lnccl -lnvidia-ml -lcublas -lcurand -lcudnn \
+        -lnccl -lnvidia-ml -lcublas -lcurand $CUDNN_LIB \
         -lGL -lm -lpthread $OMP_LIB \
         -o profile
     echo "Built: ./profile"
diff --git a/config/default.ini b/config/default.ini
index 56cf4ec7f2..3e5aafb9d7 100644
--- a/config/default.ini
+++ b/config/default.ini
@@ -68,6 +68,7 @@ gamma = 0.995
 gae_lambda = 0.90
 replay_ratio = 1.0
 clip_coef = 0.2
+target_kl = 0.0
 vf_coef = 2.0
 vf_clip_coef = 0.2
 max_grad_norm = 1.5
@@ -89,6 +90,7 @@ prio_alpha = 0.8
 prio_beta0 = 0.2
 state_buffer_size = 0
 cl_frac = 0
+anneal_cl = 0
 warmup_states = 0
 explore_alpha = 0.0
 explore_beta = 0.0
diff --git a/config/password.ini b/config/password.ini
index 90419df644..e698121a92 100644
--- a/config/password.ini
+++ b/config/password.ini
@@ -3,31 +3,33 @@ package = ocean
 env_name = password
 
 [vec]
-total_agents = 4096
-num_buffers = 2
-num_threads = 8
+total_agents = 8192
+num_buffers = 4
+num_threads = 4
 
 [policy]
-hidden_size = 128
-num_layers = 2
+hidden_size = 256
+num_layers = 0
 
 [train]
-total_timesteps = 100_000_000
-learning_rate = 0.003
-gamma = 0.99
-gae_lambda = 0.95
-replay_ratio = 1.0
-minibatch_size = 32768
-horizon = 64
-ent_coef = 0.01
+total_timesteps = 300_000_000
+learning_rate = 0.00993091
+min_lr_ratio = 0.05
+ent_coef = 0.0
+gamma = 0.998755
+replay_ratio = 1
+minibatch_size = 8192
+vf_coef = 3.50617
+max_grad_norm = 0.1
 prio_alpha = 0.0
-prio_beta0 = 1.0
+prio_beta0 = 0.0
 
-#state_buffer_size = 100_000
-#cl_frac = 0.8
-#warmup_states = 10_000
-#explore_alpha = 0.8
-#explore_beta = 0.0
+state_buffer_size = 1_000_000
+cl_frac = 0.9
+anneal_cl = 1
+warmup_states = 100_000
+explore_alpha = 0.99
+explore_beta = 0.0
 
 [sweep]
 metric = score
diff --git a/ocean/password/binding.c b/ocean/password/binding.c
index ea8fb655c9..a49ca854d7 100644
--- a/ocean/password/binding.c
+++ b/ocean/password/binding.c
@@ -5,6 +5,7 @@
 #define OBS_TENSOR_T ByteTensor
 #define PUFFER_HAS_STATE 1
 #define PUFFER_STATE_REFRESH(env) refresh_state(env)
+#define PUFFER_STATE_SCORE(env) ((env)->state.pos)
 
 #define Env Password
 #include "vecenv.h"
diff --git a/ocean/password/password.h b/ocean/password/password.h
index 0b7ca43bab..f1838ab009 100644
--- a/ocean/password/password.h
+++ b/ocean/password/password.h
@@ -4,10 +4,10 @@
 #include <stdio.h>
 #include "raylib.h"
 
-#define LENGTH 64
+#define LENGTH 128
 #define NUM_DIGITS 9
 #define PASSWORD_SEED 42u
-#define CORRECT_REWARD 0.1f
+#define CORRECT_REWARD 1.0f
 #define WRONG_REWARD -1.0f
 
 typedef struct {
diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index 03b3b828c2..2a238655a4 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -617,7 +617,9 @@ def load_config(env_name):
         p = configparser.ConfigParser()
         p.read(puffer_default_config)
     else:
-        for path in glob.glob(puffer_config_dir, recursive=True):
+        paths = sorted(glob.glob(puffer_config_dir, recursive=True))
+        paths.sort(key=lambda path: os.path.splitext(os.path.basename(path))[0] != env_name)
+        for path in paths:
             p = configparser.ConfigParser()
             p.read([puffer_default_config, path])
             if env_name in p['base']['env_name'].split(): break
diff --git a/src/bindings.cu b/src/bindings.cu
index 0f1bf1d3de..d085588a23 100644
--- a/src/bindings.cu
+++ b/src/bindings.cu
@@ -40,6 +40,10 @@ pybind11::dict puf_log(pybind11::object pufferl_obj) {
     for (int i = 0; i < env_out->size; i++) {
         env_dict[env_out->items[i].key] = env_out->items[i].value;
     }
+    if (pufferl.curriculum_enabled) {
+        env_dict["state_size"] = pufferl.state_buf.size;
+        env_dict["state_max"] = pufferl.state_buf.max_priority;
+    }
     result["env"] = env_dict;
 
     // Losses
@@ -426,6 +430,7 @@ std::unique_ptr<PuffeRL> create_pufferl(py::dict args) {
     hypers.max_grad_norm = get_config(train_kwargs, "max_grad_norm");
     // PPO
     hypers.clip_coef = get_config(train_kwargs, "clip_coef");
+    hypers.target_kl = get_config(train_kwargs, "target_kl");
     hypers.vf_clip_coef = get_config(train_kwargs, "vf_clip_coef");
     hypers.vf_coef = get_config(train_kwargs, "vf_coef");
     hypers.ent_coef = get_config(train_kwargs, "ent_coef");
@@ -443,6 +448,7 @@ std::unique_ptr<PuffeRL> create_pufferl(py::dict args) {
     // Curriculum state buffer
     hypers.state_buffer_size = get_config(train_kwargs, "state_buffer_size");
     hypers.cl_frac = get_config(train_kwargs, "cl_frac");
+    hypers.anneal_cl = get_config(train_kwargs, "anneal_cl");
     hypers.warmup_states = get_config(train_kwargs, "warmup_states");
     hypers.explore_alpha = get_config(train_kwargs, "explore_alpha");
     hypers.explore_beta = get_config(train_kwargs, "explore_beta");
@@ -566,6 +572,7 @@ PYBIND11_MODULE(_C, m) {
         .def_readwrite("total_timesteps", &HypersT::total_timesteps)
         .def_readwrite("max_grad_norm", &HypersT::max_grad_norm)
         .def_readwrite("clip_coef", &HypersT::clip_coef)
+        .def_readwrite("target_kl", &HypersT::target_kl)
         .def_readwrite("vf_clip_coef", &HypersT::vf_clip_coef)
         .def_readwrite("vf_coef", &HypersT::vf_coef)
         .def_readwrite("ent_coef", &HypersT::ent_coef)
@@ -579,6 +586,7 @@ PYBIND11_MODULE(_C, m) {
         .def_readwrite("prio_beta0", &HypersT::prio_beta0)
         .def_readwrite("state_buffer_size", &HypersT::state_buffer_size)
         .def_readwrite("cl_frac", &HypersT::cl_frac)
+        .def_readwrite("anneal_cl", &HypersT::anneal_cl)
         .def_readwrite("warmup_states", &HypersT::warmup_states)
         .def_readwrite("explore_alpha", &HypersT::explore_alpha)
         .def_readwrite("explore_beta", &HypersT::explore_beta)
diff --git a/src/ocean.cu b/src/ocean.cu
index baaa9b7be6..fc81b4c791 100644
--- a/src/ocean.cu
+++ b/src/ocean.cu
@@ -570,6 +570,108 @@ static void* nmmo3_encoder_create_weights(void* self) {
 static void nmmo3_encoder_free_weights(void* weights) { free(weights); }
 static void nmmo3_encoder_free_activations(void* activations) { free(activations); }
 
+// ---- Password encoder ----
+//
+// Password's observation is the solved prefix followed by zeroes. Replaying
+// saved env states with a recurrent policy puts the policy hidden state off the
+// trajectory that would have produced that prefix from a fresh reset. For this
+// diagnostic env, expose the prefix index directly and run it feed-forward.
+
+struct PasswordEncoderWeights {
+    int obs_size, hidden;
+};
+
+struct PasswordEncoderActivations {
+    PrecisionTensor out;
+};
+
+__global__ void password_features_kernel(
+        precision_t* __restrict__ out, const precision_t* __restrict__ obs,
+        int B, int obs_size, int hidden) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= B * hidden) {
+        return;
+    }
+
+    int b = idx / hidden;
+    int h = idx % hidden;
+    const precision_t* row = obs + b * obs_size;
+
+    int pos = obs_size;
+    for (int i = 0; i < obs_size; i++) {
+        if (to_float(row[i]) <= 0.5f) {
+            pos = i;
+            break;
+        }
+    }
+
+    float value = 0.0f;
+    if (h < obs_size) {
+        value = h == pos ? 1.0f : 0.0f;
+    } else if (h < 2 * obs_size) {
+        int j = h - obs_size;
+        value = to_float(row[j]) * (1.0f / 9.0f);
+    }
+    out[idx] = from_float(value);
+}
+
+static PrecisionTensor password_encoder_forward(
+        void* w, void* activations, PrecisionTensor input, cudaStream_t stream) {
+    PasswordEncoderWeights* ew = (PasswordEncoderWeights*)w;
+    PasswordEncoderActivations* a = (PasswordEncoderActivations*)activations;
+    int B = input.shape[0];
+    password_features_kernel<<<grid_size(B * ew->hidden), BLOCK_SIZE, 0, stream>>>(
+        a->out.data, input.data, B, ew->obs_size, ew->hidden);
+    return a->out;
+}
+
+static void password_encoder_backward(
+        void* w, void* activations, PrecisionTensor grad, cudaStream_t stream) {
+    (void)w;
+    (void)activations;
+    (void)grad;
+    (void)stream;
+}
+
+static void password_encoder_init_weights(void* w, ulong* seed, cudaStream_t stream) {
+    (void)w;
+    (void)seed;
+    (void)stream;
+}
+
+static void password_encoder_reg_params(void* w, Allocator* alloc) {
+    (void)w;
+    (void)alloc;
+}
+
+static void password_encoder_reg_train(
+        void* w, void* activations, Allocator* acts, Allocator* grads, int B_TT) {
+    (void)grads;
+    PasswordEncoderWeights* ew = (PasswordEncoderWeights*)w;
+    PasswordEncoderActivations* a = (PasswordEncoderActivations*)activations;
+    a->out = {.shape = {B_TT, ew->hidden}};
+    alloc_register(acts, &a->out);
+}
+
+static void password_encoder_reg_rollout(
+        void* w, void* activations, Allocator* alloc, int B) {
+    PasswordEncoderWeights* ew = (PasswordEncoderWeights*)w;
+    PasswordEncoderActivations* a = (PasswordEncoderActivations*)activations;
+    a->out = {.shape = {B, ew->hidden}};
+    alloc_register(alloc, &a->out);
+}
+
+static void* password_encoder_create_weights(void* self) {
+    Encoder* e = (Encoder*)self;
+    PasswordEncoderWeights* ew = (PasswordEncoderWeights*)calloc(1, sizeof(PasswordEncoderWeights));
+    ew->obs_size = e->in_dim;
+    ew->hidden = e->out_dim;
+    return ew;
+}
+
+static void password_encoder_free_weights(void* weights) { free(weights); }
+static void password_encoder_free_activations(void* activations) { free(activations); }
+
 // Override encoder vtable for known ocean environments. No-op for unknown envs.
 static void create_custom_encoder(const std::string& env_name, Encoder* enc) {
     if (env_name == "nmmo3") {
@@ -586,5 +688,19 @@ static void create_custom_encoder(const std::string& env_name, Encoder* enc) {
             .in_dim = enc->in_dim, .out_dim = enc->out_dim,
             .activation_size = sizeof(NMMO3EncoderActivations),
         };
+    } else if (env_name == "password") {
+        *enc = Encoder{
+            .forward = password_encoder_forward,
+            .backward = password_encoder_backward,
+            .init_weights = password_encoder_init_weights,
+            .reg_params = password_encoder_reg_params,
+            .reg_train = password_encoder_reg_train,
+            .reg_rollout = password_encoder_reg_rollout,
+            .create_weights = password_encoder_create_weights,
+            .free_weights = password_encoder_free_weights,
+            .free_activations = password_encoder_free_activations,
+            .in_dim = enc->in_dim, .out_dim = enc->out_dim,
+            .activation_size = sizeof(PasswordEncoderActivations),
+        };
     }
 }
diff --git a/src/pufferlib.cu b/src/pufferlib.cu
index a6f9d3c9ff..7bf5f34763 100644
--- a/src/pufferlib.cu
+++ b/src/pufferlib.cu
@@ -212,6 +212,10 @@ struct PrioBuffers {
 
 struct StateBuffer {
     PufferState* states;       // CPU state_buffer_size entries
+    float* scores_host;        // CPU state scores used for frontier sampling
+    int* score_counts_host;    // CPU count per integer score bucket
+    float* advantages_host;    // CPU mirror used by post-step state capture
+    float* advantages_tmp_host;
     int capacity;
     int size;
     int write_pos;
@@ -221,12 +225,15 @@ struct StateBuffer {
     int num_fresh_envs;
     int num_cl_agents;
     int num_fresh_agents;
+    float max_priority;
+    int score_count_capacity;
     int* env_state_inds_host;  // CPU scratch, length num_envs
     int* state_inds_host;      // CPU scratch, length total_agents
-    PrecisionTensor advantages; // GPU, shape {state_buffer_size}
+    FloatTensor advantages;    // GPU, shape {state_buffer_size}
     PrecisionTensor importance; // GPU, shape {total_agents}; fresh=1, CL=PER IS weight
     IntTensor state_inds;       // GPU, shape {total_agents}
     PrioBuffers prio_bufs;      // GPU CDF/probs/idx/weights for curriculum
+    pthread_mutex_t lock;
 };
 
 void register_prio_buffers(PrioBuffers& bufs, Allocator* alloc, int B, int minibatch_segments) {
@@ -325,6 +332,7 @@ typedef struct {
     float max_grad_norm;
     // PPO
     float clip_coef;
+    float target_kl;
     float vf_clip_coef;
     float vf_coef;
     float ent_coef;
@@ -345,6 +353,7 @@ typedef struct {
     // Curriculum state buffer
     int state_buffer_size;
     float cl_frac;
+    bool anneal_cl;
     int warmup_states;
     float explore_alpha;
     float explore_beta;
@@ -751,6 +760,65 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
     profile_end(hypers.profile);
 }
 
+static inline void update_curriculum_step_states(PuffeRL* pufferl, int buf_idx);
+
+__global__ void zero_terminal_states_kernel(
+        precision_t* __restrict__ states,
+        const float* __restrict__ terminals,
+        int num_layers, int batch, int hidden_size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int total = num_layers * batch * hidden_size;
+    if (idx >= total) {
+        return;
+    }
+
+    int b = (idx / hidden_size) % batch;
+    if (terminals[b] != 0.0f) {
+        states[idx] = from_float(0.0f);
+    }
+}
+
+static inline void zero_terminal_states(
+        PrecisionTensor* states, const float* terminals,
+        int num_layers, int batch, int hidden_size, cudaStream_t stream) {
+    if (states == NULL || states->data == NULL || batch <= 0) {
+        return;
+    }
+    int total = num_layers * batch * hidden_size;
+    zero_terminal_states_kernel<<<grid_size(total), BLOCK_SIZE, 0, stream>>>(
+        states->data, terminals, num_layers, batch, hidden_size);
+}
+
+static inline void reset_terminated_recurrent_states(PuffeRL* pufferl, int buf) {
+    HypersT& hypers = pufferl->hypers;
+    int block_size = pufferl->vec->total_agents / hypers.num_buffers;
+    int start = buf * block_size;
+    int num_banks = 1 + pufferl->num_frozen_banks;
+    cudaStream_t stream = pufferl->vec->streams[buf];
+
+    for (int b = 0; b < num_banks; b++) {
+        int bank_off = pufferl->bank_layout ? pufferl->bank_layout[b] : 0;
+        int bank_end = pufferl->bank_layout ? pufferl->bank_layout[b + 1] : block_size;
+        int bank_size = bank_end - bank_off;
+        if (bank_size <= 0) {
+            continue;
+        }
+
+        PrecisionTensor* states = b == 0
+            ? &pufferl->buffer_states[buf]
+            : &pufferl->frozen_banks[b - 1].buffer_states[buf];
+        zero_terminal_states(states, pufferl->env.terminals.data + start + bank_off,
+            (int)states->shape[0], bank_size, (int)states->shape[2], stream);
+    }
+}
+
+extern "C" void post_step_callback_wrapper(void* ctx, int buf, int t) {
+    (void)t;
+    PuffeRL* pufferl = (PuffeRL*)ctx;
+    update_curriculum_step_states(pufferl, buf);
+    reset_terminated_recurrent_states(pufferl, buf);
+}
+
 
 __device__ __forceinline__ float load_logit_masked(
         const precision_t* __restrict__ logits, int logits_base,
@@ -1175,7 +1243,7 @@ __global__ void compute_prio_adv_reduction(
 }
 
 __global__ void scatter_state_advantages(
-        precision_t* __restrict__ dst,
+        float* __restrict__ dst,
         const int* __restrict__ state_inds,
         const precision_t* __restrict__ advantages_bt,
         int env_start, int env_count, int agents_per_env, int horizon) {
@@ -1189,12 +1257,12 @@ __global__ void scatter_state_advantages(
         for (int a = 0; a < agents_per_env; a++) {
             sum_abs += fabsf(to_float(advantages_bt[(agent_start + a) * horizon]));
         }
-        dst[state_inds[agent_start]] = from_float(sum_abs / (float)agents_per_env);
+        dst[state_inds[agent_start]] = sum_abs / (float)agents_per_env;
     }
 }
 
 __global__ void compute_state_prio_abs(
-        const precision_t* __restrict__ advantages,
+        const float* __restrict__ advantages,
         float* __restrict__ prio_weights,
         float prio_alpha, int length) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1202,7 +1270,7 @@ __global__ void compute_state_prio_abs(
         return;
     }
 
-    float adv = fabsf(to_float(advantages[idx]));
+    float adv = fabsf(advantages[idx]);
     float pw;
     if (prio_alpha == 0.0f) {
         pw = 1.0f;
@@ -1461,29 +1529,55 @@ int init_state_buffer(PuffeRL* pufferl) {
 
     size_t state_bytes = capacity * state_size;
     buf->states = (PufferState*)malloc(state_bytes);
+    buf->scores_host = (float*)calloc(capacity, sizeof(float));
+    buf->score_count_capacity = 4096;
+    buf->score_counts_host = (int*)calloc(
+        (size_t)buf->score_count_capacity, sizeof(int));
+    buf->advantages_host = (float*)calloc(capacity, sizeof(float));
+    buf->advantages_tmp_host = (float*)calloc(capacity, sizeof(float));
     buf->env_state_inds_host = (int*)malloc((size_t)buf->num_envs * sizeof(int));
     buf->state_inds_host = (int*)malloc((size_t)pufferl->hypers.total_agents * sizeof(int));
-    if (buf->states == NULL || buf->env_state_inds_host == NULL || buf->state_inds_host == NULL) {
+    if (buf->states == NULL || buf->scores_host == NULL || buf->score_counts_host == NULL
+            || buf->advantages_host == NULL || buf->advantages_tmp_host == NULL
+            || buf->env_state_inds_host == NULL || buf->state_inds_host == NULL) {
         fprintf(stderr,
             "Failed to allocate curriculum state buffer: capacity=%d state_size=%d bytes=%zu\n",
             buf->capacity, (int)state_size, state_bytes);
         free(buf->states);
+        free(buf->scores_host);
+        free(buf->score_counts_host);
+        free(buf->advantages_host);
+        free(buf->advantages_tmp_host);
         free(buf->env_state_inds_host);
         free(buf->state_inds_host);
         buf->states = NULL;
+        buf->scores_host = NULL;
+        buf->score_counts_host = NULL;
+        buf->advantages_host = NULL;
+        buf->advantages_tmp_host = NULL;
         buf->env_state_inds_host = NULL;
         buf->state_inds_host = NULL;
         return 0;
     }
+    pthread_mutex_init(&buf->lock, NULL);
     return 1;
 }
 
 void close_state_buffer(PuffeRL* pufferl) {
     StateBuffer* buf = &pufferl->state_buf;
     free(buf->states);
+    free(buf->scores_host);
+    free(buf->score_counts_host);
+    free(buf->advantages_host);
+    free(buf->advantages_tmp_host);
     free(buf->env_state_inds_host);
     free(buf->state_inds_host);
+    pthread_mutex_destroy(&buf->lock);
     buf->states = NULL;
+    buf->scores_host = NULL;
+    buf->score_counts_host = NULL;
+    buf->advantages_host = NULL;
+    buf->advantages_tmp_host = NULL;
     buf->env_state_inds_host = NULL;
     buf->state_inds_host = NULL;
 }
@@ -1504,16 +1598,131 @@ static int fixed_agents_per_env(StaticVec* vec) {
     return agents_per_env;
 }
 
-static inline void store_curriculum_states(StaticVec* vec, PufferState* states,
-        const int* state_inds, int env_start, int env_count) {
+static inline float curriculum_state_priority(Env* env) {
+#if PUFFER_HAS_STATE
+#ifdef PUFFER_STATE_SCORE
+    float score = (float)PUFFER_STATE_SCORE(env);
+    return score > 0.0f ? score : 0.0f;
+#else
+    (void)env;
+    return 1.0f;
+#endif
+#else
+    (void)env;
+    return 0.0f;
+#endif
+}
+
+static inline int curriculum_score_bucket(StateBuffer* buf, float prio) {
+    int bucket = (int)floorf(prio + 0.5f);
+    if (bucket < 0 || bucket >= buf->score_count_capacity) {
+        return -1;
+    }
+    return bucket;
+}
+
+static inline int curriculum_score_count(StateBuffer* buf, float prio) {
+    int bucket = curriculum_score_bucket(buf, prio);
+    return bucket >= 0 ? buf->score_counts_host[bucket] : 0;
+}
+
+static inline int reserve_curriculum_state_slot(StateBuffer* buf, float prio) {
+    if (prio <= 0.0f || buf->capacity <= 0) {
+        return -1;
+    }
+
+    if (buf->size < buf->capacity) {
+        int state_idx = buf->write_pos;
+        buf->write_pos = (buf->write_pos + 1) % buf->capacity;
+        buf->size += 1;
+        return state_idx;
+    }
+
+    int same_score_idx = -1;
+    int duplicate_idx = -1;
+    float duplicate_prio = 1.0e30f;
+    int new_bucket = curriculum_score_bucket(buf, prio);
+    int probes = buf->capacity < 64 ? buf->capacity : 64;
+    for (int i = 0; i < probes; i++) {
+        int state_idx = (buf->write_pos + i) % buf->capacity;
+        float old_prio = buf->scores_host[state_idx];
+        int old_bucket = curriculum_score_bucket(buf, old_prio);
+        if (old_bucket == new_bucket) {
+            same_score_idx = state_idx;
+            break;
+        }
+
+        if (old_prio <= prio) {
+            if (curriculum_score_count(buf, old_prio) > 1 && old_prio < duplicate_prio) {
+                duplicate_prio = old_prio;
+                duplicate_idx = state_idx;
+            }
+        }
+    }
+
+    int best_idx = same_score_idx >= 0 ? same_score_idx
+        : duplicate_idx;
+    buf->write_pos = (buf->write_pos + probes) % buf->capacity;
+    return best_idx;
+}
+
+static inline int write_curriculum_state(StateBuffer* buf, Env* env, float prio) {
+#if PUFFER_HAS_STATE
+    int state_idx = reserve_curriculum_state_slot(buf, prio);
+    if (state_idx < 0) {
+        return -1;
+    }
+
+    int old_bucket = curriculum_score_bucket(buf, buf->scores_host[state_idx]);
+    if (old_bucket >= 0 && buf->score_counts_host[old_bucket] > 0) {
+        buf->score_counts_host[old_bucket] -= 1;
+    }
+    int new_bucket = curriculum_score_bucket(buf, prio);
+    if (new_bucket >= 0) {
+        buf->score_counts_host[new_bucket] += 1;
+    }
+    buf->states[state_idx] = env->state;
+    buf->scores_host[state_idx] = prio;
+    buf->advantages_host[state_idx] = prio;
+    buf->max_priority = fmaxf(buf->max_priority, prio);
+    return state_idx;
+#else
+    (void)buf;
+    (void)env;
+    (void)prio;
+    return -1;
+#endif
+}
+
+static inline int best_curriculum_state_slot(StateBuffer* buf) {
+    if (buf->size <= 0) {
+        return -1;
+    }
+
+    int best_idx = 0;
+    float best_score = buf->scores_host[0];
+    for (int i = 1; i < buf->size; i++) {
+        if (buf->scores_host[i] > best_score) {
+            best_score = buf->scores_host[i];
+            best_idx = i;
+        }
+    }
+    return best_idx;
+}
+
+static inline void store_curriculum_states(StaticVec* vec, StateBuffer* buf,
+        int* state_inds, int env_start, int env_count) {
 #if PUFFER_HAS_STATE
     Env* envs = vec->envs;
     for (int i = 0; i < env_count; i++) {
-        states[state_inds[i]] = envs[env_start + i].state;
+        Env* env = &envs[env_start + i];
+        float prio = curriculum_state_priority(env);
+        int state_idx = write_curriculum_state(buf, env, prio);
+        state_inds[i] = state_idx >= 0 ? state_idx : 0;
     }
 #else
     (void)vec;
-    (void)states;
+    (void)buf;
     (void)state_inds;
     (void)env_start;
     (void)env_count;
@@ -1542,6 +1751,152 @@ static inline void load_curriculum_states(StaticVec* vec, const PufferState* sta
 #endif
 }
 
+// Keep states reached by replayed curriculum slots from being overwritten before
+// they can seed new on-policy rollouts and enter the state buffer frontier.
+static inline void promote_curriculum_successors(StaticVec* vec,
+        int num_fresh_envs, int num_cl_envs, long epoch) {
+#if PUFFER_HAS_STATE
+    if (num_fresh_envs <= 0 || num_cl_envs <= 0) {
+        return;
+    }
+
+    Env* envs = vec->envs;
+    int offset = (int)((epoch * (long)num_fresh_envs) % (long)num_cl_envs);
+    for (int i = 0; i < num_fresh_envs; i++) {
+        int src = num_fresh_envs + ((offset + i) % num_cl_envs);
+        envs[i].state = envs[src].state;
+#ifdef PUFFER_STATE_REFRESH
+        PUFFER_STATE_REFRESH(&envs[i]);
+#endif
+    }
+#else
+    (void)vec;
+    (void)num_fresh_envs;
+    (void)num_cl_envs;
+    (void)epoch;
+#endif
+}
+
+static inline void sync_state_priorities_to_device(StateBuffer* buf, cudaStream_t stream) {
+    if (buf->size <= 0) {
+        return;
+    }
+    cudaMemcpyAsync(buf->advantages.data, buf->advantages_host,
+        (size_t)buf->size * sizeof(float), cudaMemcpyHostToDevice, stream);
+}
+
+static inline int fill_frontier_state_inds(StateBuffer* buf, int* out, int count) {
+    if (count <= 0 || buf->size <= 0 || buf->max_priority <= 0.0f) {
+        return 0;
+    }
+
+    int max_score = (int)floorf(buf->max_priority + 0.5f);
+    std::vector<int> by_score((size_t)max_score + 1, -1);
+    for (int i = 0; i < buf->size; i++) {
+        int score = (int)floorf(buf->scores_host[i] + 0.5f);
+        if (score > 0 && score <= max_score && by_score[(size_t)score] < 0) {
+            by_score[(size_t)score] = i;
+        }
+    }
+
+    std::vector<int> candidates;
+    candidates.reserve((size_t)max_score);
+    for (int score = 1; score <= max_score; score++) {
+        if (by_score[(size_t)score] >= 0) {
+            candidates.push_back(by_score[(size_t)score]);
+        }
+    }
+
+    if (candidates.empty()) {
+        int best_idx = 0;
+        float best_score = buf->scores_host[0];
+        for (int i = 1; i < buf->size; i++) {
+            if (buf->scores_host[i] > best_score) {
+                best_score = buf->scores_host[i];
+                best_idx = i;
+            }
+        }
+        for (int i = 0; i < count; i++) {
+            out[i] = best_idx;
+        }
+        return count;
+    }
+
+    int frontier_idx = candidates.back();
+    int frontier_count = count / 4;
+    for (int i = 0; i < frontier_count; i++) {
+        out[i] = frontier_idx;
+    }
+    int remaining = count - frontier_count;
+    for (int i = 0; i < remaining; i++) {
+        out[frontier_count + i] = candidates[(size_t)i % candidates.size()];
+    }
+    return count;
+}
+
+static inline void update_curriculum_step_states(PuffeRL* pufferl, int buf_idx) {
+#if PUFFER_HAS_STATE
+    if (!pufferl->curriculum_enabled) {
+        return;
+    }
+
+    StateBuffer* buf = &pufferl->state_buf;
+    StaticVec* vec = pufferl->vec;
+    if (buf->capacity <= 0) {
+        return;
+    }
+
+    int env_start = vec->buffer_env_starts[buf_idx];
+    int env_count = vec->buffer_env_counts[buf_idx];
+    int agents_per_env = buf->agents_per_env;
+    int first_cl_env = buf->num_fresh_envs;
+    int last_cl_env = first_cl_env + buf->num_cl_envs;
+    Env* envs = vec->envs;
+
+    pthread_mutex_lock(&buf->lock);
+    for (int i = 0; i < env_count; i++) {
+        Env* env = &envs[env_start + i];
+        float prio = curriculum_state_priority(env);
+        write_curriculum_state(buf, env, prio);
+    }
+
+    for (int i = 0; i < env_count; i++) {
+        int env_idx = env_start + i;
+        if (env_idx < first_cl_env || env_idx >= last_cl_env) {
+            continue;
+        }
+
+        int agent_idx = env_idx * agents_per_env;
+        if (vec->terminals[agent_idx] == 0.0f) {
+            continue;
+        }
+
+        int state_idx = buf->env_state_inds_host[env_idx];
+        if (state_idx < 0 || state_idx >= buf->size) {
+            state_idx = best_curriculum_state_slot(buf);
+        }
+        if (state_idx < 0) {
+            continue;
+        }
+
+        Env* env = &envs[env_idx];
+        env->state = buf->states[state_idx];
+#ifdef PUFFER_STATE_REFRESH
+        PUFFER_STATE_REFRESH(env);
+#endif
+    }
+    pthread_mutex_unlock(&buf->lock);
+#else
+    (void)pufferl;
+    (void)buf_idx;
+#endif
+}
+
+extern "C" void curriculum_step_callback_wrapper(void* ctx, int buf, int t) {
+    (void)t;
+    update_curriculum_step_states((PuffeRL*)ctx, buf);
+}
+
 static inline void expand_env_state_inds(StateBuffer* buf) {
     int num_envs = buf->num_fresh_envs + buf->num_cl_envs;
     int agents_per_env = buf->agents_per_env;
@@ -1565,7 +1920,7 @@ void curriculum_rollout_begin(PuffeRL* pufferl) {
     int total_epochs = h->total_timesteps / (h->total_agents * h->horizon);
     float progress = total_epochs > 0 ? (float)pufferl->epoch / (float)total_epochs : 1.0f;
     progress = fminf(1.0f, fmaxf(0.0f, progress));
-    float current_cl_frac = h->cl_frac;
+    float current_cl_frac = h->anneal_cl ? h->cl_frac * progress : h->cl_frac;
     int configured_cl = clamp_int((int)(current_cl_frac * (float)total_envs), 0, total_envs);
     int do_warmup = (buf->size == 0 || buf->size < h->warmup_states);
     int num_cl_envs = do_warmup ? 0 : configured_cl;
@@ -1581,11 +1936,17 @@ void curriculum_rollout_begin(PuffeRL* pufferl) {
     vec->log_env_limit = (num_cl_envs > 0) ? num_fresh_envs : 0;
 
     if (num_cl_envs > 0) {
+        sync_state_priorities_to_device(buf, stream);
+
         compute_state_prio_abs<<<grid_size(buf->size), BLOCK_SIZE, 0, stream>>>(
             buf->advantages.data, buf->prio_bufs.prio_probs.data,
             h->explore_alpha, buf->size);
         compute_state_prio_normalize<<<1, PRIO_BLOCK_SIZE, 0, stream>>>(
             buf->prio_bufs.prio_probs.data, buf->size);
+#ifdef PUFFER_STATE_SCORE
+        fill_frontier_state_inds(buf, buf->env_state_inds_host + num_fresh_envs,
+            num_cl_envs);
+#else
         build_cdf_cuda(buf->prio_bufs.cdf.data, buf->prio_bufs.prio_probs.data,
             buf->prio_bufs.cdf_block_sums.data, buf->size, stream);
         int threads = 256;
@@ -1598,6 +1959,7 @@ void curriculum_rollout_begin(PuffeRL* pufferl) {
         cudaMemcpyAsync(buf->env_state_inds_host + num_fresh_envs, buf->prio_bufs.idx.data,
             num_cl_envs * sizeof(int), cudaMemcpyDeviceToHost, stream);
         cudaStreamSynchronize(stream);
+#endif
 
         load_curriculum_states(vec, buf->states, buf->env_state_inds_host + num_fresh_envs,
             num_fresh_envs, num_cl_envs);
@@ -1613,13 +1975,8 @@ void curriculum_rollout_begin(PuffeRL* pufferl) {
         }
     }
 
-    for (int i = 0; i < num_fresh_envs; i++) {
-        buf->env_state_inds_host[i] = (buf->write_pos + i) % buf->capacity;
-    }
     if (num_fresh_envs > 0) {
-        store_curriculum_states(vec, buf->states, buf->env_state_inds_host, 0, num_fresh_envs);
-        buf->write_pos = (buf->write_pos + num_fresh_envs) % buf->capacity;
-        buf->size = clamp_int(buf->size + num_fresh_envs, 0, buf->capacity);
+        store_curriculum_states(vec, buf, buf->env_state_inds_host, 0, num_fresh_envs);
     }
 
     expand_env_state_inds(buf);
@@ -1648,6 +2005,22 @@ void curriculum_update_advantages(PuffeRL* pufferl, PrecisionTensor* advantages,
             buf->advantages.data, buf->state_inds.data, advantages->data,
             0, num_fresh_envs, agents_per_env, horizon);
     }
+
+    int total_envs = num_fresh_envs + num_cl_envs;
+    if (total_envs > 0 && buf->size > 0) {
+        cudaMemcpyAsync(buf->advantages_tmp_host, buf->advantages.data,
+            (size_t)buf->size * sizeof(float), cudaMemcpyDeviceToHost, stream);
+        cudaStreamSynchronize(stream);
+        for (int env_idx = 0; env_idx < total_envs; env_idx++) {
+            int agent_start = env_idx * agents_per_env;
+            int state_idx = buf->state_inds_host[agent_start];
+            if (state_idx >= 0 && state_idx < buf->size) {
+                buf->advantages_host[state_idx] = fmaxf(
+                    buf->advantages_host[state_idx],
+                    fabsf(buf->advantages_tmp_host[state_idx]));
+            }
+        }
+    }
 }
 
 // Experience the puffer advantage! Generalized advantage estimation + V-Trace
@@ -1968,6 +2341,15 @@ void train_impl(PuffeRL& pufferl) {
     cudaEventRecord(pufferl.profile.events[1]);  // pre-loop end
 
     int total_minibatches = hypers.replay_ratio * batch_size / hypers.minibatch_size;
+    int completed_minibatches = 0;
+    float last_kl_sum = 0.0f;
+    float last_loss_n = 0.0f;
+    if (hypers.target_kl > 0.0f) {
+        cudaMemcpy(&last_kl_sum, pufferl.losses_puf.data + LOSS_APPROX_KL,
+            sizeof(float), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&last_loss_n, pufferl.losses_puf.data + LOSS_N,
+            sizeof(float), cudaMemcpyDeviceToHost);
+    }
     for (int mb = 0; mb < total_minibatches; ++mb) {
         cudaEventRecord(pufferl.profile.events[2]);  // start of misc (overwritten each iter)
         puf_zero(&advantages_puf, train_stream);
@@ -2024,7 +2406,9 @@ void train_impl(PuffeRL& pufferl) {
             cudaStream_t stream = train_stream;
             PrecisionTensor obs_puf = graph.mb_obs;
             PrecisionTensor state_puf = graph.mb_state;
-            PrecisionTensor dec_puf = policy_forward_train(&pufferl.policy, pufferl.weights, pufferl.train_activations, obs_puf, state_puf, stream);
+            PrecisionTensor dec_puf = policy_forward_train(&pufferl.policy,
+                pufferl.weights, pufferl.train_activations,
+                obs_puf, state_puf, stream);
             DecoderWeights* dw_train = (DecoderWeights*)pufferl.weights.decoder;
             PrecisionTensor p_logstd;
             if (dw_train->continuous) {
@@ -2079,22 +2463,39 @@ void train_impl(PuffeRL& pufferl) {
                 (const char*)graph.mb_newvalue.data, num_idx, row_bytes);
         }
         cudaEventRecord(pufferl.profile.events[4]);  // end forward
+        completed_minibatches += 1;
+
+        if (hypers.target_kl > 0.0f && mb + 1 < total_minibatches) {
+            float kl_sum = 0.0f;
+            float loss_n = 0.0f;
+            cudaMemcpy(&kl_sum, pufferl.losses_puf.data + LOSS_APPROX_KL,
+                sizeof(float), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&loss_n, pufferl.losses_puf.data + LOSS_N,
+                sizeof(float), cudaMemcpyDeviceToHost);
+            float denom = loss_n - last_loss_n;
+            float mb_kl = denom > 0.0f ? (kl_sum - last_kl_sum) / denom : 0.0f;
+            last_kl_sum = kl_sum;
+            last_loss_n = loss_n;
+            if (mb_kl > hypers.target_kl) {
+                break;
+            }
+        }
     }
     pufferl.epoch += 1;
 
     cudaStreamSynchronize(pufferl.default_stream);
 
-    if (total_minibatches > 0) {
+    if (completed_minibatches > 0) {
         float ms;
         // Pre-loop setup (transpose, advantage, allocs)
         cudaEventElapsedTime(&ms, pufferl.profile.events[0], pufferl.profile.events[1]);
         pufferl.profile.accum[PROF_TRAIN_MISC] += ms;
         // In-loop misc (last iteration, representative) scaled by count
         cudaEventElapsedTime(&ms, pufferl.profile.events[2], pufferl.profile.events[3]);
-        pufferl.profile.accum[PROF_TRAIN_MISC] += ms * total_minibatches;
+        pufferl.profile.accum[PROF_TRAIN_MISC] += ms * completed_minibatches;
         // In-loop forward (last iteration, representative) scaled by count
         cudaEventElapsedTime(&ms, pufferl.profile.events[3], pufferl.profile.events[4]);
-        pufferl.profile.accum[PROF_TRAIN_FORWARD] += ms * total_minibatches;
+        pufferl.profile.accum[PROF_TRAIN_FORWARD] += ms * completed_minibatches;
     }
 
 }
@@ -2635,7 +3036,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
     }
 
     create_static_threads(vec, hypers.num_threads, horizon, pufferl.get(),
-        net_callback_wrapper, thread_init_wrapper);
+        net_callback_wrapper, post_step_callback_wrapper, thread_init_wrapper);
     static_vec_reset(vec);
 
     if (hypers.profile) {
@@ -2657,9 +3058,13 @@ void close_impl(PuffeRL& pufferl) {
         cudaProfilerStop();
     }
 
-    cudaGraphExecDestroy(pufferl.train_cudagraph);
-    for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) {
-        cudaGraphExecDestroy(pufferl.fused_rollout_cudagraphs[i]);
+    if (pufferl.train_captured) {
+        cudaGraphExecDestroy(pufferl.train_cudagraph);
+    }
+    if (pufferl.rollout_captured && pufferl.fused_rollout_cudagraphs != NULL) {
+        for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) {
+            cudaGraphExecDestroy(pufferl.fused_rollout_cudagraphs[i]);
+        }
     }
 
     policy_weights_free(&pufferl.policy, &pufferl.weights);
diff --git a/src/vecenv.h b/src/vecenv.h
index 38fb7308aa..b08fb20e3f 100644
--- a/src/vecenv.h
+++ b/src/vecenv.h
@@ -124,6 +124,7 @@ typedef struct StaticVec {
 
 // Callback types
 typedef void (*net_callback_fn)(void* ctx, int buf, int t);
+typedef void (*step_callback_fn)(void* ctx, int buf, int t);
 typedef void (*thread_init_fn)(void* ctx, int buf);
 typedef void (*step_fn)(void* env);
 
@@ -140,7 +141,8 @@ void static_vec_close(StaticVec* vec);
 void static_vec_log(StaticVec* vec, Dict* out);
 void static_vec_eval_log(StaticVec* vec, Dict* out);
 void create_static_threads(StaticVec* vec, int num_threads, int horizon,
-    void* ctx, net_callback_fn net_callback, thread_init_fn thread_init);
+    void* ctx, net_callback_fn net_callback, step_callback_fn step_callback,
+    thread_init_fn thread_init);
 void static_vec_omp_step(StaticVec* vec);
 void static_vec_seq_step(StaticVec* vec);
 void static_vec_render(StaticVec* vec, int env_id);
@@ -280,6 +282,7 @@ typedef struct StaticOMPArg {
     int horizon;
     void* ctx;
     net_callback_fn net_callback;
+    step_callback_fn step_callback;
     thread_init_fn thread_init;
 } StaticOMPArg;
 
@@ -292,6 +295,7 @@ static void* static_omp_threadmanager(void* arg) {
     int horizon = worker_arg->horizon;
     void* ctx = worker_arg->ctx;
     net_callback_fn net_callback = worker_arg->net_callback;
+    step_callback_fn step_callback = worker_arg->step_callback;
     thread_init_fn thread_init = worker_arg->thread_init;
 
     if (thread_init != NULL) {
@@ -343,11 +347,6 @@ static void* static_omp_threadmanager(void* arg) {
             clock_gettime(CLOCK_MONOTONIC, &t1);
             my_accum[EVAL_ENV_STEP] += (t1.tv_sec - t0.tv_sec) * 1000.0f + (t1.tv_nsec - t0.tv_nsec) / 1e6f;
 
-            cudaMemcpyAsync(
-                vec->gpu_observations.data + agent_start * OBS_SIZE,
-                vec->observations.data + agent_start * OBS_SIZE,
-                agents_per_buffer * OBS_SIZE * obs_element_size(),
-                cudaMemcpyHostToDevice, stream);
             cudaMemcpyAsync(
                 &vec->gpu_rewards[agent_start],
                 &vec->rewards[agent_start],
@@ -358,6 +357,14 @@ static void* static_omp_threadmanager(void* arg) {
                 &vec->terminals[agent_start],
                 agents_per_buffer * sizeof(float),
                 cudaMemcpyHostToDevice, stream);
+            if (step_callback != NULL) {
+                step_callback(ctx, buf, t);
+            }
+            cudaMemcpyAsync(
+                vec->gpu_observations.data + agent_start * OBS_SIZE,
+                vec->observations.data + agent_start * OBS_SIZE,
+                agents_per_buffer * OBS_SIZE * obs_element_size(),
+                cudaMemcpyHostToDevice, stream);
 #ifdef MY_ACTION_MASK
             cudaMemcpyAsync(
                 vec->gpu_action_mask + agent_start * MY_ACTION_MASK,
@@ -638,7 +645,8 @@ void static_vec_reset(StaticVec* vec) {
 }
 
 void create_static_threads(StaticVec* vec, int num_threads, int horizon,
-        void* ctx, net_callback_fn net_callback, thread_init_fn thread_init) {
+        void* ctx, net_callback_fn net_callback, step_callback_fn step_callback,
+        thread_init_fn thread_init) {
     vec->threading = (StaticThreading*)calloc(1, sizeof(StaticThreading));
     vec->threading->num_threads = num_threads;
     vec->threading->num_buffers = vec->buffers;
@@ -656,6 +664,7 @@ void create_static_threads(StaticVec* vec, int num_threads, int horizon,
         args[i].horizon = horizon;
         args[i].ctx = ctx;
         args[i].net_callback = net_callback;
+        args[i].step_callback = step_callback;
         args[i].thread_init = thread_init;
         pthread_create(&vec->threading->threads[i], NULL, static_omp_threadmanager, &args[i]);
     }
diff --git a/tests/profile_kernels.cu b/tests/profile_kernels.cu
index a2af169838..a054a03be0 100644
--- a/tests/profile_kernels.cu
+++ b/tests/profile_kernels.cu
@@ -602,7 +602,8 @@ void profile_im2col(int B, int IC, int IH, int IW, int K, int S, int OH, int OW)
 }
 
 typedef struct {
-    PrecisionTensor state_advantages, rollout_advantages, importance;
+    FloatTensor state_advantages;
+    PrecisionTensor rollout_advantages, importance;
     FloatTensor prio_probs, cdf, cdf_block_sums;
     IntTensor sample_idx, state_inds;
     LongTensor rng_offset;
@@ -622,7 +623,7 @@ typedef struct {
 } CurriculumProfile;
 
 __global__ void init_curriculum_profile_kernel(
-        precision_t* state_advantages,
+        float* state_advantages,
         precision_t* rollout_advantages,
         int* state_inds,
         int64_t* rng_offset,
@@ -631,7 +632,7 @@ __global__ void init_curriculum_profile_kernel(
     int n = capacity > total_agents * horizon ? capacity : total_agents * horizon;
     if (idx < capacity) {
         float v = 0.25f + (float)(idx % 257) * 0.001f;
-        state_advantages[idx] = from_float(v);
+        state_advantages[idx] = v;
     }
     if (idx < total_agents) {
         state_inds[idx] = idx % capacity;
@@ -846,7 +847,7 @@ EnvSpeedArgs* create_envspeed(int total_agents, int num_buffers, int num_threads
         cudaStreamCreateWithFlags(&vec->streams[i], cudaStreamNonBlocking);
 
     printf("Created %d envs (%s) for %d total_agents\n", vec->size, TOSTRING(ENV_NAME), total_agents);
-    create_static_threads(vec, num_threads, horizon, nullptr, empty_net_callback, empty_thread_init);
+    create_static_threads(vec, num_threads, horizon, nullptr, empty_net_callback, nullptr, empty_thread_init);
     static_vec_reset(vec);
     cudaDeviceSynchronize();