diff --git a/build.sh b/build.sh index 7e5036b1bf..d38dea79d8 100755 --- a/build.sh +++ b/build.sh @@ -18,6 +18,10 @@ if [ -z "$1" ]; then fi ENV=$1 shift +PYTHON_BIN="${PYTHON:-python}" +if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then + PYTHON_BIN=python3 +fi for arg in "$@"; do case $arg in @@ -52,9 +56,22 @@ fi # Linux/mac PLATFORM="$(uname -s)" +ARCH_NAME="$(uname -m)" if [ "$PLATFORM" = "Linux" ]; then - RAYLIB_NAME='raylib-5.5_linux_amd64' - OMP_LIB=-lomp5 + if [ "$ARCH_NAME" = "aarch64" ] || [ "$ARCH_NAME" = "arm64" ]; then + RAYLIB_NAME='raylib-5.5_linux_aarch64' + else + RAYLIB_NAME='raylib-5.5_linux_amd64' + fi + if ldconfig -p 2>/dev/null | grep -q 'libomp5\.so'; then + OMP_LIB=-lomp5 + elif ldconfig -p 2>/dev/null | grep -q 'libomp\.so\.5'; then + OMP_LIB=-l:libomp.so.5 + elif ldconfig -p 2>/dev/null | grep -q 'libomp\.so'; then + OMP_LIB=-lomp + else + OMP_LIB=-lgomp + fi SANITIZE_FLAGS=(-fsanitize=address,undefined,bounds,pointer-overflow,leak -fno-omit-frame-pointer) STANDALONE_LDFLAGS=(-lGL) SHARED_LDFLAGS=(-Bsymbolic-functions -Wl,--gc-sections) @@ -79,10 +96,16 @@ CLANG_WARN=( download() { local name=$1 url=$2 [ -d "$name" ] && return + for fallback in "$HOME/pufferlib/$name" "$HOME/pufferlib-4.0/$name"; do + if [ -d "$fallback" ]; then + ln -s "$fallback" "$name" + return + fi + done echo "Downloading $name..." case "$url" in - *.zip) curl -sL "$url" -o "$name.zip" && unzip -q "$name.zip" && rm "$name.zip" ;; - *) curl -sL "$url" -o "$name.tar.gz" && tar xf "$name.tar.gz" && rm "$name.tar.gz" ;; + *.zip) curl -fLsS "$url" -o "$name.zip" && unzip -q "$name.zip" && rm "$name.zip" ;; + *) curl -fLsS "$url" -o "$name.tar.gz" && tar xf "$name.tar.gz" && rm "$name.tar.gz" ;; esac } @@ -172,6 +195,7 @@ fi CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}} CUDNN_IFLAG="" CUDNN_LFLAG="" +CUDNN_LIB="-lcudnn" for dir in /usr/local/cuda/include /usr/include; do if [ -f "$dir/cudnn.h" ]; then CUDNN_IFLAG="-I$dir" @@ -185,10 +209,13 @@ for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do fi done if [ -z "$CUDNN_IFLAG" ]; then - CUDNN_IFLAG=$(python -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "") + CUDNN_IFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "") fi if [ -z "$CUDNN_LFLAG" ]; then - CUDNN_LFLAG=$(python -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "") + CUDNN_LFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "") +fi +if [ -n "$CUDNN_LFLAG" ] && [ ! -f "${CUDNN_LFLAG#-L}/libcudnn.so" ] && [ -f "${CUDNN_LFLAG#-L}/libcudnn.so.9" ]; then + CUDNN_LIB="-l:libcudnn.so.9" fi # NCCL include/lib fallback (mirrors the cuDNN fallback above). @@ -202,10 +229,10 @@ for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi done if [ -z "$NCCL_IFLAG" ]; then - NCCL_IFLAG=$(python -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "") + NCCL_IFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "") fi if [ -z "$NCCL_LFLAG" ]; then - NCCL_LFLAG=$(python -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "") + NCCL_LFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "") fi WHEEL_RPATH_FLAGS=() @@ -218,14 +245,18 @@ done export CCACHE_DIR="${CCACHE_DIR:-$HOME/.ccache}" export CCACHE_BASEDIR="$(pwd)" export CCACHE_COMPILERCHECK=content -NVCC="ccache $CUDA_HOME/bin/nvcc" +if command -v ccache >/dev/null; then + NVCC="ccache $CUDA_HOME/bin/nvcc" +else + NVCC="$CUDA_HOME/bin/nvcc" +fi CC="${CC:-$(command -v ccache >/dev/null && echo 'ccache clang' || echo 'clang')}" ARCH=${NVCC_ARCH:-native} -PYTHON_INCLUDE=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") -PYBIND_INCLUDE=$(python -c "import pybind11; print(pybind11.get_include())") -NUMPY_INCLUDE=$(python -c "import numpy; print(numpy.get_include())") -EXT_SUFFIX=$(python -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))") +PYTHON_INCLUDE=$("$PYTHON_BIN" -c "import sysconfig; print(sysconfig.get_path('include'))") +PYBIND_INCLUDE=$("$PYTHON_BIN" -c "import pybind11; print(pybind11.get_include())") +NUMPY_INCLUDE=$("$PYTHON_BIN" -c "import numpy; print(numpy.get_include())") +EXT_SUFFIX=$("$PYTHON_BIN" -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))") OUTPUT="pufferlib/_C${EXT_SUFFIX}" BINDING_SRC="$SRC_DIR/binding.c" @@ -273,7 +304,7 @@ if [ -z "$MODE" ]; then build/bindings.o "$RAYLIB_A" -L$CUDA_HOME/lib64 $CUDNN_LFLAG $NCCL_LFLAG "${WHEEL_RPATH_FLAGS[@]}" - -lcudart -lnccl -lnvidia-ml -lcublas -lcusolver -lcurand -lcudnn + -lcudart -lnccl -lnvidia-ml -lcublas -lcusolver -lcurand $CUDNN_LIB $OMP_LIB $LINK_OPT "${SHARED_LDFLAGS[@]}" -o "$OUTPUT" @@ -314,7 +345,7 @@ elif [ "$MODE" = "profile" ]; then -Xcompiler=-fopenmp \ tests/profile_kernels.cu vendor/ini.c \ "$RAYLIB_A" \ - -lnccl -lnvidia-ml -lcublas -lcurand -lcudnn \ + -lnccl -lnvidia-ml -lcublas -lcurand $CUDNN_LIB \ -lGL -lm -lpthread $OMP_LIB \ -o profile echo "Built: ./profile" diff --git a/config/default.ini b/config/default.ini index 56cf4ec7f2..3e5aafb9d7 100644 --- a/config/default.ini +++ b/config/default.ini @@ -68,6 +68,7 @@ gamma = 0.995 gae_lambda = 0.90 replay_ratio = 1.0 clip_coef = 0.2 +target_kl = 0.0 vf_coef = 2.0 vf_clip_coef = 0.2 max_grad_norm = 1.5 @@ -89,6 +90,7 @@ prio_alpha = 0.8 prio_beta0 = 0.2 state_buffer_size = 0 cl_frac = 0 +anneal_cl = 0 warmup_states = 0 explore_alpha = 0.0 explore_beta = 0.0 diff --git a/config/password.ini b/config/password.ini index 90419df644..e698121a92 100644 --- a/config/password.ini +++ b/config/password.ini @@ -3,31 +3,33 @@ package = ocean env_name = password [vec] -total_agents = 4096 -num_buffers = 2 -num_threads = 8 +total_agents = 8192 +num_buffers = 4 +num_threads = 4 [policy] -hidden_size = 128 -num_layers = 2 +hidden_size = 256 +num_layers = 0 [train] -total_timesteps = 100_000_000 -learning_rate = 0.003 -gamma = 0.99 -gae_lambda = 0.95 -replay_ratio = 1.0 -minibatch_size = 32768 -horizon = 64 -ent_coef = 0.01 +total_timesteps = 300_000_000 +learning_rate = 0.00993091 +min_lr_ratio = 0.05 +ent_coef = 0.0 +gamma = 0.998755 +replay_ratio = 1 +minibatch_size = 8192 +vf_coef = 3.50617 +max_grad_norm = 0.1 prio_alpha = 0.0 -prio_beta0 = 1.0 +prio_beta0 = 0.0 -#state_buffer_size = 100_000 -#cl_frac = 0.8 -#warmup_states = 10_000 -#explore_alpha = 0.8 -#explore_beta = 0.0 +state_buffer_size = 1_000_000 +cl_frac = 0.9 +anneal_cl = 1 +warmup_states = 100_000 +explore_alpha = 0.99 +explore_beta = 0.0 [sweep] metric = score diff --git a/ocean/password/binding.c b/ocean/password/binding.c index ea8fb655c9..a49ca854d7 100644 --- a/ocean/password/binding.c +++ b/ocean/password/binding.c @@ -5,6 +5,7 @@ #define OBS_TENSOR_T ByteTensor #define PUFFER_HAS_STATE 1 #define PUFFER_STATE_REFRESH(env) refresh_state(env) +#define PUFFER_STATE_SCORE(env) ((env)->state.pos) #define Env Password #include "vecenv.h" diff --git a/ocean/password/password.h b/ocean/password/password.h index 0b7ca43bab..f1838ab009 100644 --- a/ocean/password/password.h +++ b/ocean/password/password.h @@ -4,10 +4,10 @@ #include #include "raylib.h" -#define LENGTH 64 +#define LENGTH 128 #define NUM_DIGITS 9 #define PASSWORD_SEED 42u -#define CORRECT_REWARD 0.1f +#define CORRECT_REWARD 1.0f #define WRONG_REWARD -1.0f typedef struct { diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 03b3b828c2..2a238655a4 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -617,7 +617,9 @@ def load_config(env_name): p = configparser.ConfigParser() p.read(puffer_default_config) else: - for path in glob.glob(puffer_config_dir, recursive=True): + paths = sorted(glob.glob(puffer_config_dir, recursive=True)) + paths.sort(key=lambda path: os.path.splitext(os.path.basename(path))[0] != env_name) + for path in paths: p = configparser.ConfigParser() p.read([puffer_default_config, path]) if env_name in p['base']['env_name'].split(): break diff --git a/src/bindings.cu b/src/bindings.cu index 0f1bf1d3de..d085588a23 100644 --- a/src/bindings.cu +++ b/src/bindings.cu @@ -40,6 +40,10 @@ pybind11::dict puf_log(pybind11::object pufferl_obj) { for (int i = 0; i < env_out->size; i++) { env_dict[env_out->items[i].key] = env_out->items[i].value; } + if (pufferl.curriculum_enabled) { + env_dict["state_size"] = pufferl.state_buf.size; + env_dict["state_max"] = pufferl.state_buf.max_priority; + } result["env"] = env_dict; // Losses @@ -426,6 +430,7 @@ std::unique_ptr create_pufferl(py::dict args) { hypers.max_grad_norm = get_config(train_kwargs, "max_grad_norm"); // PPO hypers.clip_coef = get_config(train_kwargs, "clip_coef"); + hypers.target_kl = get_config(train_kwargs, "target_kl"); hypers.vf_clip_coef = get_config(train_kwargs, "vf_clip_coef"); hypers.vf_coef = get_config(train_kwargs, "vf_coef"); hypers.ent_coef = get_config(train_kwargs, "ent_coef"); @@ -443,6 +448,7 @@ std::unique_ptr create_pufferl(py::dict args) { // Curriculum state buffer hypers.state_buffer_size = get_config(train_kwargs, "state_buffer_size"); hypers.cl_frac = get_config(train_kwargs, "cl_frac"); + hypers.anneal_cl = get_config(train_kwargs, "anneal_cl"); hypers.warmup_states = get_config(train_kwargs, "warmup_states"); hypers.explore_alpha = get_config(train_kwargs, "explore_alpha"); hypers.explore_beta = get_config(train_kwargs, "explore_beta"); @@ -566,6 +572,7 @@ PYBIND11_MODULE(_C, m) { .def_readwrite("total_timesteps", &HypersT::total_timesteps) .def_readwrite("max_grad_norm", &HypersT::max_grad_norm) .def_readwrite("clip_coef", &HypersT::clip_coef) + .def_readwrite("target_kl", &HypersT::target_kl) .def_readwrite("vf_clip_coef", &HypersT::vf_clip_coef) .def_readwrite("vf_coef", &HypersT::vf_coef) .def_readwrite("ent_coef", &HypersT::ent_coef) @@ -579,6 +586,7 @@ PYBIND11_MODULE(_C, m) { .def_readwrite("prio_beta0", &HypersT::prio_beta0) .def_readwrite("state_buffer_size", &HypersT::state_buffer_size) .def_readwrite("cl_frac", &HypersT::cl_frac) + .def_readwrite("anneal_cl", &HypersT::anneal_cl) .def_readwrite("warmup_states", &HypersT::warmup_states) .def_readwrite("explore_alpha", &HypersT::explore_alpha) .def_readwrite("explore_beta", &HypersT::explore_beta) diff --git a/src/ocean.cu b/src/ocean.cu index baaa9b7be6..fc81b4c791 100644 --- a/src/ocean.cu +++ b/src/ocean.cu @@ -570,6 +570,108 @@ static void* nmmo3_encoder_create_weights(void* self) { static void nmmo3_encoder_free_weights(void* weights) { free(weights); } static void nmmo3_encoder_free_activations(void* activations) { free(activations); } +// ---- Password encoder ---- +// +// Password's observation is the solved prefix followed by zeroes. Replaying +// saved env states with a recurrent policy puts the policy hidden state off the +// trajectory that would have produced that prefix from a fresh reset. For this +// diagnostic env, expose the prefix index directly and run it feed-forward. + +struct PasswordEncoderWeights { + int obs_size, hidden; +}; + +struct PasswordEncoderActivations { + PrecisionTensor out; +}; + +__global__ void password_features_kernel( + precision_t* __restrict__ out, const precision_t* __restrict__ obs, + int B, int obs_size, int hidden) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * hidden) { + return; + } + + int b = idx / hidden; + int h = idx % hidden; + const precision_t* row = obs + b * obs_size; + + int pos = obs_size; + for (int i = 0; i < obs_size; i++) { + if (to_float(row[i]) <= 0.5f) { + pos = i; + break; + } + } + + float value = 0.0f; + if (h < obs_size) { + value = h == pos ? 1.0f : 0.0f; + } else if (h < 2 * obs_size) { + int j = h - obs_size; + value = to_float(row[j]) * (1.0f / 9.0f); + } + out[idx] = from_float(value); +} + +static PrecisionTensor password_encoder_forward( + void* w, void* activations, PrecisionTensor input, cudaStream_t stream) { + PasswordEncoderWeights* ew = (PasswordEncoderWeights*)w; + PasswordEncoderActivations* a = (PasswordEncoderActivations*)activations; + int B = input.shape[0]; + password_features_kernel<<hidden), BLOCK_SIZE, 0, stream>>>( + a->out.data, input.data, B, ew->obs_size, ew->hidden); + return a->out; +} + +static void password_encoder_backward( + void* w, void* activations, PrecisionTensor grad, cudaStream_t stream) { + (void)w; + (void)activations; + (void)grad; + (void)stream; +} + +static void password_encoder_init_weights(void* w, ulong* seed, cudaStream_t stream) { + (void)w; + (void)seed; + (void)stream; +} + +static void password_encoder_reg_params(void* w, Allocator* alloc) { + (void)w; + (void)alloc; +} + +static void password_encoder_reg_train( + void* w, void* activations, Allocator* acts, Allocator* grads, int B_TT) { + (void)grads; + PasswordEncoderWeights* ew = (PasswordEncoderWeights*)w; + PasswordEncoderActivations* a = (PasswordEncoderActivations*)activations; + a->out = {.shape = {B_TT, ew->hidden}}; + alloc_register(acts, &a->out); +} + +static void password_encoder_reg_rollout( + void* w, void* activations, Allocator* alloc, int B) { + PasswordEncoderWeights* ew = (PasswordEncoderWeights*)w; + PasswordEncoderActivations* a = (PasswordEncoderActivations*)activations; + a->out = {.shape = {B, ew->hidden}}; + alloc_register(alloc, &a->out); +} + +static void* password_encoder_create_weights(void* self) { + Encoder* e = (Encoder*)self; + PasswordEncoderWeights* ew = (PasswordEncoderWeights*)calloc(1, sizeof(PasswordEncoderWeights)); + ew->obs_size = e->in_dim; + ew->hidden = e->out_dim; + return ew; +} + +static void password_encoder_free_weights(void* weights) { free(weights); } +static void password_encoder_free_activations(void* activations) { free(activations); } + // Override encoder vtable for known ocean environments. No-op for unknown envs. static void create_custom_encoder(const std::string& env_name, Encoder* enc) { if (env_name == "nmmo3") { @@ -586,5 +688,19 @@ static void create_custom_encoder(const std::string& env_name, Encoder* enc) { .in_dim = enc->in_dim, .out_dim = enc->out_dim, .activation_size = sizeof(NMMO3EncoderActivations), }; + } else if (env_name == "password") { + *enc = Encoder{ + .forward = password_encoder_forward, + .backward = password_encoder_backward, + .init_weights = password_encoder_init_weights, + .reg_params = password_encoder_reg_params, + .reg_train = password_encoder_reg_train, + .reg_rollout = password_encoder_reg_rollout, + .create_weights = password_encoder_create_weights, + .free_weights = password_encoder_free_weights, + .free_activations = password_encoder_free_activations, + .in_dim = enc->in_dim, .out_dim = enc->out_dim, + .activation_size = sizeof(PasswordEncoderActivations), + }; } } diff --git a/src/pufferlib.cu b/src/pufferlib.cu index a6f9d3c9ff..7bf5f34763 100644 --- a/src/pufferlib.cu +++ b/src/pufferlib.cu @@ -212,6 +212,10 @@ struct PrioBuffers { struct StateBuffer { PufferState* states; // CPU state_buffer_size entries + float* scores_host; // CPU state scores used for frontier sampling + int* score_counts_host; // CPU count per integer score bucket + float* advantages_host; // CPU mirror used by post-step state capture + float* advantages_tmp_host; int capacity; int size; int write_pos; @@ -221,12 +225,15 @@ struct StateBuffer { int num_fresh_envs; int num_cl_agents; int num_fresh_agents; + float max_priority; + int score_count_capacity; int* env_state_inds_host; // CPU scratch, length num_envs int* state_inds_host; // CPU scratch, length total_agents - PrecisionTensor advantages; // GPU, shape {state_buffer_size} + FloatTensor advantages; // GPU, shape {state_buffer_size} PrecisionTensor importance; // GPU, shape {total_agents}; fresh=1, CL=PER IS weight IntTensor state_inds; // GPU, shape {total_agents} PrioBuffers prio_bufs; // GPU CDF/probs/idx/weights for curriculum + pthread_mutex_t lock; }; void register_prio_buffers(PrioBuffers& bufs, Allocator* alloc, int B, int minibatch_segments) { @@ -325,6 +332,7 @@ typedef struct { float max_grad_norm; // PPO float clip_coef; + float target_kl; float vf_clip_coef; float vf_coef; float ent_coef; @@ -345,6 +353,7 @@ typedef struct { // Curriculum state buffer int state_buffer_size; float cl_frac; + bool anneal_cl; int warmup_states; float explore_alpha; float explore_beta; @@ -751,6 +760,65 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) { profile_end(hypers.profile); } +static inline void update_curriculum_step_states(PuffeRL* pufferl, int buf_idx); + +__global__ void zero_terminal_states_kernel( + precision_t* __restrict__ states, + const float* __restrict__ terminals, + int num_layers, int batch, int hidden_size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total = num_layers * batch * hidden_size; + if (idx >= total) { + return; + } + + int b = (idx / hidden_size) % batch; + if (terminals[b] != 0.0f) { + states[idx] = from_float(0.0f); + } +} + +static inline void zero_terminal_states( + PrecisionTensor* states, const float* terminals, + int num_layers, int batch, int hidden_size, cudaStream_t stream) { + if (states == NULL || states->data == NULL || batch <= 0) { + return; + } + int total = num_layers * batch * hidden_size; + zero_terminal_states_kernel<<>>( + states->data, terminals, num_layers, batch, hidden_size); +} + +static inline void reset_terminated_recurrent_states(PuffeRL* pufferl, int buf) { + HypersT& hypers = pufferl->hypers; + int block_size = pufferl->vec->total_agents / hypers.num_buffers; + int start = buf * block_size; + int num_banks = 1 + pufferl->num_frozen_banks; + cudaStream_t stream = pufferl->vec->streams[buf]; + + for (int b = 0; b < num_banks; b++) { + int bank_off = pufferl->bank_layout ? pufferl->bank_layout[b] : 0; + int bank_end = pufferl->bank_layout ? pufferl->bank_layout[b + 1] : block_size; + int bank_size = bank_end - bank_off; + if (bank_size <= 0) { + continue; + } + + PrecisionTensor* states = b == 0 + ? &pufferl->buffer_states[buf] + : &pufferl->frozen_banks[b - 1].buffer_states[buf]; + zero_terminal_states(states, pufferl->env.terminals.data + start + bank_off, + (int)states->shape[0], bank_size, (int)states->shape[2], stream); + } +} + +extern "C" void post_step_callback_wrapper(void* ctx, int buf, int t) { + (void)t; + PuffeRL* pufferl = (PuffeRL*)ctx; + update_curriculum_step_states(pufferl, buf); + reset_terminated_recurrent_states(pufferl, buf); +} + __device__ __forceinline__ float load_logit_masked( const precision_t* __restrict__ logits, int logits_base, @@ -1175,7 +1243,7 @@ __global__ void compute_prio_adv_reduction( } __global__ void scatter_state_advantages( - precision_t* __restrict__ dst, + float* __restrict__ dst, const int* __restrict__ state_inds, const precision_t* __restrict__ advantages_bt, int env_start, int env_count, int agents_per_env, int horizon) { @@ -1189,12 +1257,12 @@ __global__ void scatter_state_advantages( for (int a = 0; a < agents_per_env; a++) { sum_abs += fabsf(to_float(advantages_bt[(agent_start + a) * horizon])); } - dst[state_inds[agent_start]] = from_float(sum_abs / (float)agents_per_env); + dst[state_inds[agent_start]] = sum_abs / (float)agents_per_env; } } __global__ void compute_state_prio_abs( - const precision_t* __restrict__ advantages, + const float* __restrict__ advantages, float* __restrict__ prio_weights, float prio_alpha, int length) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -1202,7 +1270,7 @@ __global__ void compute_state_prio_abs( return; } - float adv = fabsf(to_float(advantages[idx])); + float adv = fabsf(advantages[idx]); float pw; if (prio_alpha == 0.0f) { pw = 1.0f; @@ -1461,29 +1529,55 @@ int init_state_buffer(PuffeRL* pufferl) { size_t state_bytes = capacity * state_size; buf->states = (PufferState*)malloc(state_bytes); + buf->scores_host = (float*)calloc(capacity, sizeof(float)); + buf->score_count_capacity = 4096; + buf->score_counts_host = (int*)calloc( + (size_t)buf->score_count_capacity, sizeof(int)); + buf->advantages_host = (float*)calloc(capacity, sizeof(float)); + buf->advantages_tmp_host = (float*)calloc(capacity, sizeof(float)); buf->env_state_inds_host = (int*)malloc((size_t)buf->num_envs * sizeof(int)); buf->state_inds_host = (int*)malloc((size_t)pufferl->hypers.total_agents * sizeof(int)); - if (buf->states == NULL || buf->env_state_inds_host == NULL || buf->state_inds_host == NULL) { + if (buf->states == NULL || buf->scores_host == NULL || buf->score_counts_host == NULL + || buf->advantages_host == NULL || buf->advantages_tmp_host == NULL + || buf->env_state_inds_host == NULL || buf->state_inds_host == NULL) { fprintf(stderr, "Failed to allocate curriculum state buffer: capacity=%d state_size=%d bytes=%zu\n", buf->capacity, (int)state_size, state_bytes); free(buf->states); + free(buf->scores_host); + free(buf->score_counts_host); + free(buf->advantages_host); + free(buf->advantages_tmp_host); free(buf->env_state_inds_host); free(buf->state_inds_host); buf->states = NULL; + buf->scores_host = NULL; + buf->score_counts_host = NULL; + buf->advantages_host = NULL; + buf->advantages_tmp_host = NULL; buf->env_state_inds_host = NULL; buf->state_inds_host = NULL; return 0; } + pthread_mutex_init(&buf->lock, NULL); return 1; } void close_state_buffer(PuffeRL* pufferl) { StateBuffer* buf = &pufferl->state_buf; free(buf->states); + free(buf->scores_host); + free(buf->score_counts_host); + free(buf->advantages_host); + free(buf->advantages_tmp_host); free(buf->env_state_inds_host); free(buf->state_inds_host); + pthread_mutex_destroy(&buf->lock); buf->states = NULL; + buf->scores_host = NULL; + buf->score_counts_host = NULL; + buf->advantages_host = NULL; + buf->advantages_tmp_host = NULL; buf->env_state_inds_host = NULL; buf->state_inds_host = NULL; } @@ -1504,16 +1598,131 @@ static int fixed_agents_per_env(StaticVec* vec) { return agents_per_env; } -static inline void store_curriculum_states(StaticVec* vec, PufferState* states, - const int* state_inds, int env_start, int env_count) { +static inline float curriculum_state_priority(Env* env) { +#if PUFFER_HAS_STATE +#ifdef PUFFER_STATE_SCORE + float score = (float)PUFFER_STATE_SCORE(env); + return score > 0.0f ? score : 0.0f; +#else + (void)env; + return 1.0f; +#endif +#else + (void)env; + return 0.0f; +#endif +} + +static inline int curriculum_score_bucket(StateBuffer* buf, float prio) { + int bucket = (int)floorf(prio + 0.5f); + if (bucket < 0 || bucket >= buf->score_count_capacity) { + return -1; + } + return bucket; +} + +static inline int curriculum_score_count(StateBuffer* buf, float prio) { + int bucket = curriculum_score_bucket(buf, prio); + return bucket >= 0 ? buf->score_counts_host[bucket] : 0; +} + +static inline int reserve_curriculum_state_slot(StateBuffer* buf, float prio) { + if (prio <= 0.0f || buf->capacity <= 0) { + return -1; + } + + if (buf->size < buf->capacity) { + int state_idx = buf->write_pos; + buf->write_pos = (buf->write_pos + 1) % buf->capacity; + buf->size += 1; + return state_idx; + } + + int same_score_idx = -1; + int duplicate_idx = -1; + float duplicate_prio = 1.0e30f; + int new_bucket = curriculum_score_bucket(buf, prio); + int probes = buf->capacity < 64 ? buf->capacity : 64; + for (int i = 0; i < probes; i++) { + int state_idx = (buf->write_pos + i) % buf->capacity; + float old_prio = buf->scores_host[state_idx]; + int old_bucket = curriculum_score_bucket(buf, old_prio); + if (old_bucket == new_bucket) { + same_score_idx = state_idx; + break; + } + + if (old_prio <= prio) { + if (curriculum_score_count(buf, old_prio) > 1 && old_prio < duplicate_prio) { + duplicate_prio = old_prio; + duplicate_idx = state_idx; + } + } + } + + int best_idx = same_score_idx >= 0 ? same_score_idx + : duplicate_idx; + buf->write_pos = (buf->write_pos + probes) % buf->capacity; + return best_idx; +} + +static inline int write_curriculum_state(StateBuffer* buf, Env* env, float prio) { +#if PUFFER_HAS_STATE + int state_idx = reserve_curriculum_state_slot(buf, prio); + if (state_idx < 0) { + return -1; + } + + int old_bucket = curriculum_score_bucket(buf, buf->scores_host[state_idx]); + if (old_bucket >= 0 && buf->score_counts_host[old_bucket] > 0) { + buf->score_counts_host[old_bucket] -= 1; + } + int new_bucket = curriculum_score_bucket(buf, prio); + if (new_bucket >= 0) { + buf->score_counts_host[new_bucket] += 1; + } + buf->states[state_idx] = env->state; + buf->scores_host[state_idx] = prio; + buf->advantages_host[state_idx] = prio; + buf->max_priority = fmaxf(buf->max_priority, prio); + return state_idx; +#else + (void)buf; + (void)env; + (void)prio; + return -1; +#endif +} + +static inline int best_curriculum_state_slot(StateBuffer* buf) { + if (buf->size <= 0) { + return -1; + } + + int best_idx = 0; + float best_score = buf->scores_host[0]; + for (int i = 1; i < buf->size; i++) { + if (buf->scores_host[i] > best_score) { + best_score = buf->scores_host[i]; + best_idx = i; + } + } + return best_idx; +} + +static inline void store_curriculum_states(StaticVec* vec, StateBuffer* buf, + int* state_inds, int env_start, int env_count) { #if PUFFER_HAS_STATE Env* envs = vec->envs; for (int i = 0; i < env_count; i++) { - states[state_inds[i]] = envs[env_start + i].state; + Env* env = &envs[env_start + i]; + float prio = curriculum_state_priority(env); + int state_idx = write_curriculum_state(buf, env, prio); + state_inds[i] = state_idx >= 0 ? state_idx : 0; } #else (void)vec; - (void)states; + (void)buf; (void)state_inds; (void)env_start; (void)env_count; @@ -1542,6 +1751,152 @@ static inline void load_curriculum_states(StaticVec* vec, const PufferState* sta #endif } +// Keep states reached by replayed curriculum slots from being overwritten before +// they can seed new on-policy rollouts and enter the state buffer frontier. +static inline void promote_curriculum_successors(StaticVec* vec, + int num_fresh_envs, int num_cl_envs, long epoch) { +#if PUFFER_HAS_STATE + if (num_fresh_envs <= 0 || num_cl_envs <= 0) { + return; + } + + Env* envs = vec->envs; + int offset = (int)((epoch * (long)num_fresh_envs) % (long)num_cl_envs); + for (int i = 0; i < num_fresh_envs; i++) { + int src = num_fresh_envs + ((offset + i) % num_cl_envs); + envs[i].state = envs[src].state; +#ifdef PUFFER_STATE_REFRESH + PUFFER_STATE_REFRESH(&envs[i]); +#endif + } +#else + (void)vec; + (void)num_fresh_envs; + (void)num_cl_envs; + (void)epoch; +#endif +} + +static inline void sync_state_priorities_to_device(StateBuffer* buf, cudaStream_t stream) { + if (buf->size <= 0) { + return; + } + cudaMemcpyAsync(buf->advantages.data, buf->advantages_host, + (size_t)buf->size * sizeof(float), cudaMemcpyHostToDevice, stream); +} + +static inline int fill_frontier_state_inds(StateBuffer* buf, int* out, int count) { + if (count <= 0 || buf->size <= 0 || buf->max_priority <= 0.0f) { + return 0; + } + + int max_score = (int)floorf(buf->max_priority + 0.5f); + std::vector by_score((size_t)max_score + 1, -1); + for (int i = 0; i < buf->size; i++) { + int score = (int)floorf(buf->scores_host[i] + 0.5f); + if (score > 0 && score <= max_score && by_score[(size_t)score] < 0) { + by_score[(size_t)score] = i; + } + } + + std::vector candidates; + candidates.reserve((size_t)max_score); + for (int score = 1; score <= max_score; score++) { + if (by_score[(size_t)score] >= 0) { + candidates.push_back(by_score[(size_t)score]); + } + } + + if (candidates.empty()) { + int best_idx = 0; + float best_score = buf->scores_host[0]; + for (int i = 1; i < buf->size; i++) { + if (buf->scores_host[i] > best_score) { + best_score = buf->scores_host[i]; + best_idx = i; + } + } + for (int i = 0; i < count; i++) { + out[i] = best_idx; + } + return count; + } + + int frontier_idx = candidates.back(); + int frontier_count = count / 4; + for (int i = 0; i < frontier_count; i++) { + out[i] = frontier_idx; + } + int remaining = count - frontier_count; + for (int i = 0; i < remaining; i++) { + out[frontier_count + i] = candidates[(size_t)i % candidates.size()]; + } + return count; +} + +static inline void update_curriculum_step_states(PuffeRL* pufferl, int buf_idx) { +#if PUFFER_HAS_STATE + if (!pufferl->curriculum_enabled) { + return; + } + + StateBuffer* buf = &pufferl->state_buf; + StaticVec* vec = pufferl->vec; + if (buf->capacity <= 0) { + return; + } + + int env_start = vec->buffer_env_starts[buf_idx]; + int env_count = vec->buffer_env_counts[buf_idx]; + int agents_per_env = buf->agents_per_env; + int first_cl_env = buf->num_fresh_envs; + int last_cl_env = first_cl_env + buf->num_cl_envs; + Env* envs = vec->envs; + + pthread_mutex_lock(&buf->lock); + for (int i = 0; i < env_count; i++) { + Env* env = &envs[env_start + i]; + float prio = curriculum_state_priority(env); + write_curriculum_state(buf, env, prio); + } + + for (int i = 0; i < env_count; i++) { + int env_idx = env_start + i; + if (env_idx < first_cl_env || env_idx >= last_cl_env) { + continue; + } + + int agent_idx = env_idx * agents_per_env; + if (vec->terminals[agent_idx] == 0.0f) { + continue; + } + + int state_idx = buf->env_state_inds_host[env_idx]; + if (state_idx < 0 || state_idx >= buf->size) { + state_idx = best_curriculum_state_slot(buf); + } + if (state_idx < 0) { + continue; + } + + Env* env = &envs[env_idx]; + env->state = buf->states[state_idx]; +#ifdef PUFFER_STATE_REFRESH + PUFFER_STATE_REFRESH(env); +#endif + } + pthread_mutex_unlock(&buf->lock); +#else + (void)pufferl; + (void)buf_idx; +#endif +} + +extern "C" void curriculum_step_callback_wrapper(void* ctx, int buf, int t) { + (void)t; + update_curriculum_step_states((PuffeRL*)ctx, buf); +} + static inline void expand_env_state_inds(StateBuffer* buf) { int num_envs = buf->num_fresh_envs + buf->num_cl_envs; int agents_per_env = buf->agents_per_env; @@ -1565,7 +1920,7 @@ void curriculum_rollout_begin(PuffeRL* pufferl) { int total_epochs = h->total_timesteps / (h->total_agents * h->horizon); float progress = total_epochs > 0 ? (float)pufferl->epoch / (float)total_epochs : 1.0f; progress = fminf(1.0f, fmaxf(0.0f, progress)); - float current_cl_frac = h->cl_frac; + float current_cl_frac = h->anneal_cl ? h->cl_frac * progress : h->cl_frac; int configured_cl = clamp_int((int)(current_cl_frac * (float)total_envs), 0, total_envs); int do_warmup = (buf->size == 0 || buf->size < h->warmup_states); int num_cl_envs = do_warmup ? 0 : configured_cl; @@ -1581,11 +1936,17 @@ void curriculum_rollout_begin(PuffeRL* pufferl) { vec->log_env_limit = (num_cl_envs > 0) ? num_fresh_envs : 0; if (num_cl_envs > 0) { + sync_state_priorities_to_device(buf, stream); + compute_state_prio_abs<<size), BLOCK_SIZE, 0, stream>>>( buf->advantages.data, buf->prio_bufs.prio_probs.data, h->explore_alpha, buf->size); compute_state_prio_normalize<<<1, PRIO_BLOCK_SIZE, 0, stream>>>( buf->prio_bufs.prio_probs.data, buf->size); +#ifdef PUFFER_STATE_SCORE + fill_frontier_state_inds(buf, buf->env_state_inds_host + num_fresh_envs, + num_cl_envs); +#else build_cdf_cuda(buf->prio_bufs.cdf.data, buf->prio_bufs.prio_probs.data, buf->prio_bufs.cdf_block_sums.data, buf->size, stream); int threads = 256; @@ -1598,6 +1959,7 @@ void curriculum_rollout_begin(PuffeRL* pufferl) { cudaMemcpyAsync(buf->env_state_inds_host + num_fresh_envs, buf->prio_bufs.idx.data, num_cl_envs * sizeof(int), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); +#endif load_curriculum_states(vec, buf->states, buf->env_state_inds_host + num_fresh_envs, num_fresh_envs, num_cl_envs); @@ -1613,13 +1975,8 @@ void curriculum_rollout_begin(PuffeRL* pufferl) { } } - for (int i = 0; i < num_fresh_envs; i++) { - buf->env_state_inds_host[i] = (buf->write_pos + i) % buf->capacity; - } if (num_fresh_envs > 0) { - store_curriculum_states(vec, buf->states, buf->env_state_inds_host, 0, num_fresh_envs); - buf->write_pos = (buf->write_pos + num_fresh_envs) % buf->capacity; - buf->size = clamp_int(buf->size + num_fresh_envs, 0, buf->capacity); + store_curriculum_states(vec, buf, buf->env_state_inds_host, 0, num_fresh_envs); } expand_env_state_inds(buf); @@ -1648,6 +2005,22 @@ void curriculum_update_advantages(PuffeRL* pufferl, PrecisionTensor* advantages, buf->advantages.data, buf->state_inds.data, advantages->data, 0, num_fresh_envs, agents_per_env, horizon); } + + int total_envs = num_fresh_envs + num_cl_envs; + if (total_envs > 0 && buf->size > 0) { + cudaMemcpyAsync(buf->advantages_tmp_host, buf->advantages.data, + (size_t)buf->size * sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + for (int env_idx = 0; env_idx < total_envs; env_idx++) { + int agent_start = env_idx * agents_per_env; + int state_idx = buf->state_inds_host[agent_start]; + if (state_idx >= 0 && state_idx < buf->size) { + buf->advantages_host[state_idx] = fmaxf( + buf->advantages_host[state_idx], + fabsf(buf->advantages_tmp_host[state_idx])); + } + } + } } // Experience the puffer advantage! Generalized advantage estimation + V-Trace @@ -1968,6 +2341,15 @@ void train_impl(PuffeRL& pufferl) { cudaEventRecord(pufferl.profile.events[1]); // pre-loop end int total_minibatches = hypers.replay_ratio * batch_size / hypers.minibatch_size; + int completed_minibatches = 0; + float last_kl_sum = 0.0f; + float last_loss_n = 0.0f; + if (hypers.target_kl > 0.0f) { + cudaMemcpy(&last_kl_sum, pufferl.losses_puf.data + LOSS_APPROX_KL, + sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_loss_n, pufferl.losses_puf.data + LOSS_N, + sizeof(float), cudaMemcpyDeviceToHost); + } for (int mb = 0; mb < total_minibatches; ++mb) { cudaEventRecord(pufferl.profile.events[2]); // start of misc (overwritten each iter) puf_zero(&advantages_puf, train_stream); @@ -2024,7 +2406,9 @@ void train_impl(PuffeRL& pufferl) { cudaStream_t stream = train_stream; PrecisionTensor obs_puf = graph.mb_obs; PrecisionTensor state_puf = graph.mb_state; - PrecisionTensor dec_puf = policy_forward_train(&pufferl.policy, pufferl.weights, pufferl.train_activations, obs_puf, state_puf, stream); + PrecisionTensor dec_puf = policy_forward_train(&pufferl.policy, + pufferl.weights, pufferl.train_activations, + obs_puf, state_puf, stream); DecoderWeights* dw_train = (DecoderWeights*)pufferl.weights.decoder; PrecisionTensor p_logstd; if (dw_train->continuous) { @@ -2079,22 +2463,39 @@ void train_impl(PuffeRL& pufferl) { (const char*)graph.mb_newvalue.data, num_idx, row_bytes); } cudaEventRecord(pufferl.profile.events[4]); // end forward + completed_minibatches += 1; + + if (hypers.target_kl > 0.0f && mb + 1 < total_minibatches) { + float kl_sum = 0.0f; + float loss_n = 0.0f; + cudaMemcpy(&kl_sum, pufferl.losses_puf.data + LOSS_APPROX_KL, + sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&loss_n, pufferl.losses_puf.data + LOSS_N, + sizeof(float), cudaMemcpyDeviceToHost); + float denom = loss_n - last_loss_n; + float mb_kl = denom > 0.0f ? (kl_sum - last_kl_sum) / denom : 0.0f; + last_kl_sum = kl_sum; + last_loss_n = loss_n; + if (mb_kl > hypers.target_kl) { + break; + } + } } pufferl.epoch += 1; cudaStreamSynchronize(pufferl.default_stream); - if (total_minibatches > 0) { + if (completed_minibatches > 0) { float ms; // Pre-loop setup (transpose, advantage, allocs) cudaEventElapsedTime(&ms, pufferl.profile.events[0], pufferl.profile.events[1]); pufferl.profile.accum[PROF_TRAIN_MISC] += ms; // In-loop misc (last iteration, representative) scaled by count cudaEventElapsedTime(&ms, pufferl.profile.events[2], pufferl.profile.events[3]); - pufferl.profile.accum[PROF_TRAIN_MISC] += ms * total_minibatches; + pufferl.profile.accum[PROF_TRAIN_MISC] += ms * completed_minibatches; // In-loop forward (last iteration, representative) scaled by count cudaEventElapsedTime(&ms, pufferl.profile.events[3], pufferl.profile.events[4]); - pufferl.profile.accum[PROF_TRAIN_FORWARD] += ms * total_minibatches; + pufferl.profile.accum[PROF_TRAIN_FORWARD] += ms * completed_minibatches; } } @@ -2635,7 +3036,7 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, } create_static_threads(vec, hypers.num_threads, horizon, pufferl.get(), - net_callback_wrapper, thread_init_wrapper); + net_callback_wrapper, post_step_callback_wrapper, thread_init_wrapper); static_vec_reset(vec); if (hypers.profile) { @@ -2657,9 +3058,13 @@ void close_impl(PuffeRL& pufferl) { cudaProfilerStop(); } - cudaGraphExecDestroy(pufferl.train_cudagraph); - for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) { - cudaGraphExecDestroy(pufferl.fused_rollout_cudagraphs[i]); + if (pufferl.train_captured) { + cudaGraphExecDestroy(pufferl.train_cudagraph); + } + if (pufferl.rollout_captured && pufferl.fused_rollout_cudagraphs != NULL) { + for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) { + cudaGraphExecDestroy(pufferl.fused_rollout_cudagraphs[i]); + } } policy_weights_free(&pufferl.policy, &pufferl.weights); diff --git a/src/vecenv.h b/src/vecenv.h index 38fb7308aa..b08fb20e3f 100644 --- a/src/vecenv.h +++ b/src/vecenv.h @@ -124,6 +124,7 @@ typedef struct StaticVec { // Callback types typedef void (*net_callback_fn)(void* ctx, int buf, int t); +typedef void (*step_callback_fn)(void* ctx, int buf, int t); typedef void (*thread_init_fn)(void* ctx, int buf); typedef void (*step_fn)(void* env); @@ -140,7 +141,8 @@ void static_vec_close(StaticVec* vec); void static_vec_log(StaticVec* vec, Dict* out); void static_vec_eval_log(StaticVec* vec, Dict* out); void create_static_threads(StaticVec* vec, int num_threads, int horizon, - void* ctx, net_callback_fn net_callback, thread_init_fn thread_init); + void* ctx, net_callback_fn net_callback, step_callback_fn step_callback, + thread_init_fn thread_init); void static_vec_omp_step(StaticVec* vec); void static_vec_seq_step(StaticVec* vec); void static_vec_render(StaticVec* vec, int env_id); @@ -280,6 +282,7 @@ typedef struct StaticOMPArg { int horizon; void* ctx; net_callback_fn net_callback; + step_callback_fn step_callback; thread_init_fn thread_init; } StaticOMPArg; @@ -292,6 +295,7 @@ static void* static_omp_threadmanager(void* arg) { int horizon = worker_arg->horizon; void* ctx = worker_arg->ctx; net_callback_fn net_callback = worker_arg->net_callback; + step_callback_fn step_callback = worker_arg->step_callback; thread_init_fn thread_init = worker_arg->thread_init; if (thread_init != NULL) { @@ -343,11 +347,6 @@ static void* static_omp_threadmanager(void* arg) { clock_gettime(CLOCK_MONOTONIC, &t1); my_accum[EVAL_ENV_STEP] += (t1.tv_sec - t0.tv_sec) * 1000.0f + (t1.tv_nsec - t0.tv_nsec) / 1e6f; - cudaMemcpyAsync( - vec->gpu_observations.data + agent_start * OBS_SIZE, - vec->observations.data + agent_start * OBS_SIZE, - agents_per_buffer * OBS_SIZE * obs_element_size(), - cudaMemcpyHostToDevice, stream); cudaMemcpyAsync( &vec->gpu_rewards[agent_start], &vec->rewards[agent_start], @@ -358,6 +357,14 @@ static void* static_omp_threadmanager(void* arg) { &vec->terminals[agent_start], agents_per_buffer * sizeof(float), cudaMemcpyHostToDevice, stream); + if (step_callback != NULL) { + step_callback(ctx, buf, t); + } + cudaMemcpyAsync( + vec->gpu_observations.data + agent_start * OBS_SIZE, + vec->observations.data + agent_start * OBS_SIZE, + agents_per_buffer * OBS_SIZE * obs_element_size(), + cudaMemcpyHostToDevice, stream); #ifdef MY_ACTION_MASK cudaMemcpyAsync( vec->gpu_action_mask + agent_start * MY_ACTION_MASK, @@ -638,7 +645,8 @@ void static_vec_reset(StaticVec* vec) { } void create_static_threads(StaticVec* vec, int num_threads, int horizon, - void* ctx, net_callback_fn net_callback, thread_init_fn thread_init) { + void* ctx, net_callback_fn net_callback, step_callback_fn step_callback, + thread_init_fn thread_init) { vec->threading = (StaticThreading*)calloc(1, sizeof(StaticThreading)); vec->threading->num_threads = num_threads; vec->threading->num_buffers = vec->buffers; @@ -656,6 +664,7 @@ void create_static_threads(StaticVec* vec, int num_threads, int horizon, args[i].horizon = horizon; args[i].ctx = ctx; args[i].net_callback = net_callback; + args[i].step_callback = step_callback; args[i].thread_init = thread_init; pthread_create(&vec->threading->threads[i], NULL, static_omp_threadmanager, &args[i]); } diff --git a/tests/profile_kernels.cu b/tests/profile_kernels.cu index a2af169838..a054a03be0 100644 --- a/tests/profile_kernels.cu +++ b/tests/profile_kernels.cu @@ -602,7 +602,8 @@ void profile_im2col(int B, int IC, int IH, int IW, int K, int S, int OH, int OW) } typedef struct { - PrecisionTensor state_advantages, rollout_advantages, importance; + FloatTensor state_advantages; + PrecisionTensor rollout_advantages, importance; FloatTensor prio_probs, cdf, cdf_block_sums; IntTensor sample_idx, state_inds; LongTensor rng_offset; @@ -622,7 +623,7 @@ typedef struct { } CurriculumProfile; __global__ void init_curriculum_profile_kernel( - precision_t* state_advantages, + float* state_advantages, precision_t* rollout_advantages, int* state_inds, int64_t* rng_offset, @@ -631,7 +632,7 @@ __global__ void init_curriculum_profile_kernel( int n = capacity > total_agents * horizon ? capacity : total_agents * horizon; if (idx < capacity) { float v = 0.25f + (float)(idx % 257) * 0.001f; - state_advantages[idx] = from_float(v); + state_advantages[idx] = v; } if (idx < total_agents) { state_inds[idx] = idx % capacity; @@ -846,7 +847,7 @@ EnvSpeedArgs* create_envspeed(int total_agents, int num_buffers, int num_threads cudaStreamCreateWithFlags(&vec->streams[i], cudaStreamNonBlocking); printf("Created %d envs (%s) for %d total_agents\n", vec->size, TOSTRING(ENV_NAME), total_agents); - create_static_threads(vec, num_threads, horizon, nullptr, empty_net_callback, empty_thread_init); + create_static_threads(vec, num_threads, horizon, nullptr, empty_net_callback, nullptr, empty_thread_init); static_vec_reset(vec); cudaDeviceSynchronize();