Single compilation unit, maybe 3% perf penalty

jsuarez5341 · jsuarez5341 · commit 514b7cc0abe6 · 2026-05-18T22:48:49.000Z
diff --git a/build.sh b/build.sh
@@ -57,7 +57,7 @@ if [ "$PLATFORM" = "Linux" ]; then
     OMP_LIB=-lomp5
     SANITIZE_FLAGS=(-fsanitize=address,undefined,bounds,pointer-overflow,leak -fno-omit-frame-pointer)
     STANDALONE_LDFLAGS=(-lGL)
-    SHARED_LDFLAGS=(-Bsymbolic-functions)
+    SHARED_LDFLAGS=(-Bsymbolic-functions -Wl,--gc-sections)
 else
     RAYLIB_NAME='raylib-5.5_macos'
     OMP_LIB=-lomp
@@ -131,8 +131,8 @@ if [ -n "$DEBUG" ] || [ "$MODE" = "local" ]; then
     LINK_OPT="-g"
 else
     CLANG_OPT=(-O2 -DNDEBUG "${CLANG_WARN[@]}")
-    NVCC_OPT="-O2 --threads 0"
-    LINK_OPT="-O2"
+    NVCC_OPT="-O3 --threads 0"
+    LINK_OPT="-O3"
 fi
 if [ "$MODE" = "local" ] || [ "$MODE" = "fast" ]; then
     FLAGS=(
@@ -238,42 +238,39 @@ if [ ! -f "$BINDING_SRC" ]; then
     exit 1
 fi
 
-echo "Compiling static library for $ENV..."
-${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \
-    -I. -Isrc -I$SRC_DIR -Ivendor \
-    -I./$RAYLIB_NAME/include -I$CUDA_HOME/include \
-    -DPLATFORM_DESKTOP \
-    -fno-semantic-interposition -fvisibility=hidden \
-    -fPIC -fopenmp \
-    "$BINDING_SRC" -o "$STATIC_OBJ"
-ar rcs "$STATIC_LIB" "$STATIC_OBJ"
-
-# Brittle hack: have to extract the tensor type from the static lib to build trainer
-OBS_TENSOR_T=$(awk '/^#define OBS_TENSOR_T/{print $3}' "$BINDING_SRC")
-if [ -z "$OBS_TENSOR_T" ]; then
-    echo "Error: Could not find OBS_TENSOR_T in $BINDING_SRC"
-    exit 1
+if [ "$MODE" = "cpu" ]; then
+    echo "Compiling static library for $ENV..."
+    ${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \
+        -I. -Isrc -I$SRC_DIR -Ivendor \
+        -I./$RAYLIB_NAME/include -I$CUDA_HOME/include \
+        -DPLATFORM_DESKTOP \
+        -fno-semantic-interposition -fvisibility=hidden \
+        -fPIC -fopenmp \
+        "$BINDING_SRC" -o "$STATIC_OBJ"
+    ar rcs "$STATIC_LIB" "$STATIC_OBJ"
 fi
 
 if [ -z "$MODE" ]; then
-    echo "Compiling CUDA ($ARCH) training backend..."
+    echo "Compiling CUDA ($ARCH) training backend with $ENV binding..."
     $NVCC -c -arch=$ARCH -Xcompiler -fPIC \
         -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \
         -Xcompiler=-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION \
         -Xcompiler=-DPLATFORM_DESKTOP \
         -std=c++17 \
-        -I. -Isrc \
+        -I. -Isrc -I$SRC_DIR -Ivendor \
         -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE -I$NUMPY_INCLUDE \
         -I$CUDA_HOME/include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME/include \
         -Xcompiler=-fopenmp \
-        -DOBS_TENSOR_T=$OBS_TENSOR_T \
+        -Xcompiler=-ffunction-sections \
+        -Xcompiler=-fdata-sections \
+        -DENV_BINDING_SRC=\"$BINDING_SRC\" \
         -DENV_NAME=$ENV \
         $PRECISION $NVCC_OPT \
         src/bindings.cu -o build/bindings.o
 
     LINK_CMD=(
         ${CXX:-g++} -shared -fPIC -fopenmp
-        build/bindings.o "$STATIC_LIB" "$RAYLIB_A"
+        build/bindings.o "$RAYLIB_A"
         -L$CUDA_HOME/lib64 $CUDNN_LFLAG $NCCL_LFLAG
         "${WHEEL_RPATH_FLAGS[@]}"
         -lcudart -lnccl -lnvidia-ml -lcublas -lcusolver -lcurand -lcudnn
@@ -292,7 +289,6 @@ elif [ "$MODE" = "cpu" ]; then
         -std=c++17 \
         -I. -Isrc \
         -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE \
-        -DOBS_TENSOR_T=$OBS_TENSOR_T \
         -DENV_NAME=$ENV \
         $PRECISION $LINK_OPT \
         src/bindings_cpu.cpp -o build/bindings_cpu.o
@@ -311,13 +307,13 @@ elif [ "$MODE" = "profile" ]; then
     $NVCC $NVCC_OPT -arch=$ARCH -std=c++17 \
         -I. -Isrc -I$SRC_DIR -Ivendor \
         -I$CUDA_HOME/include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME/include \
-        -DOBS_TENSOR_T=$OBS_TENSOR_T \
         -DENV_NAME=$ENV \
+        -DENV_BINDING_SRC=\"$BINDING_SRC\" \
         -Xcompiler=-DPLATFORM_DESKTOP \
         $PRECISION \
         -Xcompiler=-fopenmp \
         tests/profile_kernels.cu vendor/ini.c \
-        "$STATIC_LIB" "$RAYLIB_A" \
+        "$RAYLIB_A" \
         -lnccl -lnvidia-ml -lcublas -lcurand -lcudnn \
         -lGL -lm -lpthread $OMP_LIB \
         -o profile
diff --git a/config/breakout.ini b/config/breakout.ini
@@ -49,6 +49,12 @@ vf_coef = 1.2195502588297364
 vtrace_c_clip = 1.0830442742115065
 vtrace_rho_clip = 2.1017317041552603
 
+state_buffer_size = 0
+cl_frac = 0
+warmup_states = 0
+explore_alpha = 0.0
+explore_beta = 0.0
+
 #total_timesteps = 50_000_000
 #learning_rate = 0.045759
 #beta1 = 0.9542662897340632
diff --git a/config/default.ini b/config/default.ini
@@ -1,4 +1,4 @@
-gbase]
+[base]
 env_name = None
 
 # Multi-GPU (single GPU defaults)
diff --git a/ocean/breakout/binding.c b/ocean/breakout/binding.c
@@ -3,8 +3,7 @@
 #define NUM_ATNS 1
 #define ACT_SIZES {3}
 #define OBS_TENSOR_T FloatTensor
-#define PUFFER_STATE_T State
-#define PUFFER_STATE_SIZE ((int)sizeof(State))
+#define PUFFER_HAS_STATE 1
 #define PUFFER_STATE_REFRESH(env) compute_observations(env)
 
 #define Env Breakout
diff --git a/ocean/breakout/breakout.h b/ocean/breakout/breakout.h
@@ -520,6 +520,7 @@ void c_step(Breakout* env) {
     compute_observations(env);
 }
 
+#ifndef PUFFER_PERF_NO_RENDER
 Color BRICK_COLORS[6] = {RED, ORANGE, YELLOW, GREEN, SKYBLUE, BLUE};
 
 Client* make_client(Breakout* env) {
@@ -601,3 +602,4 @@ void c_render(Breakout* env) {
 
     //PlaySound(client->sound);
 }
+#endif
diff --git a/ocean/incremental_maze/binding.c b/ocean/incremental_maze/binding.c
@@ -3,8 +3,7 @@
 #define NUM_ATNS 1
 #define ACT_SIZES {5}
 #define OBS_TENSOR_T ByteTensor
-#define PUFFER_STATE_T State
-#define PUFFER_STATE_SIZE ((int)sizeof(State))
+#define PUFFER_HAS_STATE 1
 #define PUFFER_STATE_REFRESH(env) compute_observations(env)
 
 #define MY_VEC_INIT
@@ -22,7 +21,7 @@ Env* my_vec_init(int* num_envs_out, int* buffer_env_starts, int* buffer_env_coun
     int num_levels = INCREMENTAL_NUM_LEVELS;
 
     // Generate maze levels (shared across all envs)
-    State* levels = calloc(num_levels, sizeof(State));
+    State* levels = (State*)calloc(num_levels, sizeof(State));
 
     for (int i = 0; i < num_levels; i++) {
         int sz = INCREMENTAL_MIN_SIZE + 2*i;
diff --git a/ocean/maze/binding.c b/ocean/maze/binding.c
@@ -3,8 +3,7 @@
 #define NUM_ATNS 1
 #define ACT_SIZES {5}
 #define OBS_TENSOR_T ByteTensor
-#define PUFFER_STATE_T State
-#define PUFFER_STATE_SIZE ((int)sizeof(State))
+#define PUFFER_HAS_STATE 1
 #define PUFFER_STATE_REFRESH(env) compute_observations(env)
 
 #define MY_VEC_INIT
@@ -29,7 +28,7 @@ Env* my_vec_init(int* num_envs_out, int* buffer_env_starts, int* buffer_env_coun
     }
 
     // Generate maze levels (shared across all envs)
-    State* levels = calloc(num_maps, sizeof(State));
+    State* levels = (State*)calloc(num_maps, sizeof(State));
 
     unsigned int map_rng = 42;
     for (int i = 0; i < num_maps; i++) {
diff --git a/src/bindings.cu b/src/bindings.cu
@@ -1,5 +1,9 @@
 // bindings.cpp - Python bindings for pufferlib (torch-free)
 
+#ifdef ENV_BINDING_SRC
+#include ENV_BINDING_SRC
+#endif
+
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/numpy.h>
@@ -620,11 +624,11 @@ PYBIND11_MODULE(_C, m) {
         .def_readonly("obs_elem_size", &VecEnv::obs_elem_size)
         .def_readonly("gpu",           &VecEnv::gpu)
         // GPU buffer pointers — wrap with torch.from_blob(..., device='cuda')
-        .def_property_readonly("gpu_obs_ptr",       [](VecEnv& ve) { return (long long)ve.vec->gpu_observations; })
+        .def_property_readonly("gpu_obs_ptr",       [](VecEnv& ve) { return (long long)ve.vec->gpu_observations.data; })
         .def_property_readonly("gpu_rewards_ptr",   [](VecEnv& ve) { return (long long)ve.vec->gpu_rewards; })
         .def_property_readonly("gpu_terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_terminals; })
         // CPU buffer pointers (same as gpu_ in CPU mode since they alias)
-        .def_property_readonly("obs_ptr",       [](VecEnv& ve) { return (long long)ve.vec->observations; })
+        .def_property_readonly("obs_ptr",       [](VecEnv& ve) { return (long long)ve.vec->observations.data; })
         .def_property_readonly("rewards_ptr",   [](VecEnv& ve) { return (long long)ve.vec->rewards; })
         .def_property_readonly("terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->terminals; })
         .def("reset", &vec_reset)
diff --git a/src/bindings_cpu.cpp b/src/bindings_cpu.cpp
@@ -176,7 +176,7 @@ PYBIND11_MODULE(_C, m) {
         .def_readonly("obs_dtype", &VecEnv::obs_dtype)
         .def_readonly("obs_elem_size", &VecEnv::obs_elem_size)
         .def_property_readonly("gpu", [](VecEnv&) { return 0; })
-        .def_property_readonly("obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->observations; })
+        .def_property_readonly("obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->observations.data; })
         .def_property_readonly("rewards_ptr", [](VecEnv& ve) { return (long long)ve.vec->rewards; })
         .def_property_readonly("terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->terminals; })
         .def("reset", &vec_reset)
diff --git a/src/kernels.cu b/src/kernels.cu
@@ -11,9 +11,9 @@
 #include <stdlib.h>
 
 #include <cuda_bf16.h>
+#include "precision.h"
 
 #ifdef PRECISION_FLOAT
-typedef float precision_t;
 constexpr bool USE_BF16 = false;
 constexpr int PRECISION_SIZE = 4;
 static constexpr cudaDataType_t CUBLAS_PRECISION = CUDA_R_32F;
@@ -22,7 +22,6 @@ static constexpr cublasComputeType_t CUBLAS_COMPUTE_PRECISION = CUBLAS_COMPUTE_3
 #define to_float(x) (x)
 #define from_float(x) (x)
 #else
-typedef __nv_bfloat16 precision_t;
 constexpr bool USE_BF16 = true;
 constexpr int PRECISION_SIZE = 2;
 static constexpr cudaDataType_t CUBLAS_PRECISION = CUDA_R_16BF;
diff --git a/src/pufferlib.cu b/src/pufferlib.cu
@@ -206,8 +206,7 @@ struct PrioBuffers {
 };
 
 struct StateBuffer {
-    void* states;              // CPU state_buffer_size * state_size bytes
-    int state_size;
+    PufferState* states;       // CPU state_buffer_size entries
     int capacity;
     int size;
     int write_pos;
@@ -272,10 +271,7 @@ struct EnvBuf {
 StaticVec* create_environments(int num_buffers, int total_agents,
         const std::string& env_name, Dict* vec_kwargs, Dict* env_kwargs, EnvBuf& env) {
     StaticVec* vec = create_static_vec(total_agents, num_buffers, 1, vec_kwargs, env_kwargs);
-    env.obs = {
-        .data = (decltype(env.obs.data))vec->gpu_observations,
-        .shape = {total_agents, get_obs_size()},
-    };
+    env.obs = vec->gpu_observations;
     env.actions = { .data = (float*)vec->gpu_actions, .shape = {total_agents, get_num_atns()} };
     env.rewards = { .data = (float*)vec->gpu_rewards, .shape = {total_agents} };
     env.terminals = { .data = (float*)vec->gpu_terminals, .shape = {total_agents} };
@@ -1384,21 +1380,20 @@ static inline int clamp_int(int v, int lo, int hi) {
 
 int init_state_buffer(PuffeRL* pufferl) {
     StateBuffer* buf = &pufferl->state_buf;
-    buf->state_size = get_state_size();
     size_t capacity = (size_t)buf->capacity;
-    size_t state_size = (size_t)buf->state_size;
-    if (state_size == 0 || capacity > ((size_t)-1) / state_size) {
+    size_t state_size = sizeof(PufferState);
+    if (!PUFFER_HAS_STATE || state_size == 0 || capacity > ((size_t)-1) / state_size) {
         fprintf(stderr, "Failed to allocate curriculum state buffer: invalid size\n");
         return 0;
     }
 
     size_t state_bytes = capacity * state_size;
-    buf->states = malloc(state_bytes);
+    buf->states = (PufferState*)malloc(state_bytes);
     buf->state_inds_host = (int*)malloc((size_t)pufferl->hypers.total_agents * sizeof(int));
     if (buf->states == NULL || buf->state_inds_host == NULL) {
         fprintf(stderr,
             "Failed to allocate curriculum state buffer: capacity=%d state_size=%d bytes=%zu\n",
-            buf->capacity, buf->state_size, state_bytes);
+            buf->capacity, (int)state_size, state_bytes);
         free(buf->states);
         free(buf->state_inds_host);
         buf->states = NULL;
@@ -1416,6 +1411,44 @@ void close_state_buffer(PuffeRL* pufferl) {
     buf->state_inds_host = NULL;
 }
 
+static inline void store_curriculum_states(StaticVec* vec, PufferState* states,
+        const int* state_inds, int env_start, int env_count) {
+#if PUFFER_HAS_STATE
+    Env* envs = vec->envs;
+    for (int i = 0; i < env_count; i++) {
+        states[state_inds[i]] = envs[env_start + i].state;
+    }
+#else
+    (void)vec;
+    (void)states;
+    (void)state_inds;
+    (void)env_start;
+    (void)env_count;
+    assert(0 && "state curriculum requires PUFFER_HAS_STATE");
+#endif
+}
+
+static inline void load_curriculum_states(StaticVec* vec, const PufferState* states,
+        const int* state_inds, int env_start, int env_count) {
+#if PUFFER_HAS_STATE
+    Env* envs = vec->envs;
+    for (int i = 0; i < env_count; i++) {
+        Env* env = &envs[env_start + i];
+        env->state = states[state_inds[i]];
+#ifdef PUFFER_STATE_REFRESH
+        PUFFER_STATE_REFRESH(env);
+#endif
+    }
+#else
+    (void)vec;
+    (void)states;
+    (void)state_inds;
+    (void)env_start;
+    (void)env_count;
+    assert(0 && "state curriculum requires PUFFER_HAS_STATE");
+#endif
+}
+
 void curriculum_rollout_begin(PuffeRL* pufferl) {
     HypersT* h = &pufferl->hypers;
     StateBuffer* buf = &pufferl->state_buf;
@@ -1454,10 +1487,10 @@ void curriculum_rollout_begin(PuffeRL* pufferl) {
             num_cl * sizeof(int), cudaMemcpyDeviceToHost, stream);
         cudaStreamSynchronize(stream);
 
-        static_vec_load_states(vec, buf->states, buf->state_inds_host + num_fresh,
+        load_curriculum_states(vec, buf->states, buf->state_inds_host + num_fresh,
             num_fresh, num_cl);
         if (vec->gpu) {
-            cudaMemcpy(vec->gpu_observations, vec->observations,
+            cudaMemcpy(vec->gpu_observations.data, vec->observations.data,
                 (size_t)vec->total_agents * get_obs_size() * get_obs_elem_size(),
                 cudaMemcpyHostToDevice);
             if (vec->action_mask_size > 0) {
@@ -1472,7 +1505,7 @@ void curriculum_rollout_begin(PuffeRL* pufferl) {
         buf->state_inds_host[i] = (buf->write_pos + i) % buf->capacity;
     }
     if (num_fresh > 0) {
-        static_vec_store_states(vec, buf->states, buf->state_inds_host, 0, num_fresh);
+        store_curriculum_states(vec, buf->states, buf->state_inds_host, 0, num_fresh);
         buf->write_pos = (buf->write_pos + num_fresh) % buf->capacity;
         buf->size = clamp_int(buf->size + num_fresh, 0, buf->capacity);
     }
@@ -2216,8 +2249,8 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
         (int)(hypers.cl_frac * (float)hypers.total_agents), 0, hypers.total_agents);
     pufferl->curriculum_enabled = hypers.state_buffer_size > 0 && initial_num_cl_envs > 0;
     if (pufferl->curriculum_enabled) {
-        assert(static_vec_has_state(vec) && "state_buffer_size > 0 requires PUFFER_STATE_T env support");
-        assert(get_state_size() > 0 && "state_buffer_size > 0 requires nonzero env state size");
+        assert(PUFFER_HAS_STATE && "state_buffer_size > 0 requires env State support");
+        assert(sizeof(PufferState) > 0 && "state_buffer_size > 0 requires nonzero env state size");
         assert(hypers.warmup_states >= 0 && "warmup_states must be nonnegative");
         assert(hypers.warmup_states <= hypers.state_buffer_size
             && "warmup_states must be <= state_buffer_size");
diff --git a/src/tensor.h b/src/tensor.h
@@ -26,6 +26,8 @@ typedef struct {
 } IntTensor;
 
 #ifdef __CUDACC__
+#include "precision.h"
+
 typedef struct {
     precision_t* data;
     int64_t shape[PUF_MAX_DIMS];
diff --git a/src/vecenv.h b/src/vecenv.h

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-gbase]`
	`1`	`+[base]`
`2`	`2`	`env_name = None`
`3`	`3`
`4`	`4`	`# Multi-GPU (single GPU defaults)`
Original file line number	Diff line number	Diff line change
`@@ -520,6 +520,7 @@ void c_step(Breakout* env) {`
`520`	`520`	`compute_observations(env);`
`521`	`521`	`}`
`522`	`522`
	`523`	`+#ifndef PUFFER_PERF_NO_RENDER`
`523`	`524`	`Color BRICK_COLORS[6] = {RED, ORANGE, YELLOW, GREEN, SKYBLUE, BLUE};`
`524`	`525`
`525`	`526`	`Client* make_client(Breakout* env) {`
`@@ -601,3 +602,4 @@ void c_render(Breakout* env) {`
`601`	`602`
`602`	`603`	`//PlaySound(client->sound);`
`603`	`604`	`}`
	`605`	`+#endif`