Skip to content

Commit 514b7cc

Browse files
committed
Single compilation unit, maybe 3% perf penalty
1 parent 65bc394 commit 514b7cc

13 files changed

Lines changed: 175 additions & 154 deletions

File tree

build.sh

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ if [ "$PLATFORM" = "Linux" ]; then
5757
OMP_LIB=-lomp5
5858
SANITIZE_FLAGS=(-fsanitize=address,undefined,bounds,pointer-overflow,leak -fno-omit-frame-pointer)
5959
STANDALONE_LDFLAGS=(-lGL)
60-
SHARED_LDFLAGS=(-Bsymbolic-functions)
60+
SHARED_LDFLAGS=(-Bsymbolic-functions -Wl,--gc-sections)
6161
else
6262
RAYLIB_NAME='raylib-5.5_macos'
6363
OMP_LIB=-lomp
@@ -131,8 +131,8 @@ if [ -n "$DEBUG" ] || [ "$MODE" = "local" ]; then
131131
LINK_OPT="-g"
132132
else
133133
CLANG_OPT=(-O2 -DNDEBUG "${CLANG_WARN[@]}")
134-
NVCC_OPT="-O2 --threads 0"
135-
LINK_OPT="-O2"
134+
NVCC_OPT="-O3 --threads 0"
135+
LINK_OPT="-O3"
136136
fi
137137
if [ "$MODE" = "local" ] || [ "$MODE" = "fast" ]; then
138138
FLAGS=(
@@ -238,42 +238,39 @@ if [ ! -f "$BINDING_SRC" ]; then
238238
exit 1
239239
fi
240240

241-
echo "Compiling static library for $ENV..."
242-
${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \
243-
-I. -Isrc -I$SRC_DIR -Ivendor \
244-
-I./$RAYLIB_NAME/include -I$CUDA_HOME/include \
245-
-DPLATFORM_DESKTOP \
246-
-fno-semantic-interposition -fvisibility=hidden \
247-
-fPIC -fopenmp \
248-
"$BINDING_SRC" -o "$STATIC_OBJ"
249-
ar rcs "$STATIC_LIB" "$STATIC_OBJ"
250-
251-
# Brittle hack: have to extract the tensor type from the static lib to build trainer
252-
OBS_TENSOR_T=$(awk '/^#define OBS_TENSOR_T/{print $3}' "$BINDING_SRC")
253-
if [ -z "$OBS_TENSOR_T" ]; then
254-
echo "Error: Could not find OBS_TENSOR_T in $BINDING_SRC"
255-
exit 1
241+
if [ "$MODE" = "cpu" ]; then
242+
echo "Compiling static library for $ENV..."
243+
${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \
244+
-I. -Isrc -I$SRC_DIR -Ivendor \
245+
-I./$RAYLIB_NAME/include -I$CUDA_HOME/include \
246+
-DPLATFORM_DESKTOP \
247+
-fno-semantic-interposition -fvisibility=hidden \
248+
-fPIC -fopenmp \
249+
"$BINDING_SRC" -o "$STATIC_OBJ"
250+
ar rcs "$STATIC_LIB" "$STATIC_OBJ"
256251
fi
257252

258253
if [ -z "$MODE" ]; then
259-
echo "Compiling CUDA ($ARCH) training backend..."
254+
echo "Compiling CUDA ($ARCH) training backend with $ENV binding..."
260255
$NVCC -c -arch=$ARCH -Xcompiler -fPIC \
261256
-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \
262257
-Xcompiler=-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION \
263258
-Xcompiler=-DPLATFORM_DESKTOP \
264259
-std=c++17 \
265-
-I. -Isrc \
260+
-I. -Isrc -I$SRC_DIR -Ivendor \
266261
-I$PYTHON_INCLUDE -I$PYBIND_INCLUDE -I$NUMPY_INCLUDE \
267262
-I$CUDA_HOME/include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME/include \
268263
-Xcompiler=-fopenmp \
269-
-DOBS_TENSOR_T=$OBS_TENSOR_T \
264+
-Xcompiler=-ffunction-sections \
265+
-Xcompiler=-fdata-sections \
266+
-DENV_BINDING_SRC=\"$BINDING_SRC\" \
270267
-DENV_NAME=$ENV \
271268
$PRECISION $NVCC_OPT \
272269
src/bindings.cu -o build/bindings.o
273270

274271
LINK_CMD=(
275272
${CXX:-g++} -shared -fPIC -fopenmp
276-
build/bindings.o "$STATIC_LIB" "$RAYLIB_A"
273+
build/bindings.o "$RAYLIB_A"
277274
-L$CUDA_HOME/lib64 $CUDNN_LFLAG $NCCL_LFLAG
278275
"${WHEEL_RPATH_FLAGS[@]}"
279276
-lcudart -lnccl -lnvidia-ml -lcublas -lcusolver -lcurand -lcudnn
@@ -292,7 +289,6 @@ elif [ "$MODE" = "cpu" ]; then
292289
-std=c++17 \
293290
-I. -Isrc \
294291
-I$PYTHON_INCLUDE -I$PYBIND_INCLUDE \
295-
-DOBS_TENSOR_T=$OBS_TENSOR_T \
296292
-DENV_NAME=$ENV \
297293
$PRECISION $LINK_OPT \
298294
src/bindings_cpu.cpp -o build/bindings_cpu.o
@@ -311,13 +307,13 @@ elif [ "$MODE" = "profile" ]; then
311307
$NVCC $NVCC_OPT -arch=$ARCH -std=c++17 \
312308
-I. -Isrc -I$SRC_DIR -Ivendor \
313309
-I$CUDA_HOME/include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME/include \
314-
-DOBS_TENSOR_T=$OBS_TENSOR_T \
315310
-DENV_NAME=$ENV \
311+
-DENV_BINDING_SRC=\"$BINDING_SRC\" \
316312
-Xcompiler=-DPLATFORM_DESKTOP \
317313
$PRECISION \
318314
-Xcompiler=-fopenmp \
319315
tests/profile_kernels.cu vendor/ini.c \
320-
"$STATIC_LIB" "$RAYLIB_A" \
316+
"$RAYLIB_A" \
321317
-lnccl -lnvidia-ml -lcublas -lcurand -lcudnn \
322318
-lGL -lm -lpthread $OMP_LIB \
323319
-o profile

config/breakout.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ vf_coef = 1.2195502588297364
4949
vtrace_c_clip = 1.0830442742115065
5050
vtrace_rho_clip = 2.1017317041552603
5151

52+
state_buffer_size = 0
53+
cl_frac = 0
54+
warmup_states = 0
55+
explore_alpha = 0.0
56+
explore_beta = 0.0
57+
5258
#total_timesteps = 50_000_000
5359
#learning_rate = 0.045759
5460
#beta1 = 0.9542662897340632

config/default.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
gbase]
1+
[base]
22
env_name = None
33

44
# Multi-GPU (single GPU defaults)

ocean/breakout/binding.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
#define NUM_ATNS 1
44
#define ACT_SIZES {3}
55
#define OBS_TENSOR_T FloatTensor
6-
#define PUFFER_STATE_T State
7-
#define PUFFER_STATE_SIZE ((int)sizeof(State))
6+
#define PUFFER_HAS_STATE 1
87
#define PUFFER_STATE_REFRESH(env) compute_observations(env)
98

109
#define Env Breakout

ocean/breakout/breakout.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,7 @@ void c_step(Breakout* env) {
520520
compute_observations(env);
521521
}
522522

523+
#ifndef PUFFER_PERF_NO_RENDER
523524
Color BRICK_COLORS[6] = {RED, ORANGE, YELLOW, GREEN, SKYBLUE, BLUE};
524525

525526
Client* make_client(Breakout* env) {
@@ -601,3 +602,4 @@ void c_render(Breakout* env) {
601602

602603
//PlaySound(client->sound);
603604
}
605+
#endif

ocean/incremental_maze/binding.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
#define NUM_ATNS 1
44
#define ACT_SIZES {5}
55
#define OBS_TENSOR_T ByteTensor
6-
#define PUFFER_STATE_T State
7-
#define PUFFER_STATE_SIZE ((int)sizeof(State))
6+
#define PUFFER_HAS_STATE 1
87
#define PUFFER_STATE_REFRESH(env) compute_observations(env)
98

109
#define MY_VEC_INIT
@@ -22,7 +21,7 @@ Env* my_vec_init(int* num_envs_out, int* buffer_env_starts, int* buffer_env_coun
2221
int num_levels = INCREMENTAL_NUM_LEVELS;
2322

2423
// Generate maze levels (shared across all envs)
25-
State* levels = calloc(num_levels, sizeof(State));
24+
State* levels = (State*)calloc(num_levels, sizeof(State));
2625

2726
for (int i = 0; i < num_levels; i++) {
2827
int sz = INCREMENTAL_MIN_SIZE + 2*i;

ocean/maze/binding.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
#define NUM_ATNS 1
44
#define ACT_SIZES {5}
55
#define OBS_TENSOR_T ByteTensor
6-
#define PUFFER_STATE_T State
7-
#define PUFFER_STATE_SIZE ((int)sizeof(State))
6+
#define PUFFER_HAS_STATE 1
87
#define PUFFER_STATE_REFRESH(env) compute_observations(env)
98

109
#define MY_VEC_INIT
@@ -29,7 +28,7 @@ Env* my_vec_init(int* num_envs_out, int* buffer_env_starts, int* buffer_env_coun
2928
}
3029

3130
// Generate maze levels (shared across all envs)
32-
State* levels = calloc(num_maps, sizeof(State));
31+
State* levels = (State*)calloc(num_maps, sizeof(State));
3332

3433
unsigned int map_rng = 42;
3534
for (int i = 0; i < num_maps; i++) {

src/bindings.cu

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
// bindings.cu - Python bindings for pufferlib (torch-free)
22

3+
#ifdef ENV_BINDING_SRC
4+
#include ENV_BINDING_SRC
5+
#endif
6+
37
#include <pybind11/pybind11.h>
48
#include <pybind11/stl.h>
59
#include <pybind11/numpy.h>
@@ -620,11 +624,11 @@ PYBIND11_MODULE(_C, m) {
620624
.def_readonly("obs_elem_size", &VecEnv::obs_elem_size)
621625
.def_readonly("gpu", &VecEnv::gpu)
622626
// GPU buffer pointers — wrap with torch.from_blob(..., device='cuda')
623-
.def_property_readonly("gpu_obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_observations; })
627+
.def_property_readonly("gpu_obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_observations.data; })
624628
.def_property_readonly("gpu_rewards_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_rewards; })
625629
.def_property_readonly("gpu_terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_terminals; })
626630
// CPU buffer pointers (same as gpu_ in CPU mode since they alias)
627-
.def_property_readonly("obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->observations; })
631+
.def_property_readonly("obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->observations.data; })
628632
.def_property_readonly("rewards_ptr", [](VecEnv& ve) { return (long long)ve.vec->rewards; })
629633
.def_property_readonly("terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->terminals; })
630634
.def("reset", &vec_reset)

src/bindings_cpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ PYBIND11_MODULE(_C, m) {
176176
.def_readonly("obs_dtype", &VecEnv::obs_dtype)
177177
.def_readonly("obs_elem_size", &VecEnv::obs_elem_size)
178178
.def_property_readonly("gpu", [](VecEnv&) { return 0; })
179-
.def_property_readonly("obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->observations; })
179+
.def_property_readonly("obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->observations.data; })
180180
.def_property_readonly("rewards_ptr", [](VecEnv& ve) { return (long long)ve.vec->rewards; })
181181
.def_property_readonly("terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->terminals; })
182182
.def("reset", &vec_reset)

src/kernels.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
#include <stdlib.h>
1212

1313
#include <cuda_bf16.h>
14+
#include "precision.h"
1415

1516
#ifdef PRECISION_FLOAT
16-
typedef float precision_t;
1717
constexpr bool USE_BF16 = false;
1818
constexpr int PRECISION_SIZE = 4;
1919
static constexpr cudaDataType_t CUBLAS_PRECISION = CUDA_R_32F;
@@ -22,7 +22,6 @@ static constexpr cublasComputeType_t CUBLAS_COMPUTE_PRECISION = CUBLAS_COMPUTE_3
2222
#define to_float(x) (x)
2323
#define from_float(x) (x)
2424
#else
25-
typedef __nv_bfloat16 precision_t;
2625
constexpr bool USE_BF16 = true;
2726
constexpr int PRECISION_SIZE = 2;
2827
static constexpr cudaDataType_t CUBLAS_PRECISION = CUDA_R_16BF;

0 commit comments

Comments
 (0)