@@ -57,7 +57,7 @@ if [ "$PLATFORM" = "Linux" ]; then
5757 OMP_LIB=-lomp5
5858 SANITIZE_FLAGS=(-fsanitize=address,undefined,bounds,pointer-overflow,leak -fno-omit-frame-pointer)
5959 STANDALONE_LDFLAGS=(-lGL)
60- SHARED_LDFLAGS=(-Bsymbolic-functions)
60+ SHARED_LDFLAGS=(-Bsymbolic-functions -Wl,--gc-sections )
6161else
6262 RAYLIB_NAME=' raylib-5.5_macos'
6363 OMP_LIB=-lomp
@@ -131,8 +131,8 @@ if [ -n "$DEBUG" ] || [ "$MODE" = "local" ]; then
131131 LINK_OPT=" -g"
132132else
133133 CLANG_OPT=(-O2 -DNDEBUG " ${CLANG_WARN[@]} " )
134- NVCC_OPT=" -O2 --threads 0"
135- LINK_OPT=" -O2 "
134+ NVCC_OPT=" -O3 --threads 0"
135+ LINK_OPT=" -O3 "
136136fi
137137if [ " $MODE " = " local" ] || [ " $MODE " = " fast" ]; then
138138 FLAGS=(
@@ -238,42 +238,39 @@ if [ ! -f "$BINDING_SRC" ]; then
238238 exit 1
239239fi
240240
241- echo " Compiling static library for $ENV ..."
242- ${CC:- clang} -c " ${CLANG_OPT[@]} " $EXTRA_CFLAGS \
243- -I. -Isrc -I$SRC_DIR -Ivendor \
244- -I./$RAYLIB_NAME /include -I$CUDA_HOME /include \
245- -DPLATFORM_DESKTOP \
246- -fno-semantic-interposition -fvisibility=hidden \
247- -fPIC -fopenmp \
248- " $BINDING_SRC " -o " $STATIC_OBJ "
249- ar rcs " $STATIC_LIB " " $STATIC_OBJ "
250-
251- # Brittle hack: have to extract the tensor type from the static lib to build trainer
252- OBS_TENSOR_T=$( awk ' /^#define OBS_TENSOR_T/{print $3}' " $BINDING_SRC " )
253- if [ -z " $OBS_TENSOR_T " ]; then
254- echo " Error: Could not find OBS_TENSOR_T in $BINDING_SRC "
255- exit 1
241+ if [ " $MODE " = " cpu" ]; then
242+ echo " Compiling static library for $ENV ..."
243+ ${CC:- clang} -c " ${CLANG_OPT[@]} " $EXTRA_CFLAGS \
244+ -I. -Isrc -I$SRC_DIR -Ivendor \
245+ -I./$RAYLIB_NAME /include -I$CUDA_HOME /include \
246+ -DPLATFORM_DESKTOP \
247+ -fno-semantic-interposition -fvisibility=hidden \
248+ -fPIC -fopenmp \
249+ " $BINDING_SRC " -o " $STATIC_OBJ "
250+ ar rcs " $STATIC_LIB " " $STATIC_OBJ "
256251fi
257252
258253if [ -z " $MODE " ]; then
259- echo " Compiling CUDA ($ARCH ) training backend..."
254+ echo " Compiling CUDA ($ARCH ) training backend with $ENV binding ..."
260255 $NVCC -c -arch=$ARCH -Xcompiler -fPIC \
261256 -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \
262257 -Xcompiler=-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION \
263258 -Xcompiler=-DPLATFORM_DESKTOP \
264259 -std=c++17 \
265- -I. -Isrc \
260+ -I. -Isrc -I $SRC_DIR -Ivendor \
266261 -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE -I$NUMPY_INCLUDE \
267262 -I$CUDA_HOME /include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME /include \
268263 -Xcompiler=-fopenmp \
269- -DOBS_TENSOR_T=$OBS_TENSOR_T \
264+ -Xcompiler=-ffunction-sections \
265+ -Xcompiler=-fdata-sections \
266+ -DENV_BINDING_SRC=\" $BINDING_SRC \" \
270267 -DENV_NAME=$ENV \
271268 $PRECISION $NVCC_OPT \
272269 src/bindings.cu -o build/bindings.o
273270
274271 LINK_CMD=(
275272 ${CXX:- g++} -shared -fPIC -fopenmp
276- build/bindings.o " $STATIC_LIB " " $ RAYLIB_A"
273+ build/bindings.o " $RAYLIB_A "
277274 -L$CUDA_HOME /lib64 $CUDNN_LFLAG $NCCL_LFLAG
278275 " ${WHEEL_RPATH_FLAGS[@]} "
279276 -lcudart -lnccl -lnvidia-ml -lcublas -lcusolver -lcurand -lcudnn
@@ -292,7 +289,6 @@ elif [ "$MODE" = "cpu" ]; then
292289 -std=c++17 \
293290 -I. -Isrc \
294291 -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE \
295- -DOBS_TENSOR_T=$OBS_TENSOR_T \
296292 -DENV_NAME=$ENV \
297293 $PRECISION $LINK_OPT \
298294 src/bindings_cpu.cpp -o build/bindings_cpu.o
@@ -311,13 +307,13 @@ elif [ "$MODE" = "profile" ]; then
311307 $NVCC $NVCC_OPT -arch=$ARCH -std=c++17 \
312308 -I. -Isrc -I$SRC_DIR -Ivendor \
313309 -I$CUDA_HOME /include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME /include \
314- -DOBS_TENSOR_T=$OBS_TENSOR_T \
315310 -DENV_NAME=$ENV \
311+ -DENV_BINDING_SRC=\" $BINDING_SRC \" \
316312 -Xcompiler=-DPLATFORM_DESKTOP \
317313 $PRECISION \
318314 -Xcompiler=-fopenmp \
319315 tests/profile_kernels.cu vendor/ini.c \
320- " $STATIC_LIB " " $ RAYLIB_A" \
316+ " $RAYLIB_A " \
321317 -lnccl -lnvidia-ml -lcublas -lcurand -lcudnn \
322318 -lGL -lm -lpthread $OMP_LIB \
323319 -o profile
0 commit comments