diff --git a/deepray/__init__.py b/deepray/__init__.py index 7f9ba80b..3447f5a1 100644 --- a/deepray/__init__.py +++ b/deepray/__init__.py @@ -48,7 +48,7 @@ def init(): logger.debug(f"sys.argv = {sys.argv}") # sys.argv from Horovod flags.FLAGS(sys.argv, known_only=True) - gpus = tf.config.experimental.list_physical_devices("GPU") + gpus = tf.config.list_physical_devices("GPU") for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) diff --git a/deepray/custom_ops/BUILD b/deepray/custom_ops/BUILD index 78b85040..ab0bc47f 100644 --- a/deepray/custom_ops/BUILD +++ b/deepray/custom_ops/BUILD @@ -14,7 +14,7 @@ py_library( # "//deepray/custom_ops/multiplex_4:multiplex_4_op", "//deepray/custom_ops/parquet_dataset", "//deepray/custom_ops/seq2seq", - # "//deepray/custom_ops/simple_hash_table", + "//deepray/custom_ops/simple_hash_table", # "//deepray/custom_ops/sleep:sleep_op", "//deepray/custom_ops/training_ops", "//deepray/custom_ops/unique_ops", diff --git a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc index a603fee0..059dc51a 100644 --- a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc +++ b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc @@ -93,8 +93,6 @@ class SimpleHashTableResource : public ::tensorflow::ResourceBase { Status Import(const Tensor& keys, const Tensor& values) { const auto key_values = keys.flat(); const auto value_values = values.flat(); - LOG(INFO) << "key_values = " << key_values; - LOG(INFO) << "value_values = " << value_values; mutex_lock l(mu_); table_.clear(); diff --git a/tools/build_base_container.sh b/tools/build_base_container.sh index e325a461..12278c54 100755 --- a/tools/build_base_container.sh +++ b/tools/build_base_container.sh @@ -9,10 +9,7 @@ OS_VERSION=${4:-"20.04"} docker build \ -f tools/docker/base_container.Dockerfile \ - --network=host \ --progress=plain \ - --build-arg 
http_proxy=http://127.0.0.1:7890 \ - --build-arg https_proxy=http://127.0.0.1:7890 \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg TF_VERSION=${TF_VERSION} \ --build-arg PY_VERSION=${PY_VERSION} \ diff --git a/tools/build_dev_container.sh b/tools/build_dev_container.sh index 66653e57..f452ec2e 100755 --- a/tools/build_dev_container.sh +++ b/tools/build_dev_container.sh @@ -4,8 +4,8 @@ set -x -e PY_VERSION=${1:-"3.10"} TF_VERSION=${2:-"2.15.1"} -OS_VERSION=${3:-"20.04"} -CUDA_VERSION=${4:-"12.2.2"} +CUDA_VERSION=${3:-"12.2.2"} +OS_VERSION=${4:-"20.04"} docker build \ -f tools/docker/dev_container.Dockerfile \ diff --git a/tools/docker/base_container.Dockerfile b/tools/docker/base_container.Dockerfile index d349ff29..794511d2 100644 --- a/tools/docker/base_container.Dockerfile +++ b/tools/docker/base_container.Dockerfile @@ -1,3 +1,4 @@ +# syntax=docker/dockerfile:1.4 # Copyright 2025 The Deepray Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,6 +47,8 @@ ARG PY_VERSION # Setup openmpi COPY --from=openmpi_builder /opt/openmpi /opt/openmpi +COPY --from=openmpi_builder /opt/ucx /opt/ucx +COPY --from=openmpi_builder /opt/ucc /opt/ucc ENV PATH=${PATH}:/opt/openmpi/bin \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/openmpi/lib RUN mpirun --version diff --git a/tools/docker/dev_container.Dockerfile b/tools/docker/dev_container.Dockerfile index 237682b8..b31a9d4d 100644 --- a/tools/docker/dev_container.Dockerfile +++ b/tools/docker/dev_container.Dockerfile @@ -1,4 +1,4 @@ -#syntax=docker/dockerfile:1.4 +# syntax=docker/dockerfile:1.4 # Copyright 2025 The Deepray Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -58,6 +58,8 @@ ARG PY_VERSION # Setup openmpi COPY --from=openmpi_builder /opt/openmpi /opt/openmpi +COPY --from=openmpi_builder /opt/ucx /opt/ucx +COPY --from=openmpi_builder /opt/ucc /opt/ucc ENV PATH=${PATH}:/opt/openmpi/bin \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/openmpi/lib RUN mpirun --version diff --git a/tools/install_deps/install_cmake.sh b/tools/install_deps/install_cmake.sh index faaaaba0..f382423c 100644 --- a/tools/install_deps/install_cmake.sh +++ b/tools/install_deps/install_cmake.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -set -x -e +set -xeuo pipefail CMAKE_VERSION=${1:-"3.31.0"} @@ -27,12 +27,12 @@ rm /tmp/cmake-install.sh cat >bashrc.txt <<'EOF' export CMAKE_HOME=/opt/cmake export PATH="${CMAKE_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${CMAKE_HOME}/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${CMAKE_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" EOF cat bashrc.txt >>/root/.bashrc export CMAKE_HOME=/opt/cmake export PATH="${CMAKE_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${CMAKE_HOME}/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${CMAKE_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" cmake --version diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 6dca54ce..fb4c1f12 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -13,21 +13,47 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -set -x -e +set -xeuo pipefail OPENMPI_VERSION=${1:-"5.0.8"} +UCX_VERSION=${2:-"1.19.0"} +UCC_VERSION=${3:-"1.5.1"} +export BUILD_DIR=/tmp +export INSTALL_DIR=/opt +export UCX_DIR=${INSTALL_DIR}/ucx +export UCC_DIR=${INSTALL_DIR}/ucc +export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ - wget build-essential + wget build-essential autoconf automake libtool git -# Install Open MPI -mkdir /tmp/openmpi && - cd /tmp/openmpi +# Install UCX +mkdir "${BUILD_DIR}/ucx" && cd "${BUILD_DIR}/ucx" +wget --no-check-certificate --progress=dot:mega -O ucx-${UCX_VERSION}.tar.gz https://github.com/openucx/ucx/archive/refs/tags/v${UCX_VERSION}.tar.gz +tar -zxf ucx-${UCX_VERSION}.tar.gz +cd ucx-${UCX_VERSION} +./autogen.sh +./configure --prefix=${UCX_DIR} +make -j $(nproc) +make install + +# Install UCC +mkdir "${BUILD_DIR}/ucc" && cd "${BUILD_DIR}/ucc" +wget --no-check-certificate --progress=dot:mega -O ucc-${UCC_VERSION}.tar.gz https://github.com/openucx/ucc/archive/refs/tags/v${UCC_VERSION}.tar.gz +tar -zxf ucc-${UCC_VERSION}.tar.gz +cd ucc-${UCC_VERSION} +./autogen.sh +./configure --prefix=${UCC_DIR} --with-ucx=${UCX_DIR} +make -j $(nproc) +make install + +# Install OpenMPI +mkdir "${BUILD_DIR}/openmpi" && cd "${BUILD_DIR}/openmpi" wget --no-check-certificate --progress=dot:mega -O openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OPENMPI_VERSION}.tar.gz tar -zxf openmpi-${OPENMPI_VERSION}.tar.gz cd openmpi-${OPENMPI_VERSION} -./configure --enable-orterun-prefix-by-default --prefix=/opt/openmpi +./configure --enable-orterun-prefix-by-default --prefix=${OMPI_DIR} --with-ucx=${UCX_DIR} --with-ucc=${UCC_DIR} make -j $(nproc) make install ldconfig @@ -36,13 +62,13 @@ ldconfig cat >bashrc.txt <<'EOF' export OPENMPI_HOME=/opt/openmpi export PATH="${OPENMPI_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib:${LD_LIBRARY_PATH}" +export
LD_LIBRARY_PATH="${OPENMPI_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" EOF cat bashrc.txt >>/root/.bashrc export OPENMPI_HOME=/opt/openmpi export PATH="${OPENMPI_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" mpirun --version && - rm -rf /tmp/openmpi + rm -rf "${BUILD_DIR}/ucx" "${BUILD_DIR}/ucc" "${BUILD_DIR}/openmpi" diff --git a/tools/install_deps/install_python.sh b/tools/install_deps/install_python.sh index cc388959..a2dfaae4 100644 --- a/tools/install_deps/install_python.sh +++ b/tools/install_deps/install_python.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -set -x -e +set -xeuo pipefail PY_VERSION=${1:-"3.10"}