From 2f7249dd055d379d2aa6fdda0eebd4dcb2d6620c Mon Sep 17 00:00:00 2001 From: fuhailin Date: Fri, 17 Oct 2025 20:47:48 +0800 Subject: [PATCH 01/11] [ThirdParty] Enable ucx for openmpi --- deepray/__init__.py | 2 +- deepray/custom_ops/BUILD | 2 +- .../simple_hash_table_kernel.cc | 2 -- tools/docker/base_container.Dockerfile | 2 ++ tools/install_deps/install_openmpi.sh | 28 +++++++++++++++++-- 5 files changed, 29 insertions(+), 7 deletions(-) diff --git a/deepray/__init__.py b/deepray/__init__.py index 7f9ba80b..3447f5a1 100644 --- a/deepray/__init__.py +++ b/deepray/__init__.py @@ -48,7 +48,7 @@ def init(): logger.debug(f"sys.argv = {sys.argv}") # sys.argv from Horovod flags.FLAGS(sys.argv, known_only=True) - gpus = tf.config.experimental.list_physical_devices("GPU") + gpus = tf.config.list_physical_devices("GPU") for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) diff --git a/deepray/custom_ops/BUILD b/deepray/custom_ops/BUILD index 78b85040..ab0bc47f 100644 --- a/deepray/custom_ops/BUILD +++ b/deepray/custom_ops/BUILD @@ -14,7 +14,7 @@ py_library( # "//deepray/custom_ops/multiplex_4:multiplex_4_op", "//deepray/custom_ops/parquet_dataset", "//deepray/custom_ops/seq2seq", - # "//deepray/custom_ops/simple_hash_table", + "//deepray/custom_ops/simple_hash_table", # "//deepray/custom_ops/sleep:sleep_op", "//deepray/custom_ops/training_ops", "//deepray/custom_ops/unique_ops", diff --git a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc index a603fee0..059dc51a 100644 --- a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc +++ b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc @@ -93,8 +93,6 @@ class SimpleHashTableResource : public ::tensorflow::ResourceBase { Status Import(const Tensor& keys, const Tensor& values) { const auto key_values = keys.flat(); const auto value_values = values.flat(); - LOG(INFO) << "key_values = " << key_values; - LOG(INFO) << "value_values = " << value_values; mutex_lock l(mu_); table_.clear(); diff --git a/tools/docker/base_container.Dockerfile b/tools/docker/base_container.Dockerfile index d349ff29..f3c77f55 100644 --- a/tools/docker/base_container.Dockerfile +++ b/tools/docker/base_container.Dockerfile @@ -90,6 +90,8 @@ RUN bash /install_deps/install_openssh.sh # Setup openmpi COPY --from=openmpi_builder /opt/openmpi /opt/openmpi +COPY --from=openmpi_builder /opt/ucx /opt/ucx +COPY --from=openmpi_builder /opt/ucc /opt/ucc ENV PATH=${PATH}:/opt/openmpi/bin \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/openmpi/lib RUN mpirun --version diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 6dca54ce..616c96df 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -16,18 +16,40 @@ set -x -e OPENMPI_VERSION=${1:-"5.0.8"} +export BUILD_DIR=/tmp +export INSTALL_DIR=/opt +export UCX_DIR=${INSTALL_DIR}/ucx +export UCC_DIR=${INSTALL_DIR}/ucc +export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ wget build-essential -# Install Open MPI +# Install UCX +git clone https://github.com/openucx/ucx.git ${BUILD_DIR}/ucx +cd ${BUILD_DIR}/ucx +git checkout v1.5.1 +./autogen.sh +./configure --prefix=${UCX_DIR} +make -j $(nproc) +make install + +# Install UCC +git clone https://github.com/openucx/ucc.git ${BUILD_DIR}/ucc +cd ${BUILD_DIR}/ucc +git checkout v1.5.1 +./autogen.sh +./configure --prefix=${UCC_DIR} --with-ucx=${UCX_DIR} +make -j $(nproc) && make install + +# Install OpenMPI mkdir /tmp/openmpi && cd /tmp/openmpi wget --no-check-certificate --progress=dot:mega -O openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OPENMPI_VERSION}.tar.gz tar -zxf openmpi-${OPENMPI_VERSION}.tar.gz cd openmpi-${OPENMPI_VERSION} -./configure --enable-orterun-prefix-by-default --prefix=/opt/openmpi +./configure --enable-orterun-prefix-by-default --prefix=${OMPI_DIR} --with-ucx=${UCX_DIR} --with-ucc=${UCC_DIR} make -j $(nproc) make install ldconfig @@ -45,4 +67,4 @@ export PATH="${OPENMPI_HOME}/bin:${PATH}" export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib:${LD_LIBRARY_PATH}" mpirun --version && - rm -rf /tmp/openmpi + rm -rf ${BUILD_DIR} From ae15894181608beac5a03eee89bb9bcf57d59781 Mon Sep 17 00:00:00 2001 From: fuhailin Date: Fri, 17 Oct 2025 20:55:49 +0800 Subject: [PATCH 02/11] [CI] Fix CICD missing git --- tools/install_deps/install_openmpi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 616c96df..1b4ade96 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -24,7 +24,7 @@ export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ - wget build-essential + wget build-essential git # Install UCX git clone https://github.com/openucx/ucx.git ${BUILD_DIR}/ucx From 04b964f1aefa60b503c56aec3246f2c8b83db0cc Mon Sep 17 00:00:00 2001 From: fuhailin Date: Fri, 17 Oct 2025 23:05:47 +0800 Subject: [PATCH 03/11] [CI] Fix CICD error --- tools/install_deps/install_openmpi.sh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 1b4ade96..48561722 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -16,6 +16,8 @@ set -x -e OPENMPI_VERSION=${1:-"5.0.8"} +UCX_VERSION=${2:-"1.19.0"} +UCC_VERSION=${3:-"1.5.1"} export BUILD_DIR=/tmp export INSTALL_DIR=/opt export UCX_DIR=${INSTALL_DIR}/ucx @@ -24,28 +26,29 @@ export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ - wget build-essential git + wget build-essential # Install UCX -git clone https://github.com/openucx/ucx.git ${BUILD_DIR}/ucx -cd ${BUILD_DIR}/ucx -git checkout v1.5.1 +mkdir /tmp/ucx && cd /tmp/ucx +wget --no-check-certificate --progress=dot:mega -O ucx-${UCX_VERSION}.tar.gz https://github.com/openucx/ucx/archive/refs/tags/v${UCX_VERSION}.tar.gz +tar -zxf ucx-${UCX_VERSION}.tar.gz +cd ucx-${UCX_VERSION} ./autogen.sh ./configure --prefix=${UCX_DIR} make -j $(nproc) make install # Install UCC -git clone https://github.com/openucx/ucc.git ${BUILD_DIR}/ucc -cd ${BUILD_DIR}/ucc -git checkout v1.5.1 +mkdir /tmp/ucc && cd /tmp/ucc +wget --no-check-certificate --progress=dot:mega -O ucc-${UCC_VERSION}.tar.gz https://github.com/openucx/ucc/archive/refs/tags/v${UCC_VERSION}.tar.gz +tar -zxf ucc-${UCC_VERSION}.tar.gz +cd ucc-${UCC_VERSION} ./autogen.sh ./configure --prefix=${UCC_DIR} --with-ucx=${UCX_DIR} make -j $(nproc) && make install # Install OpenMPI -mkdir /tmp/openmpi && - cd /tmp/openmpi +mkdir /tmp/openmpi && cd /tmp/openmpi wget --no-check-certificate --progress=dot:mega -O openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OPENMPI_VERSION}.tar.gz tar -zxf openmpi-${OPENMPI_VERSION}.tar.gz cd openmpi-${OPENMPI_VERSION} From 31d6e916b99e22ac4960dd138f44cc61faae8817 Mon Sep 17 00:00:00 2001 From: fuhailin Date: Fri, 17 Oct 2025 23:15:38 +0800 Subject: [PATCH 04/11] [CI] Add autoconf for ucx building --- tools/install_deps/install_openmpi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 48561722..2aee06ab 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -26,7 +26,7 @@ export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ - wget build-essential + wget build-essential autoconf # Install UCX mkdir /tmp/ucx && cd /tmp/ucx From b3bbccc70a8bbbbac5626edc2eb709eee9c7e448 Mon Sep 17 00:00:00 2001 From: fuhailin Date: Fri, 17 Oct 2025 23:39:58 +0800 Subject: [PATCH 05/11] [CI] Add libtool for ucx building --- tools/install_deps/install_openmpi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 2aee06ab..39208852 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -26,7 +26,7 @@ export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ - wget build-essential autoconf + wget build-essential autoconf libtool # Install UCX mkdir /tmp/ucx && cd /tmp/ucx From 89c40c576ed0410480c8471cc59bf5914089a0e6 Mon Sep 17 00:00:00 2001 From: fuhailin Date: Fri, 17 Oct 2025 23:47:19 +0800 Subject: [PATCH 06/11] [CI] Add automake for ucx building --- tools/install_deps/install_openmpi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 39208852..8f32e975 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -26,7 +26,7 @@ export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ - wget build-essential autoconf libtool + wget build-essential autoconf automake libtool # Install UCX mkdir /tmp/ucx && cd /tmp/ucx From 93b6137d4b858e521de95c0d2d9547a01f1a19c0 Mon Sep 17 00:00:00 2001 From: fuhailin Date: Mon, 20 Oct 2025 12:07:56 +0800 Subject: [PATCH 07/11] [CI] Update CICD --- tools/install_deps/install_openmpi.sh | 7 ++++--- tools/install_deps/install_python.sh | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 8f32e975..0223e583 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -set -x -e +set -xeuo pipefail OPENMPI_VERSION=${1:-"5.0.8"} UCX_VERSION=${2:-"1.19.0"} @@ -26,7 +26,7 @@ export OMPI_DIR=${INSTALL_DIR}/openmpi apt-get update && apt-get install --no-install-recommends --yes \ - wget build-essential autoconf automake libtool + wget build-essential autoconf automake libtool git # Install UCX mkdir /tmp/ucx && cd /tmp/ucx @@ -45,7 +45,8 @@ tar -zxf ucc-${UCC_VERSION}.tar.gz cd ucc-${UCC_VERSION} ./autogen.sh ./configure --prefix=${UCC_DIR} --with-ucx=${UCX_DIR} -make -j $(nproc) && make install +make -j $(nproc) +make install # Install OpenMPI mkdir /tmp/openmpi && cd /tmp/openmpi diff --git a/tools/install_deps/install_python.sh b/tools/install_deps/install_python.sh index cc388959..a2dfaae4 100644 --- a/tools/install_deps/install_python.sh +++ b/tools/install_deps/install_python.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -set -x -e +set -xeuo pipefail PY_VERSION=${1:-"3.10"} From e23714561cf002412a293487ea00ce221fe130ab Mon Sep 17 00:00:00 2001 From: fuhailin Date: Mon, 20 Oct 2025 12:27:58 +0800 Subject: [PATCH 08/11] [CI] Smart append LD_LIBRARY_PATH in shell when nounset --- tools/install_deps/install_openmpi.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/install_deps/install_openmpi.sh b/tools/install_deps/install_openmpi.sh index 0223e583..fb4c1f12 100644 --- a/tools/install_deps/install_openmpi.sh +++ b/tools/install_deps/install_openmpi.sh @@ -62,13 +62,13 @@ ldconfig cat >bashrc.txt <<'EOF' export OPENMPI_HOME=/opt/openmpi export PATH="${OPENMPI_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" EOF cat bashrc.txt >>/root/.bashrc export OPENMPI_HOME=/opt/openmpi export PATH="${OPENMPI_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${OPENMPI_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" mpirun --version && rm -rf ${BUILD_DIR} From d01648abc09d53118c9fc690676d93abfe47a36c Mon Sep 17 00:00:00 2001 From: fuhailin Date: Mon, 20 Oct 2025 16:17:59 +0800 Subject: [PATCH 09/11] [CI] Update CICD for dev_container --- tools/build_base_container.sh | 2 +- tools/build_dev_container.sh | 7 +++++-- tools/docker/dev_container.Dockerfile | 2 ++ tools/install_deps/install_cmake.sh | 6 +++--- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/build_base_container.sh b/tools/build_base_container.sh index e325a461..901aff47 100755 --- a/tools/build_base_container.sh +++ b/tools/build_base_container.sh @@ -10,9 +10,9 @@ OS_VERSION=${4:-"20.04"} docker build \ -f tools/docker/base_container.Dockerfile \ --network=host \ - --progress=plain \ --build-arg http_proxy=http://127.0.0.1:7890 \ --build-arg https_proxy=http://127.0.0.1:7890 \ + --progress=plain \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg TF_VERSION=${TF_VERSION} \ --build-arg PY_VERSION=${PY_VERSION} \ diff --git a/tools/build_dev_container.sh b/tools/build_dev_container.sh index 66653e57..f849e3e2 100755 --- a/tools/build_dev_container.sh +++ b/tools/build_dev_container.sh @@ -4,11 +4,14 @@ set -x -e PY_VERSION=${1:-"3.10"} TF_VERSION=${2:-"2.15.1"} -OS_VERSION=${3:-"20.04"} -CUDA_VERSION=${4:-"12.2.2"} +CUDA_VERSION=${3:-"12.2.2"} +OS_VERSION=${4:-"20.04"} docker build \ -f tools/docker/dev_container.Dockerfile \ + --network=host \ + --build-arg http_proxy=http://127.0.0.1:7890 \ + --build-arg https_proxy=http://127.0.0.1:7890 \ --progress=plain \ --build-arg TF_PACKAGE=tensorflow \ --build-arg PY_VERSION=${PY_VERSION} \ diff --git a/tools/docker/dev_container.Dockerfile b/tools/docker/dev_container.Dockerfile index 237682b8..d8d2db44 100644 --- a/tools/docker/dev_container.Dockerfile +++ b/tools/docker/dev_container.Dockerfile @@ -58,6 +58,8 @@ ARG PY_VERSION # Setup openmpi COPY --from=openmpi_builder /opt/openmpi /opt/openmpi +COPY --from=openmpi_builder /opt/ucx /opt/ucx +COPY --from=openmpi_builder /opt/ucc /opt/ucc ENV PATH=${PATH}:/opt/openmpi/bin \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/openmpi/lib RUN mpirun --version diff --git a/tools/install_deps/install_cmake.sh b/tools/install_deps/install_cmake.sh index faaaaba0..f382423c 100644 --- a/tools/install_deps/install_cmake.sh +++ b/tools/install_deps/install_cmake.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -set -x -e +set -xeuo pipefail CMAKE_VERSION=${1:-"3.31.0"} @@ -27,12 +27,12 @@ rm /tmp/cmake-install.sh cat >bashrc.txt <<'EOF' export CMAKE_HOME=/opt/cmake export PATH="${CMAKE_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${CMAKE_HOME}/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${CMAKE_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" EOF cat bashrc.txt >>/root/.bashrc export CMAKE_HOME=/opt/cmake export PATH="${CMAKE_HOME}/bin:${PATH}" -export LD_LIBRARY_PATH="${CMAKE_HOME}/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${CMAKE_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" cmake --version From 9e14ed0e9e7f500727c400ae8447ac4bd9c9cb84 Mon Sep 17 00:00:00 2001 From: fuhailin Date: Mon, 20 Oct 2025 17:15:44 +0800 Subject: [PATCH 10/11] [CI] Update dockerfile --- tools/build_base_container.sh | 3 --- tools/build_dev_container.sh | 3 --- tools/docker/base_container.Dockerfile | 1 + tools/docker/dev_container.Dockerfile | 2 +- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/build_base_container.sh b/tools/build_base_container.sh index 901aff47..12278c54 100755 --- a/tools/build_base_container.sh +++ b/tools/build_base_container.sh @@ -9,9 +9,6 @@ OS_VERSION=${4:-"20.04"} docker build \ -f tools/docker/base_container.Dockerfile \ - --network=host \ - --build-arg http_proxy=http://127.0.0.1:7890 \ - --build-arg https_proxy=http://127.0.0.1:7890 \ --progress=plain \ --build-arg CUDA_VERSION=${CUDA_VERSION} \ --build-arg TF_VERSION=${TF_VERSION} \ diff --git a/tools/build_dev_container.sh b/tools/build_dev_container.sh index f849e3e2..f452ec2e 100755 --- a/tools/build_dev_container.sh +++ b/tools/build_dev_container.sh @@ -9,9 +9,6 @@ OS_VERSION=${4:-"20.04"} docker build \ -f tools/docker/dev_container.Dockerfile \ - --network=host \ - --build-arg http_proxy=http://127.0.0.1:7890 \ - --build-arg https_proxy=http://127.0.0.1:7890 \ --progress=plain \ --build-arg TF_PACKAGE=tensorflow \ --build-arg PY_VERSION=${PY_VERSION} \ diff --git a/tools/docker/base_container.Dockerfile b/tools/docker/base_container.Dockerfile index f3c77f55..7175f648 100644 --- a/tools/docker/base_container.Dockerfile +++ b/tools/docker/base_container.Dockerfile @@ -1,3 +1,4 @@ +# syntax=docker/dockerfile:1.4 # Copyright 2025 The Deepray Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tools/docker/dev_container.Dockerfile b/tools/docker/dev_container.Dockerfile index d8d2db44..b31a9d4d 100644 --- a/tools/docker/dev_container.Dockerfile +++ b/tools/docker/dev_container.Dockerfile @@ -1,4 +1,4 @@ -#syntax=docker/dockerfile:1.4 +# syntax=docker/dockerfile:1.4 # Copyright 2025 The Deepray Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); From a43d78684385da151fa62269bf08e43da260ea8c Mon Sep 17 00:00:00 2001 From: fuhailin Date: Mon, 20 Oct 2025 21:59:57 +0800 Subject: [PATCH 11/11] [CI] Fix ucx lib for base container --- tools/docker/base_container.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/docker/base_container.Dockerfile b/tools/docker/base_container.Dockerfile index 7175f648..794511d2 100644 --- a/tools/docker/base_container.Dockerfile +++ b/tools/docker/base_container.Dockerfile @@ -47,6 +47,8 @@ ARG PY_VERSION # Setup openmpi COPY --from=openmpi_builder /opt/openmpi /opt/openmpi +COPY --from=openmpi_builder /opt/ucx /opt/ucx +COPY --from=openmpi_builder /opt/ucc /opt/ucc ENV PATH=${PATH}:/opt/openmpi/bin \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/openmpi/lib RUN mpirun --version @@ -91,8 +93,6 @@ RUN bash /install_deps/install_openssh.sh # Setup openmpi COPY --from=openmpi_builder /opt/openmpi /opt/openmpi -COPY --from=openmpi_builder /opt/ucx /opt/ucx -COPY --from=openmpi_builder /opt/ucc /opt/ucc ENV PATH=${PATH}:/opt/openmpi/bin \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/openmpi/lib RUN mpirun --version