From b1b84ae8daace56444e9f6c6e45e312762f1440e Mon Sep 17 00:00:00 2001
From: liyuan
Date: Wed, 23 Oct 2024 17:34:16 +0800
Subject: [PATCH 01/10] update v2410 rapids release

Signed-off-by: liyuan
---
 spark-rapids/README.md       | 12 ++++++------
 spark-rapids/spark-rapids.sh |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/spark-rapids/README.md b/spark-rapids/README.md
index 2e5863988..5636c6832 100644
--- a/spark-rapids/README.md
+++ b/spark-rapids/README.md
@@ -20,15 +20,15 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 
 To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3
 
-* Apache Spark 3.0+
+* Apache Spark 3.2+
 * Hardware Requirements
-  * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later)
+  * NVIDIA Volta™ GPU architecture or better (V100, T4, A10/A100, L4 and later)
   * Multi-node clusters with homogenous GPU configuration
 * Software Requirements
-  * NVIDIA GPU driver 440.33+
-  * CUDA v11.5/v11.0/v10.2/v10.1
+  * NVIDIA GPU driver R470+
+  * CUDA v11.0+
   * NCCL 2.11.4+
-  * Ubuntu 18.04, Ubuntu 20.04 or Rocky Linux 7, Rocky Linux8, Debian 10, Debian 11
+  * Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8, Debian 10, Debian 11
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -63,7 +63,7 @@ export CUDA_VER=11.5
 
 gcloud dataproc clusters create $CLUSTER_NAME \
     --region $REGION \
-    --image-version=2.0-ubuntu18 \
+    --image-version=2.1-ubuntu20 \
     --master-machine-type n1-standard-4 \
     --master-boot-disk-size 200 \
     --num-workers $NUM_WORKERS \
diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 492848340..7e359e7d9 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -216,7 +216,7 @@ else
 fi
 
 # Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.10.0"
 readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
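A quick way to sanity-check a cluster created with the README command touched by patch 01 is to confirm the driver came up and that query plans pick up GPU operators. This is a sketch only: it assumes the $CLUSTER_NAME and $REGION variables from the README example, the default Dataproc "-w-0" worker name suffix, and that the plugin surfaces Gpu-prefixed operators in EXPLAIN output when it is active.

```bash
# Confirm the NVIDIA driver installed by the init action is loaded on a worker
# (add --zone if no default compute zone is configured for the project).
gcloud compute ssh "${CLUSTER_NAME}-w-0" --command "nvidia-smi"

# Run a trivial query and look for Gpu* operators in the physical plan.
gcloud dataproc jobs submit spark-sql \
  --cluster "${CLUSTER_NAME}" --region "${REGION}" \
  -e "EXPLAIN SELECT count(*) FROM range(1000) WHERE id > 10"
```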
From 762c72a5b2481f8e70dbb15f0438db34d21ba2c3 Mon Sep 17 00:00:00 2001
From: liyuan
Date: Fri, 25 Oct 2024 15:26:19 +0800
Subject: [PATCH 02/10] update the readme doc

Signed-off-by: liyuan
---
 spark-rapids/README.md | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/spark-rapids/README.md b/spark-rapids/README.md
index 5636c6832..ff9b46953 100644
--- a/spark-rapids/README.md
+++ b/spark-rapids/README.md
@@ -17,18 +17,8 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 ## RAPIDS Accelerator For Apache Spark
 
 ### Prerequisites
-
-To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3
-
-* Apache Spark 3.2+
-* Hardware Requirements
-  * NVIDIA Volta™ GPU architecture or better (V100, T4, A10/A100, L4 and later)
-  * Multi-node clusters with homogenous GPU configuration
-* Software Requirements
-  * NVIDIA GPU driver R470+
-  * CUDA v11.0+
-  * NCCL 2.11.4+
-  * Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8, Debian 10, Debian 11
+Please find the [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
+official doc for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -59,11 +49,10 @@ export GCS_BUCKET=
 export REGION=
 export NUM_GPUS=1
 export NUM_WORKERS=2
-export CUDA_VER=11.5
 
 gcloud dataproc clusters create $CLUSTER_NAME \
     --region $REGION \
-    --image-version=2.1-ubuntu20 \
+    --image-version=2.2-ubuntu22 \
     --master-machine-type n1-standard-4 \
     --master-boot-disk-size 200 \
     --num-workers $NUM_WORKERS \
@@ -71,8 +60,6 @@ gcloud dataproc clusters create $CLUSTER_NAME \
     --worker-machine-type n1-standard-8 \
     --num-worker-local-ssds 1 \
     --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
-    --optional-components=JUPYTER,ZEPPELIN \
-    --metadata gpu-driver-provider="NVIDIA",rapids-runtime="SPARK",cuda-version="$CUDA_VER" \
     --bucket $GCS_BUCKET \
     --subnet=default \
     --enable-component-gateway

From 5bbac9a994ab29cf9f0df70aec45178efbf20dc7 Mon Sep 17 00:00:00 2001
From: liyuan
Date: Fri, 25 Oct 2024 15:27:38 +0800
Subject: [PATCH 03/10] update the readme doc

Signed-off-by: liyuan
---
 spark-rapids/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark-rapids/README.md b/spark-rapids/README.md
index ff9b46953..92f55fea3 100644
--- a/spark-rapids/README.md
+++ b/spark-rapids/README.md
@@ -18,7 +18,7 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 
 ### Prerequisites
 Please find the [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
-official doc for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
+official document for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with

From 5e7302798ca8029b7bb78ec6c2d2b977f46c0bec Mon Sep 17 00:00:00 2001
From: liyuan
Date: Tue, 24 Dec 2024 14:38:45 +0800
Subject: [PATCH 04/10] update v2412 version

Signed-off-by: liyuan
---
 spark-rapids/spark-rapids.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 7e359e7d9..aaab584e3 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -216,7 +216,7 @@ else
 fi
 
 # Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.10.0"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
 readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
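Patches 01 and 04 only move DEFAULT_SPARK_RAPIDS_VERSION forward; because spark-rapids.sh resolves the version through get_metadata_attribute, a cluster can pin a different plugin build without editing the script. A minimal sketch, assuming the spark-rapids-version and xgboost-version metadata keys shown in the diff above; the version values are examples only, and the other create flags from the README example are omitted for brevity.

```bash
# Pin the plugin and XGBoost versions at cluster-creation time via metadata,
# overriding DEFAULT_SPARK_RAPIDS_VERSION / DEFAULT_XGBOOST_VERSION.
gcloud dataproc clusters create "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh" \
  --metadata spark-rapids-version=24.12.0,xgboost-version=2.1.1
```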
From a12b4fe06f66d4f87c240d3872e8001a0c954534 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 18:36:25 -0800
Subject: [PATCH 05/10] do not recreate git clone on second pass

---
 spark-rapids/spark-rapids.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index aaab584e3..f80af0682 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -418,8 +418,9 @@ function install_nvidia_gpu_driver() {
   mkdir -p "${WORKDIR}"
   pushd $_
   # Fetch open souce kernel module with corresponding tag
-  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
-    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
+  test -d open-gpu-kernel-modules || \
+    git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
+      --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
   cd ${WORKDIR}/open-gpu-kernel-modules
   #
   # build kernel modules

From 8a2a9687ea4f5d453489fe5648fc00bee5c7c58b Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 18:56:24 -0800
Subject: [PATCH 06/10] do not clone compute-gpu-monitoring if it is already extant

---
 spark-rapids/spark-rapids.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index f80af0682..31e14a469 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -526,7 +526,8 @@ function download_agent(){
   mkdir -p /opt/google
   chmod 777 /opt/google
   cd /opt/google
-  execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
 }
 
 function install_agent_dependency(){
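Patches 05 and 06 apply the same idempotency guard to both git clone call sites, so a second pass of the init action does not fail on an already-present checkout. Below is a sketch of that pattern factored into a helper; clone_if_absent is a hypothetical name and not part of spark-rapids.sh.

```bash
# Guard pattern from patches 05/06: skip the clone when the target directory
# already exists (for example on a re-run of the init action).
function clone_if_absent() {
  local -r repo_url="$1" dest_dir="$2"; shift 2
  test -d "${dest_dir}" || git clone "$@" "${repo_url}" "${dest_dir}"
}

clone_if_absent https://github.com/NVIDIA/open-gpu-kernel-modules.git \
  open-gpu-kernel-modules --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
```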
From 57d95f2d6df266f8a65c5b6ba02472a276c5ac91 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 19:13:45 -0800
Subject: [PATCH 07/10] tricking the test framework into only running our tests while using new Dockerfile

---
 cloudbuild/Dockerfile   | 22 +++++++++++++++-------
 cloudbuild/presubmit.sh |  1 +
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index 94e6e6cb3..aebaffd84 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -9,16 +9,24 @@ COPY --chown=ia-tests:ia-tests . /init-actions
 
 # Install Bazel:
 # https://docs.bazel.build/versions/master/install-ubuntu.html
-ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg
+ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg \
+    bazel_version=7.4.0 \
+    bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" \
+    bazel_repo_file="/etc/apt/sources.list.d/bazel.list" \
+    DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get install -y -qq curl >/dev/null 2>&1 && \
     apt-get clean
-RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | \
-    gpg --dearmor -o "${bazel_kr_path}"
-RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | \
-    dd of=/etc/apt/sources.list.d/bazel.list status=none && \
+RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
+    gpg --import --no-default-keyring --keyring "${bazel_kr_path}" && \
+    echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \
+    dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
-RUN apt-get autoremove -y -qq && \
-    apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
     apt-get clean
+
+# Set bazel-${bazel_version} as the default bazel alternative in this container
+RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
+    update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
 
 USER ia-tests
diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index eec7adb76..d8b2bed17 100755
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue # remove before merge
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0

From ab836653744034ffd18b5c1f48c7c4682298708e Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 19:51:14 -0800
Subject: [PATCH 08/10] gathering timing data for some long-running sections of the installer

---
 spark-rapids/spark-rapids.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 31e14a469..0b4aabd57 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -261,7 +261,7 @@ IS_MIG_ENABLED=0
 function execute_with_retries() {
   local -r cmd=$1
   for ((i = 0; i < 10; i++)); do
-    if eval "$cmd"; then
+    if time eval "$cmd"; then
       return 0
     fi
     sleep 5
@@ -452,7 +452,7 @@ function install_nvidia_gpu_driver() {
     curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
       "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
       -o cuda.run
-    bash cuda.run --silent --toolkit --no-opengl-libs
+    time bash cuda.run --silent --toolkit --no-opengl-libs
     rm cuda.run
   else
     # Install from repo provided by NV
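Patch 08 wraps two long-running steps in bash's time keyword, so the real/user/sys figures for each retried command and for the CUDA runfile install end up in the init action's startup log. If a labelled, grep-able duration line is preferred, an explicit timer along the following lines would also work; log_duration is a hypothetical helper, not part of spark-rapids.sh.

```bash
# Hypothetical alternative to `time`: emit one labelled duration line per step.
function log_duration() {
  local -r label="$1"; shift
  local -r start="$(date +%s)"
  "$@"
  local -r rc=$?   # exit status of the wrapped command
  echo "TIMING ${label}: $(( $(date +%s) - start ))s (exit code ${rc})" >&2
  return ${rc}
}

log_duration cuda_runfile bash cuda.run --silent --toolkit --no-opengl-libs
```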
From 00caa8baecf75b9260f9826b1b640da0729d9c88 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 20:39:58 -0800
Subject: [PATCH 09/10] over-commitment on the disk space cleaned up

---
 spark-rapids/test_spark_rapids.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 7af8e3154..6e03f2d62 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
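Patch 09 trims the test clusters' boot disks from 1024GB to 50GB. If there is any doubt that 50GB leaves headroom once the driver, the CUDA toolkit and the Spark jars are installed, a quick check on a test cluster node settles it. A sketch only: the paths below are typical Dataproc locations, and the node name assumes the default "-w-0" worker suffix.

```bash
# Spot-check free space and the largest installs on a worker after the init action ran
# (add --zone if no default compute zone is configured).
gcloud compute ssh "${CLUSTER_NAME}-w-0" --command \
  "df -h / && sudo du -sh /usr/local/cuda* /usr/lib/spark/jars 2>/dev/null"
```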
From f5acb7ef88510686e83cab0db32feaef5c763fee Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 20:55:42 -0800
Subject: [PATCH 10/10] revert to master for final squash+merge

---
 cloudbuild/Dockerfile   | 22 +++++++---------------
 cloudbuild/presubmit.sh |  1 -
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index aebaffd84..94e6e6cb3 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -9,24 +9,16 @@ COPY --chown=ia-tests:ia-tests . /init-actions
 
 # Install Bazel:
 # https://docs.bazel.build/versions/master/install-ubuntu.html
-ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg \
-    bazel_version=7.4.0 \
-    bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" \
-    bazel_repo_file="/etc/apt/sources.list.d/bazel.list" \
-    DEBIAN_FRONTEND=noninteractive
+ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg
 
 RUN apt-get install -y -qq curl >/dev/null 2>&1 && \
     apt-get clean
-RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
-    gpg --import --no-default-keyring --keyring "${bazel_kr_path}" && \
-    echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \
-    dd of="${bazel_repo_file}" status=none && \
+RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | \
+    gpg --dearmor -o "${bazel_kr_path}"
+RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | \
+    dd of=/etc/apt/sources.list.d/bazel.list status=none && \
     apt-get update -qq
-RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq && \
+    apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \
     apt-get clean
-
-# Set bazel-${bazel_version} as the default bazel alternative in this container
-RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
-    update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
 
 USER ia-tests
diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index d8b2bed17..eec7adb76 100755
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,7 +70,6 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
-      continue # remove before merge
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
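Patch 10 puts cloudbuild/Dockerfile and cloudbuild/presubmit.sh back to their pre-series content, so after the final squash+merge only the spark-rapids/ changes remain. A quick pre-merge check along these lines confirms that; it is a sketch and assumes the usual origin/master naming, which may differ.

```bash
# Confirm nothing outside spark-rapids/ still differs from master before merging.
git fetch origin master
git diff --stat origin/master -- cloudbuild/          # expect empty output
git diff --stat origin/master -- . ':!spark-rapids'   # expect empty output
```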