From b1b84ae8daace56444e9f6c6e45e312762f1440e Mon Sep 17 00:00:00 2001
From: liyuan
Date: Wed, 23 Oct 2024 17:34:16 +0800
Subject: [PATCH 01/10] update v2410 rapids release

Signed-off-by: liyuan
---
 spark-rapids/README.md       | 12 ++++++------
 spark-rapids/spark-rapids.sh |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/spark-rapids/README.md b/spark-rapids/README.md
index 2e5863988..5636c6832 100644
--- a/spark-rapids/README.md
+++ b/spark-rapids/README.md
@@ -20,15 +20,15 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 
 To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3
 
-* Apache Spark 3.0+
+* Apache Spark 3.2+
 * Hardware Requirements
-  * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later)
+  * NVIDIA Volta™ GPU architecture or better (V100, T4, A10/A100, L4 and later)
   * Multi-node clusters with homogenous GPU configuration
 * Software Requirements
-  * NVIDIA GPU driver 440.33+
-  * CUDA v11.5/v11.0/v10.2/v10.1
+  * NVIDIA GPU driver R470+
+  * CUDA v11.0+
   * NCCL 2.11.4+
-  * Ubuntu 18.04, Ubuntu 20.04 or Rocky Linux 7, Rocky Linux8, Debian 10, Debian 11
+  * Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8, Debian 10, Debian 11
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -63,7 +63,7 @@ export CUDA_VER=11.5
 
 gcloud dataproc clusters create $CLUSTER_NAME \
     --region $REGION \
-    --image-version=2.0-ubuntu18 \
+    --image-version=2.1-ubuntu20 \
     --master-machine-type n1-standard-4 \
     --master-boot-disk-size 200 \
     --num-workers $NUM_WORKERS \
diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 492848340..7e359e7d9 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -216,7 +216,7 @@ else
 fi
 
 # Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.10.0"
 readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
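A quick way to sanity-check a cluster created with the README command touched by patch 01 is to confirm the driver came up and that query plans pick up GPU operators. This is a sketch only: it assumes the $CLUSTER_NAME and $REGION variables from the README example, the default Dataproc "-w-0" worker name suffix, and that the plugin surfaces Gpu-prefixed operators in EXPLAIN output when it is active.

```bash
# Confirm the NVIDIA driver installed by the init action is loaded on a worker
# (add --zone if no default compute zone is configured for the project).
gcloud compute ssh "${CLUSTER_NAME}-w-0" --command "nvidia-smi"

# Run a trivial query and look for Gpu* operators in the physical plan.
gcloud dataproc jobs submit spark-sql \
  --cluster "${CLUSTER_NAME}" --region "${REGION}" \
  -e "EXPLAIN SELECT count(*) FROM range(1000) WHERE id > 10"
```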
From 762c72a5b2481f8e70dbb15f0438db34d21ba2c3 Mon Sep 17 00:00:00 2001
From: liyuan
Date: Fri, 25 Oct 2024 15:26:19 +0800
Subject: [PATCH 02/10] update the readme doc

Signed-off-by: liyuan
---
 spark-rapids/README.md | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/spark-rapids/README.md b/spark-rapids/README.md
index 5636c6832..ff9b46953 100644
--- a/spark-rapids/README.md
+++ b/spark-rapids/README.md
@@ -17,18 +17,8 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 ## RAPIDS Accelerator For Apache Spark
 
 ### Prerequisites
-
-To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3
-
-* Apache Spark 3.2+
-* Hardware Requirements
-  * NVIDIA Volta™ GPU architecture or better (V100, T4, A10/A100, L4 and later)
-  * Multi-node clusters with homogenous GPU configuration
-* Software Requirements
-  * NVIDIA GPU driver R470+
-  * CUDA v11.0+
-  * NCCL 2.11.4+
-  * Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8, Debian 10, Debian 11
+Please find the [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
+official doc for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -59,11 +49,10 @@ export GCS_BUCKET=
 export REGION=
 export NUM_GPUS=1
 export NUM_WORKERS=2
-export CUDA_VER=11.5
 
 gcloud dataproc clusters create $CLUSTER_NAME \
     --region $REGION \
-    --image-version=2.1-ubuntu20 \
+    --image-version=2.2-ubuntu22 \
     --master-machine-type n1-standard-4 \
     --master-boot-disk-size 200 \
     --num-workers $NUM_WORKERS \
@@ -71,8 +60,6 @@ gcloud dataproc clusters create $CLUSTER_NAME \
     --worker-machine-type n1-standard-8 \
     --num-worker-local-ssds 1 \
     --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
-    --optional-components=JUPYTER,ZEPPELIN \
-    --metadata gpu-driver-provider="NVIDIA",rapids-runtime="SPARK",cuda-version="$CUDA_VER" \
     --bucket $GCS_BUCKET \
     --subnet=default \
     --enable-component-gateway

From 5bbac9a994ab29cf9f0df70aec45178efbf20dc7 Mon Sep 17 00:00:00 2001
From: liyuan
Date: Fri, 25 Oct 2024 15:27:38 +0800
Subject: [PATCH 03/10] update the readme doc

Signed-off-by: liyuan
---
 spark-rapids/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark-rapids/README.md b/spark-rapids/README.md
index ff9b46953..92f55fea3 100644
--- a/spark-rapids/README.md
+++ b/spark-rapids/README.md
@@ -18,7 +18,7 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 
 ### Prerequisites
 Please find the [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
-official doc for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
+official document for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with

From 5e7302798ca8029b7bb78ec6c2d2b977f46c0bec Mon Sep 17 00:00:00 2001
From: liyuan
Date: Tue, 24 Dec 2024 14:38:45 +0800
Subject: [PATCH 04/10] update v2412 version

Signed-off-by: liyuan
---
 spark-rapids/spark-rapids.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 7e359e7d9..aaab584e3 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -216,7 +216,7 @@ else
 fi
 
 # Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.10.0"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
 readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
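Patches 01 and 04 only move DEFAULT_SPARK_RAPIDS_VERSION forward; because spark-rapids.sh resolves the version through get_metadata_attribute, a cluster can pin a different plugin build without editing the script. A minimal sketch, assuming the spark-rapids-version and xgboost-version metadata keys shown in the diff above; the version values are examples only, and the other create flags from the README example are omitted for brevity.

```bash
# Pin the plugin and XGBoost versions at cluster-creation time via metadata,
# overriding DEFAULT_SPARK_RAPIDS_VERSION / DEFAULT_XGBOOST_VERSION.
gcloud dataproc clusters create "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh" \
  --metadata spark-rapids-version=24.12.0,xgboost-version=2.1.1
```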
From a12b4fe06f66d4f87c240d3872e8001a0c954534 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 18:36:25 -0800
Subject: [PATCH 05/10] do not recreate git clone on second pass

---
 spark-rapids/spark-rapids.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index aaab584e3..f80af0682 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -418,8 +418,9 @@ function install_nvidia_gpu_driver() {
   mkdir -p "${WORKDIR}"
   pushd $_
   # Fetch open souce kernel module with corresponding tag
-  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
-    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
+  test -d open-gpu-kernel-modules || \
+    git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
+      --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
   cd ${WORKDIR}/open-gpu-kernel-modules
   #
   # build kernel modules

From 8a2a9687ea4f5d453489fe5648fc00bee5c7c58b Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 18:56:24 -0800
Subject: [PATCH 06/10] do not clone compute-gpu-monitoring if it is already extant

---
 spark-rapids/spark-rapids.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index f80af0682..31e14a469 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -526,7 +526,8 @@ function download_agent(){
   mkdir -p /opt/google
   chmod 777 /opt/google
   cd /opt/google
-  execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
 }
 
 function install_agent_dependency(){
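Patches 05 and 06 apply the same idempotency guard to both git clone call sites, so a second pass of the init action does not fail on an already-present checkout. Below is a sketch of that pattern factored into a helper; clone_if_absent is a hypothetical name and not part of spark-rapids.sh.

```bash
# Guard pattern from patches 05/06: skip the clone when the target directory
# already exists (for example on a re-run of the init action).
function clone_if_absent() {
  local -r repo_url="$1" dest_dir="$2"; shift 2
  test -d "${dest_dir}" || git clone "$@" "${repo_url}" "${dest_dir}"
}

clone_if_absent https://github.com/NVIDIA/open-gpu-kernel-modules.git \
  open-gpu-kernel-modules --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
```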
From 57d95f2d6df266f8a65c5b6ba02472a276c5ac91 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 19:13:45 -0800
Subject: [PATCH 07/10] tricking the test framework into only running our tests while using new Dockerfile

---
 cloudbuild/Dockerfile   | 22 +++++++++++++++-------
 cloudbuild/presubmit.sh |  1 +
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index 94e6e6cb3..aebaffd84 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -9,16 +9,24 @@ COPY --chown=ia-tests:ia-tests . /init-actions
 
 # Install Bazel:
 # https://docs.bazel.build/versions/master/install-ubuntu.html
-ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg
+ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg \
+    bazel_version=7.4.0 \
+    bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" \
+    bazel_repo_file="/etc/apt/sources.list.d/bazel.list" \
+    DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get install -y -qq curl >/dev/null 2>&1 && \
     apt-get clean
-RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | \
-    gpg --dearmor -o "${bazel_kr_path}"
-RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | \
-    dd of=/etc/apt/sources.list.d/bazel.list status=none && \
+RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
+    gpg --import --no-default-keyring --keyring "${bazel_kr_path}" && \
+    echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \
+    dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
-RUN apt-get autoremove -y -qq && \
-    apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
     apt-get clean
+
+# Set bazel-${bazel_version} as the default bazel alternative in this container
+RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
+    update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
 
 USER ia-tests
diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index eec7adb76..d8b2bed17 100755
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue # remove before merge
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0

From ab836653744034ffd18b5c1f48c7c4682298708e Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 19:51:14 -0800
Subject: [PATCH 08/10] gathering timing data for some long-running sections of the installer

---
 spark-rapids/spark-rapids.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 31e14a469..0b4aabd57 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -261,7 +261,7 @@ IS_MIG_ENABLED=0
 function execute_with_retries() {
   local -r cmd=$1
   for ((i = 0; i < 10; i++)); do
-    if eval "$cmd"; then
+    if time eval "$cmd"; then
       return 0
     fi
     sleep 5
@@ -452,7 +452,7 @@ function install_nvidia_gpu_driver() {
     curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
       "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
       -o cuda.run
-    bash cuda.run --silent --toolkit --no-opengl-libs
+    time bash cuda.run --silent --toolkit --no-opengl-libs
     rm cuda.run
   else
     # Install from repo provided by NV
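Patch 08 wraps two long-running steps in bash's time keyword, so the real/user/sys figures for each retried command and for the CUDA runfile install end up in the init action's startup log. If a labelled, grep-able duration line is preferred, an explicit timer along the following lines would also work; log_duration is a hypothetical helper, not part of spark-rapids.sh.

```bash
# Hypothetical alternative to `time`: emit one labelled duration line per step.
function log_duration() {
  local -r label="$1"; shift
  local -r start="$(date +%s)"
  "$@"
  local -r rc=$?   # exit status of the wrapped command
  echo "TIMING ${label}: $(( $(date +%s) - start ))s (exit code ${rc})" >&2
  return ${rc}
}

log_duration cuda_runfile bash cuda.run --silent --toolkit --no-opengl-libs
```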
From 00caa8baecf75b9260f9826b1b640da0729d9c88 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 20:39:58 -0800
Subject: [PATCH 09/10] over-commitment on the disk space cleaned up

---
 spark-rapids/test_spark_rapids.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 7af8e3154..6e03f2d62 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
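Patch 09 trims the test clusters' boot disks from 1024GB to 50GB. If there is any doubt that 50GB leaves headroom once the driver, the CUDA toolkit and the Spark jars are installed, a quick check on a test cluster node settles it. A sketch only: the paths below are typical Dataproc locations, and the node name assumes the default "-w-0" worker suffix.

```bash
# Spot-check free space and the largest installs on a worker after the init action ran
# (add --zone if no default compute zone is configured).
gcloud compute ssh "${CLUSTER_NAME}-w-0" --command \
  "df -h / && sudo du -sh /usr/local/cuda* /usr/lib/spark/jars 2>/dev/null"
```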
From f5acb7ef88510686e83cab0db32feaef5c763fee Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Tue, 24 Dec 2024 20:55:42 -0800
Subject: [PATCH 10/10] revert to master for final squash+merge

---
 cloudbuild/Dockerfile   | 22 +++++++---------------
 cloudbuild/presubmit.sh |  1 -
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index aebaffd84..94e6e6cb3 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -9,24 +9,16 @@ COPY --chown=ia-tests:ia-tests . /init-actions
 
 # Install Bazel:
 # https://docs.bazel.build/versions/master/install-ubuntu.html
-ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg \
-    bazel_version=7.4.0 \
-    bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" \
-    bazel_repo_file="/etc/apt/sources.list.d/bazel.list" \
-    DEBIAN_FRONTEND=noninteractive
+ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg
 
 RUN apt-get install -y -qq curl >/dev/null 2>&1 && \
     apt-get clean
-RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
-    gpg --import --no-default-keyring --keyring "${bazel_kr_path}" && \
-    echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \
-    dd of="${bazel_repo_file}" status=none && \
+RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | \
+    gpg --dearmor -o "${bazel_kr_path}"
+RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | \
+    dd of=/etc/apt/sources.list.d/bazel.list status=none && \
     apt-get update -qq
-RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq && \
+    apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \
     apt-get clean
-
-# Set bazel-${bazel_version} as the default bazel alternative in this container
-RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
-    update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
 
 USER ia-tests
diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index d8b2bed17..eec7adb76 100755
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,7 +70,6 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
-      continue # remove before merge
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
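Patch 10 puts cloudbuild/Dockerfile and cloudbuild/presubmit.sh back to their pre-series content, so after the final squash+merge only the spark-rapids/ changes remain. A quick pre-merge check along these lines confirms that; it is a sketch and assumes the usual origin/master naming, which may differ.

```bash
# Confirm nothing outside spark-rapids/ still differs from master before merging.
git fetch origin master
git diff --stat origin/master -- cloudbuild/          # expect empty output
git diff --stat origin/master -- . ':!spark-rapids'   # expect empty output
```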