From f9828325120d994ba9136fcec3584887b852e8ff Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Sat, 7 Feb 2026 16:45:28 +0800 Subject: [PATCH 1/3] nit --- .github/workflows/build_and_test.yml | 4 +-- .../python-312-classic-only/Dockerfile | 23 ++++++++-------- .../python-312-pandas-3/Dockerfile | 26 +++++++++---------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 97cb5e922782a..9c3971c6f0e07 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -41,14 +41,14 @@ on: description: Additional environment variables to set when running the tests. Should be in JSON format. required: false type: string - default: '{"PYSPARK_IMAGE_TO_TEST": "python-312", "PYTHON_TO_TEST": "python3.12"}' + default: '{"PYSPARK_IMAGE_TO_TEST": "python-312-classic-only", "PYTHON_TO_TEST": "python3.12"}' jobs: description: >- Jobs to run, and should be in JSON format. The values should be matched with the job's key defined in this file, e.g., build. See precondition job below. required: false type: string - default: '' + default: '{"pyspark": "true", "pyspark-pandas": "true"}' secrets: codecov_token: description: The upload token of codecov. diff --git a/dev/spark-test-image/python-312-classic-only/Dockerfile b/dev/spark-test-image/python-312-classic-only/Dockerfile index 685f4e80315c6..d7fc4cfb2b5a9 100644 --- a/dev/spark-test-image/python-312-classic-only/Dockerfile +++ b/dev/spark-test-image/python-312-classic-only/Dockerfile @@ -15,16 +15,16 @@ # limitations under the License. # -# Image for building and testing Spark branches. Based on Ubuntu 22.04. +# Image for building and testing Spark branches. Based on Ubuntu 24.04. 
# See also in https://hub.docker.com/_/ubuntu -FROM ubuntu:jammy-20240911.1 +FROM ubuntu:noble LABEL org.opencontainers.image.authors="Apache Spark project " LABEL org.opencontainers.image.licenses="Apache-2.0" LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark Classic with Python 3.12" # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE=20260203 +ENV FULL_REFRESH_DATE=20260207 ENV DEBIAN_FRONTEND=noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN=true @@ -41,26 +41,27 @@ RUN apt-get update && apt-get install -y \ libopenblas-dev \ libssl-dev \ openjdk-17-jdk-headless \ + python3.12 \ pkg-config \ tzdata \ software-properties-common \ - zlib1g-dev - -# Install Python 3.12 -RUN add-apt-repository ppa:deadsnakes/ppa -RUN apt-get update && apt-get install -y \ - python3.12 \ + zlib1g-dev \ && apt-get autoremove --purge -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +# Setup virtual environment +ENV VIRTUAL_ENV=/opt/spark-venv +RUN python3.12 -m venv --without-pip $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# Install Python 3.12 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 pandas==2.3.3 plotly<6.0.0 matplotlib openpyxl memory-profiler>=0.61.0 mlflow>=2.8.1 scipy scikit-learn>=1.3.2 pystack>=1.6.0 psutil" ARG TEST_PIP_PKGS="coverage unittest-xml-reporting" -# Install Python 3.12 packages RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 -RUN python3.12 -m pip install --ignore-installed 'blinker>=1.6.2' # mlflow needs this RUN python3.12 -m pip install $BASIC_PIP_PKGS $TEST_PIP_PKGS && \ python3.12 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ python3.12 -m pip install deepspeed torcheval && \ diff --git a/dev/spark-test-image/python-312-pandas-3/Dockerfile b/dev/spark-test-image/python-312-pandas-3/Dockerfile index 
6b2d61be529e5..a310709b1a070 100644 --- a/dev/spark-test-image/python-312-pandas-3/Dockerfile +++ b/dev/spark-test-image/python-312-pandas-3/Dockerfile @@ -18,16 +18,16 @@ # Note this is a temporary image file for development with Pandas 3, # and will be removed after PySpark is fully compatible with Pandas 3. -# Image for building and testing Spark branches. Based on Ubuntu 22.04. +# Image for building and testing Spark branches. Based on Ubuntu 24.04. # See also in https://hub.docker.com/_/ubuntu -FROM ubuntu:jammy-20240911.1 +FROM ubuntu:noble LABEL org.opencontainers.image.authors="Apache Spark project " LABEL org.opencontainers.image.licenses="Apache-2.0" LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.12 and Pandas 3" # Overwrite this label to avoid exposing the underlying Ubuntu OS version label LABEL org.opencontainers.image.version="" -ENV FULL_REFRESH_DATE=20260127 +ENV FULL_REFRESH_DATE=20260207 ENV DEBIAN_FRONTEND=noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN=true @@ -44,27 +44,27 @@ RUN apt-get update && apt-get install -y \ libopenblas-dev \ libssl-dev \ openjdk-17-jdk-headless \ + python3.12 \ pkg-config \ tzdata \ software-properties-common \ - zlib1g-dev - -# Install Python 3.12 -RUN add-apt-repository ppa:deadsnakes/ppa -RUN apt-get update && apt-get install -y \ - python3.12 \ + zlib1g-dev \ && apt-get autoremove --purge -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +# Setup virtual environment +ENV VIRTUAL_ENV=/opt/spark-venv +RUN python3.12 -m venv --without-pip $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# Install Python 3.12 packages # Note that mlflow is excluded since it requires pandas<3 +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 + ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 six==1.16.0 pandas>=3 scipy plotly<6.0.0 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" -# Python deps for Spark Connect ARG CONNECT_PIP_PKGS="grpcio==1.76.0 
grpcio-status==1.76.0 protobuf==6.33.5 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20.3" -# Install Python 3.12 packages -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 -# RUN python3.12 -m pip install --ignore-installed 'blinker>=1.6.2' # mlflow needs this RUN python3.12 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS lxml && \ python3.12 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ python3.12 -m pip install torcheval && \ From 6ab5004365ceee446d90187a2cca1def800a4200 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Sat, 7 Feb 2026 18:49:37 +0800 Subject: [PATCH 2/3] test pandas 3 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9c3971c6f0e07..697d8e25f6f51 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -41,7 +41,7 @@ on: description: Additional environment variables to set when running the tests. Should be in JSON format. required: false type: string - default: '{"PYSPARK_IMAGE_TO_TEST": "python-312-classic-only", "PYTHON_TO_TEST": "python3.12"}' + default: '{"PYSPARK_IMAGE_TO_TEST": "python-312-pandas-3", "PYTHON_TO_TEST": "python3.12"}' jobs: description: >- Jobs to run, and should be in JSON format. 
The values should be matched with the job's key defined From a2ab30bf1748e7a17db11478d6595d9df8285932 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Sun, 8 Feb 2026 08:34:20 +0800 Subject: [PATCH 3/3] restore pr builder --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 697d8e25f6f51..97cb5e922782a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -41,14 +41,14 @@ on: description: Additional environment variables to set when running the tests. Should be in JSON format. required: false type: string - default: '{"PYSPARK_IMAGE_TO_TEST": "python-312-pandas-3", "PYTHON_TO_TEST": "python3.12"}' + default: '{"PYSPARK_IMAGE_TO_TEST": "python-312", "PYTHON_TO_TEST": "python3.12"}' jobs: description: >- Jobs to run, and should be in JSON format. The values should be matched with the job's key defined in this file, e.g., build. See precondition job below. required: false type: string - default: '{"pyspark": "true", "pyspark-pandas": "true"}' + default: '' secrets: codecov_token: description: The upload token of codecov.