diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 97cb5e922782a..9dcb2f2d31358 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -41,14 +41,14 @@ on:
         description: Additional environment variables to set when running the tests. Should be in JSON format.
         required: false
         type: string
-        default: '{"PYSPARK_IMAGE_TO_TEST": "python-312", "PYTHON_TO_TEST": "python3.12"}'
+        default: '{"PYSPARK_IMAGE_TO_TEST": "python-minimum", "PYTHON_TO_TEST": "python3.10"}'
       jobs:
         description: >-
           Jobs to run, and should be in JSON format. The values should be matched with the job's key defined
           in this file, e.g., build. See precondition job below.
         required: false
         type: string
-        default: ''
+        default: '{"pyspark": "true", "pyspark-pandas": "true"}'
     secrets:
       codecov_token:
         description: The upload token of codecov.
diff --git a/dev/spark-test-image/python-minimum/Dockerfile b/dev/spark-test-image/python-minimum/Dockerfile
index 4f671c2229d52..011618a36d3f9 100644
--- a/dev/spark-test-image/python-minimum/Dockerfile
+++ b/dev/spark-test-image/python-minimum/Dockerfile
@@ -15,16 +15,16 @@
 # limitations under the License.
 #
 
-# Image for building and testing Spark branches. Based on Ubuntu 22.04.
+# Image for building and testing Spark branches. Based on Ubuntu 24.04.
 # See also in https://hub.docker.com/_/ubuntu
-FROM ubuntu:jammy-20240911.1
+FROM ubuntu:noble
 LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
 LABEL org.opencontainers.image.licenses="Apache-2.0"
 LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with old dependencies"
 # Overwrite this label to avoid exposing the underlying Ubuntu OS version label
 LABEL org.opencontainers.image.version=""
 
-ENV FULL_REFRESH_DATE=20260127
+ENV FULL_REFRESH_DATE=20260206
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -43,20 +43,28 @@ RUN apt-get update && apt-get install -y \
     libssl-dev \
     openjdk-17-jdk-headless \
     pkg-config \
-    python3.10 \
-    python3-psutil \
     tzdata \
     software-properties-common \
-    zlib1g-dev \
+    zlib1g-dev
+
+# Install Python 3.10
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y \
+    python3.10 \
     && apt-get autoremove --purge -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==18.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting"
-# Python deps for Spark Connect
-ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20 protobuf==6.33.5"
+# Setup virtual environment
+ENV VIRTUAL_ENV=/opt/spark-venv
+RUN python3.10 -m venv --without-pip $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # Install Python 3.10 packages
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+
+ARG BASIC_PIP_PKGS="numpy==1.22.4 pyarrow==18.0.0 pandas==2.2.0 six==1.16.0 scipy scikit-learn coverage unittest-xml-reporting psutil"
+ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20 protobuf==6.33.5"
+
 RUN python3.10 -m pip install --force $BASIC_PIP_PKGS $CONNECT_PIP_PKGS && \
     python3.10 -m pip cache purge
diff --git a/dev/spark-test-image/python-ps-minimum/Dockerfile b/dev/spark-test-image/python-ps-minimum/Dockerfile
index ca1bd092a2fa8..5daecd379498e 100644
--- a/dev/spark-test-image/python-ps-minimum/Dockerfile
+++ b/dev/spark-test-image/python-ps-minimum/Dockerfile
@@ -15,16 +15,16 @@
 # limitations under the License.
 #
 
-# Image for building and testing Spark branches. Based on Ubuntu 22.04.
+# Image for building and testing Spark branches. Based on Ubuntu 24.04.
 # See also in https://hub.docker.com/_/ubuntu
-FROM ubuntu:jammy-20240911.1
+FROM ubuntu:noble
 LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
 LABEL org.opencontainers.image.licenses="Apache-2.0"
 LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For Pandas API on Spark with old dependencies"
 # Overwrite this label to avoid exposing the underlying Ubuntu OS version label
 LABEL org.opencontainers.image.version=""
 
-ENV FULL_REFRESH_DATE=20260127
+ENV FULL_REFRESH_DATE=20260206
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -43,21 +43,28 @@ RUN apt-get update && apt-get install -y \
     libssl-dev \
     openjdk-17-jdk-headless \
     pkg-config \
-    python3.10 \
-    python3-psutil \
     tzdata \
     software-properties-common \
-    zlib1g-dev \
+    zlib1g-dev
+
+# Install Python 3.10
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y \
+    python3.10 \
     && apt-get autoremove --purge -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-
-ARG BASIC_PIP_PKGS="pyarrow==18.0.0 pandas==2.2.0 six==1.16.0 numpy scipy coverage unittest-xml-reporting"
-# Python deps for Spark Connect
-ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20 protobuf==6.33.5"
+# Setup virtual environment
+ENV VIRTUAL_ENV=/opt/spark-venv
+RUN python3.10 -m venv --without-pip $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # Install Python 3.10 packages
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+
+ARG BASIC_PIP_PKGS="pyarrow==18.0.0 pandas==2.2.0 six==1.16.0 numpy scipy coverage unittest-xml-reporting psutil"
+ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20 protobuf==6.33.5"
+
 RUN python3.10 -m pip install --force $BASIC_PIP_PKGS $CONNECT_PIP_PKGS && \
     python3.10 -m pip cache purge
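Note on the virtual-environment change: Ubuntu 24.04 (noble) ships Python 3.12 and marks the system interpreter as externally managed (PEP 668), so Python 3.10 now comes from the deadsnakes PPA and test dependencies are pip-installed into a dedicated venv rather than system site-packages; this is also why apt's python3-psutil is replaced by pip's psutil. The venv is created --without-pip and pip is bootstrapped from get-pip.py afterwards; because ENV PATH puts $VIRTUAL_ENV/bin first, the later python3.10 invocations resolve to the venv interpreter. A minimal local sanity check for the rebuilt images, run from the repo root (a sketch; the image tags are illustrative and not part of the CI config):

    # Build both minimum-dependency test images
    docker build -t pyspark-python-minimum dev/spark-test-image/python-minimum/
    docker build -t pyspark-python-ps-minimum dev/spark-test-image/python-ps-minimum/

    # Confirm python3.10 resolves inside /opt/spark-venv and that the
    # pip-installed psutil (replacing apt's python3-psutil) is importable
    docker run --rm pyspark-python-minimum python3.10 -c \
        "import sys, psutil, pyarrow; print(sys.prefix, psutil.__version__, pyarrow.__version__)"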