Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions dev/spark-test-image/python-312-classic-only/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@
# limitations under the License.
#

# Image for building and testing Spark branches. Based on Ubuntu 22.04.
# Image for building and testing Spark branches. Based on Ubuntu 24.04.
# See also https://hub.docker.com/_/ubuntu
FROM ubuntu:jammy-20240911.1
FROM ubuntu:noble
LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
LABEL org.opencontainers.image.licenses="Apache-2.0"
LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark Classic with Python 3.12"
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260207

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand All @@ -41,26 +41,27 @@ RUN apt-get update && apt-get install -y \
libopenblas-dev \
libssl-dev \
openjdk-17-jdk-headless \
python3.12 \
pkg-config \
tzdata \
software-properties-common \
zlib1g-dev

# Install Python 3.12
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.12 \
zlib1g-dev \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Set up the virtual environment at /opt/spark-venv.
# The venv is created with --without-pip, so it contains no pip at this point;
# pip is installed separately via get-pip.py elsewhere in this file.
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.12 -m venv --without-pip $VIRTUAL_ENV
# Prepend the venv's bin directory so its executables are found first on PATH.
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.12 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 pandas==2.3.3 plotly<6.0.0 matplotlib openpyxl memory-profiler>=0.61.0 mlflow>=2.8.1 scipy scikit-learn>=1.3.2 pystack>=1.6.0 psutil"
ARG TEST_PIP_PKGS="coverage unittest-xml-reporting"

# Install Python 3.12 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
RUN python3.12 -m pip install --ignore-installed 'blinker>=1.6.2' # mlflow needs this
RUN python3.12 -m pip install $BASIC_PIP_PKGS $TEST_PIP_PKGS && \
python3.12 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
python3.12 -m pip install deepspeed torcheval && \
Expand Down
26 changes: 13 additions & 13 deletions dev/spark-test-image/python-312-pandas-3/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@
# Note this is a temporary image file for development with Pandas 3,
# and will be removed after PySpark is fully compatible with Pandas 3.

# Image for building and testing Spark branches. Based on Ubuntu 22.04.
# Image for building and testing Spark branches. Based on Ubuntu 24.04.
# See also https://hub.docker.com/_/ubuntu
FROM ubuntu:jammy-20240911.1
FROM ubuntu:noble
LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
LABEL org.opencontainers.image.licenses="Apache-2.0"
LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.12 and Pandas 3"
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260127
ENV FULL_REFRESH_DATE=20260207

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand All @@ -44,27 +44,27 @@ RUN apt-get update && apt-get install -y \
libopenblas-dev \
libssl-dev \
openjdk-17-jdk-headless \
python3.12 \
pkg-config \
tzdata \
software-properties-common \
zlib1g-dev

# Install Python 3.12
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.12 \
zlib1g-dev \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Set up the virtual environment at /opt/spark-venv.
# The venv is created with --without-pip, so it contains no pip at this point;
# pip is installed separately via get-pip.py elsewhere in this file.
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.12 -m venv --without-pip $VIRTUAL_ENV
# Prepend the venv's bin directory so its executables are found first on PATH.
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.12 packages
# Note that mlflow is excluded since it requires pandas<3
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 six==1.16.0 pandas>=3 scipy plotly<6.0.0 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
# Python deps for Spark Connect
ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 protobuf==6.33.5 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20.3"

# Install Python 3.12 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
# RUN python3.12 -m pip install --ignore-installed 'blinker>=1.6.2' # mlflow needs this
RUN python3.12 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS lxml && \
python3.12 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
python3.12 -m pip install torcheval && \
Expand Down