Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 137 additions & 60 deletions src/job-exporter/build/job-exporter.common.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,75 +16,152 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04
############################
# builder: only for compiling python wheels
############################
FROM ubuntu:22.04 AS builder

ARG TARGETARCH
# Register the ROCM package repository, and install rocm-dev package
ARG ROCM_VERSION=6.2.2
ARG AMDGPU_VERSION=6.2.2

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
automake \
bash \
build-essential \
cmake \
curl \
file \
g++ \
git \
gnupg \
ibverbs-utils \
kmod \
libc++-dev \
libcap-dev \
libelf1 \
libgflags-dev \
libgtest-dev \
libnuma-dev \
libtool \
numactl \
pkg-config \
python3-dev \
python3-pip \
sudo \
unzip && \
if [ "$TARGETARCH" = "amd64" ]; then \
printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" | tee /etc/apt/preferences.d/rocm-pin-600 && \
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \
echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-dev; \
fi
# Toolchain for the wheel-building stage: pip, the Python headers and a C
# compiler. The apt package index is deleted again within the same layer.
RUN set -eux; \
    export DEBIAN_FRONTEND=noninteractive; \
    apt-get update; \
    apt-get install --yes --no-install-recommends \
        build-essential \
        ca-certificates \
        gcc \
        python3-dev \
        python3-pip; \
    rm -rf /var/lib/apt/lists/*

COPY src/Moneo /Moneo
WORKDIR /w

# Install RDC
RUN if [ "$TARGETARCH" = "amd64" ]; then sudo bash Moneo/src/worker/install/amd.sh; fi
# Build wheels once: everything listed in requirements.txt plus the three
# extra packages named below ends up as a wheel file under /w/wheels.
COPY requirements.txt /w/requirements.txt
RUN python3 -m pip install --no-cache-dir --upgrade pip wheel \
 && python3 -m pip wheel \
        --no-cache-dir \
        --wheel-dir=/w/wheels \
        --requirement /w/requirements.txt \
        prometheus_client psutil filelock

# Install DCGM
RUN sed -i 's/systemctl --now enable nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \
sed -i 's/systemctl start nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \
sudo bash Moneo/src/worker/install/nvidia.sh

ENV PATH="${PATH}:/opt/rocm/bin"
COPY build/moneo-*-exporter_entrypoint.sh ./
COPY build/update-dcgm.py .
############################
# nerdctl-builder: build nerdctl from source
############################
FROM golang:1.25.6 AS nerdctl-builder

ARG TARGETARCH
ARG NERDCTL_VERSION=2.2.1

WORKDIR /build

# For the job exporter
ENV NERDCTL_VERSION=2.1.3
RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates
RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz && \
mkdir -p /tmp/nerdctl && \
tar -xzvf /tmp/nerdctl.tar.gz -C /tmp/nerdctl && \
mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl && \
mkdir -p /job_exporter && \
rm -rf /tmp/nerdctl*
# Clone the nerdctl sources at tag v${NERDCTL_VERSION} (shallow, into the
# current working directory), build them with the project's Makefile and
# stage the resulting binary under /opt/nerdctl.
RUN set -eux; \
    git clone \
        --branch "v${NERDCTL_VERSION}" \
        --depth 1 \
        https://github.com/containerd/nerdctl.git \
        .; \
    make binaries; \
    mkdir -p /opt/nerdctl; \
    cp _output/nerdctl /opt/nerdctl/nerdctl; \
    chmod +x /opt/nerdctl/nerdctl

COPY requirements.txt /job_exporter/
RUN pip3 install -r /job_exporter/requirements.txt

RUN apt update && apt upgrade -y && apt-get clean && rm -rf /var/lib/apt/lists/*
############################
# runtime: minimal CUDA base with only essential components
############################
FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-base-ubuntu22.04

ARG TARGETARCH
ARG ROCM_VERSION=6.2.2
ARG AMDGPU_VERSION=6.2.2
ARG DCGM_TARGET_VERSION=1:4.4.1-1

# --------------------------
# Install all components in single layer for size optimization
# --------------------------
RUN set -eux; \
# Base setup
apt-get update; \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
bash \
ca-certificates \
curl \
gnupg \
python3 \
kmod; \
# ROCm (runtime only) for AMD GPUs
if [ "$TARGETARCH" = "amd64" ]; then \
printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" \
> /etc/apt/preferences.d/rocm-pin-600; \
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -; \
echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" \
> /etc/apt/sources.list.d/rocm.list; \
echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" \
Comment on lines +91 to +94
Copy link

Copilot AI Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`apt-key add` is deprecated and will be removed in a future Ubuntu release. Prefer storing the repository's GPG key under `/etc/apt/trusted.gpg.d/` or `/usr/share/keyrings/` and referencing it with the `signed-by` option in the `sources.list` entry. For example, run `curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /usr/share/keyrings/rocm-archive-keyring.gpg`, then add `[signed-by=/usr/share/keyrings/rocm-archive-keyring.gpg]` to the `deb` line.

Suggested change
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -; \
echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" \
> /etc/apt/sources.list.d/rocm.list; \
echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" \
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /usr/share/keyrings/rocm-archive-keyring.gpg; \
echo "deb [signed-by=/usr/share/keyrings/rocm-archive-keyring.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" \
> /etc/apt/sources.list.d/rocm.list; \
echo "deb [signed-by=/usr/share/keyrings/rocm-archive-keyring.gpg] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" \

Copilot uses AI. Check for mistakes.
> /etc/apt/sources.list.d/amdgpu.list; \
apt-get update; \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc; \
fi; \
# DCGM for GPU monitoring (NVIDIA)
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \
datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \
datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION}; \
# Clean up everything in single layer
apt-get remove -y curl gnupg; \
apt-get autoremove -y; \
apt-get clean; \
rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/*

# --------------------------
# nerdctl (copy from nerdctl-builder)
# --------------------------
# Take the nerdctl binary staged by the nerdctl-builder stage and put it on
# the default PATH of the runtime image.
COPY --from=nerdctl-builder /opt/nerdctl/nerdctl /usr/local/bin/nerdctl

# --------------------------
# python runtime deps (from wheels)
# --------------------------

# Wheel directory from the builder stage; it serves as the local package
# source for the pip install that follows.
COPY --from=builder /w/wheels /wheels
# Requirements list from the build context, consumed by that same install.
COPY requirements.txt /job_exporter/requirements.txt

# Install the Python dependencies offline from /wheels, then remove pip and
# the wheel directory again within the same layer.
#
# Every step is a separate `;`-terminated command so that `set -e` aborts the
# build as soon as one of them fails. Chaining the pip commands with `&&` into
# the `apt-mark ... || true` guard would let that `|| true` absorb a pip
# failure, and the build would succeed without the Python packages installed.
RUN set -eux; \
    apt-get update; \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip; \
    python3 -m pip install --no-cache-dir -U pip; \
    python3 -m pip install --no-cache-dir \
        --no-index --find-links=/wheels \
        -r /job_exporter/requirements.txt; \
    python3 -m pip install --no-cache-dir \
        --no-index --find-links=/wheels \
        prometheus_client psutil filelock; \
    # Mark important packages as manual to prevent removal by autoremove.
    # Best effort only: errors from apt-mark are deliberately ignored.
    apt-mark manual rdc amd-smi-lib 2>/dev/null || true; \
    apt-get remove -y python3-pip; \
    # Set environment variable to allow sudo removal during autoremove
    SUDO_FORCE_REMOVE=yes apt-get autoremove -y; \
    apt-get clean; \
    rm -rf /wheels /root/.cache /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/*

# --------------------------
# app files
# --------------------------
# Moneo sources go to /Moneo; the Python modules directly under src/ go to
# /job_exporter/.
COPY src/Moneo /Moneo
COPY src/*.py /job_exporter/
# Entrypoint scripts matching the glob are copied into the stage's current
# working directory.
# NOTE(review): no WORKDIR is visible for this stage in this view, so `./`
# presumably resolves to / - confirm against the full Dockerfile.
COPY build/moneo-*-exporter_entrypoint.sh ./

# --------------------------
# Final cleanup: remove unnecessary CUDA files to reduce image size
# --------------------------
# Best-effort removal of files the exporter does not need at runtime. Every
# command is guarded with `|| true`, so a missing path never fails the build.
# NOTE(review): files that originate from the base image or an earlier layer
# still occupy space in those layers; deleting them here only hides them from
# the final filesystem - confirm this step really reduces the pushed size.
RUN set -eux; \
    cuda_root=/usr/local/cuda-12.0; \
    # Static archives and libtool files (only shared libraries are needed)
    find "$cuda_root" \( -name "*.a" -o -name "*.la" \) -delete 2>/dev/null || true; \
    # CUDA development tools, documentation and samples
    rm -rf \
        "$cuda_root"/nsight* \
        "$cuda_root"/libnvvp \
        "$cuda_root"/doc \
        "$cuda_root"/samples \
        "$cuda_root"/extras \
        2>/dev/null || true; \
    # System documentation, man pages and info pages
    rm -rf /usr/share/doc/* /usr/share/man/* /usr/share/info/* 2>/dev/null || true; \
    # Caches and temporary files
    rm -rf /var/cache/* /tmp/* /var/tmp/* 2>/dev/null || true
1 change: 0 additions & 1 deletion src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ if lsmod | grep -qi amdgpu; then
echo "AMD Exporter Started!"
elif lsmod | grep -qi nvidia; then
echo "NVIDIA Graphics card detected."
python3 /update-dcgm.py
# Launches NVIDIA DCGM Daemon
nohup nv-hostengine &
echo "DCGM Daemon Started!"
Expand Down
117 changes: 0 additions & 117 deletions src/job-exporter/build/update-dcgm.py

This file was deleted.

Loading
Loading