From 35ed516a310cf6062636f2329289468bb450a002 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Tue, 20 Jan 2026 01:50:28 +0000 Subject: [PATCH 1/9] update job exporter docker file to remove all the build step for rocm and simplify the build steps --- .../build/job-exporter.common.dockerfile | 63 ++++------ .../build/moneo-gpu-exporter_entrypoint.sh | 1 - src/job-exporter/build/update-dcgm.py | 117 ------------------ .../src/Moneo/src/worker/install/amd.sh | 60 --------- .../src/Moneo/src/worker/install/common.sh | 20 --- .../src/Moneo/src/worker/install/nvidia.sh | 86 ------------- 6 files changed, 24 insertions(+), 323 deletions(-) delete mode 100644 src/job-exporter/build/update-dcgm.py delete mode 100644 src/job-exporter/src/Moneo/src/worker/install/amd.sh delete mode 100644 src/job-exporter/src/Moneo/src/worker/install/common.sh delete mode 100755 src/job-exporter/src/Moneo/src/worker/install/nvidia.sh diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index 3997304d..1004495c 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -19,62 +19,34 @@ FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04 ARG TARGETARCH -# Register the ROCM package repository, and install rocm-dev package ARG ROCM_VERSION=6.2.2 ARG AMDGPU_VERSION=6.2.2 +ARG DCGM_TARGET_VERSION = "1:4.4.1-1" RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - autoconf \ - automake \ bash \ - build-essential \ - cmake \ curl \ - file \ - g++ \ - git \ gnupg \ - ibverbs-utils \ - kmod \ - libc++-dev \ - libcap-dev \ - libelf1 \ - libgflags-dev \ - libgtest-dev \ - libnuma-dev \ - libtool \ - numactl \ - pkg-config \ - python3-dev \ + wget \ + ca-certificates \ python3-pip \ - sudo \ - unzip && \ + python3-dev \ + sudo && \ if [ "$TARGETARCH" = "amd64" ]; then \ printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" | tee /etc/apt/preferences.d/rocm-pin-600 && \ curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \ echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \ apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-dev; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-dev rdc; \ fi -COPY src/Moneo /Moneo - -# Install RDC -RUN if [ "$TARGETARCH" = "amd64" ]; then sudo bash Moneo/src/worker/install/amd.sh; fi - -# Install DCGM -RUN sed -i 's/systemctl --now enable nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \ - sed -i 's/systemctl start nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \ - sudo bash Moneo/src/worker/install/nvidia.sh - -ENV PATH="${PATH}:/opt/rocm/bin" -COPY build/moneo-*-exporter_entrypoint.sh ./ -COPY build/update-dcgm.py . +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \ + datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \ + datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \ + datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION} -# For the job exporter -ENV NERDCTL_VERSION=2.1.3 -RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates +ENV NERDCTL_VERSION=2.2.1 RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz && \ mkdir -p /tmp/nerdctl && \ tar -xzvf /tmp/nerdctl.tar.gz -C /tmp/nerdctl && \ @@ -82,6 +54,19 @@ RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/d mkdir -p /job_exporter && \ rm -rf /tmp/nerdctl* +RUN python3 -m pip install prometheus_client psutil filelock + +COPY src/Moneo /Moneo + +ENV PATH="${PATH}:/opt/rocm/bin" +COPY build/moneo-*-exporter_entrypoint.sh ./ + +RUN [ -d /opt/rocm/lib ] && echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf; \ + [ -d /opt/rocm/rdc/lib ] && echo "/opt/rocm/rdc/lib" >> /etc/ld.so.conf.d/rocm.conf; \ + [ -d /opt/rocm/llvm/lib ] && echo "/opt/rocm/llvm/lib" >> /etc/ld.so.conf.d/rocm.conf + +RUN ldconfig + COPY requirements.txt /job_exporter/ RUN pip3 install -r /job_exporter/requirements.txt diff --git a/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh b/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh index 9798fd96..e7006354 100755 --- a/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh +++ b/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh @@ -9,7 +9,6 @@ if lsmod | grep -qi amdgpu; then echo "AMD Exporter Started!" elif lsmod | grep -qi nvidia; then echo "NVIDIA Graphics card detected." - python3 /update-dcgm.py # Launches NVIDIA DCGM Daemon nohup nv-hostengine & echo "DCGM Daemon Started!" diff --git a/src/job-exporter/build/update-dcgm.py b/src/job-exporter/build/update-dcgm.py deleted file mode 100644 index 8eef6d2c..00000000 --- a/src/job-exporter/build/update-dcgm.py +++ /dev/null @@ -1,117 +0,0 @@ -import subprocess -import sys -import re -import fileinput - -#!/usr/bin/env python3 - -DCGM_TARGET_VERSION = "1:4.4.1-1" - -def get_dcgm_version(): - try: - result = subprocess.run( - ["dpkg", "--list", "datacenter-gpu-manager"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - check=True - ) - for line in result.stdout.splitlines(): - if line.startswith("ii") and "datacenter-gpu-manager" in line: - # Example line: ii datacenter-gpu-manager 1.2.3-1 amd64 NVIDIA datacenter GPU management tools - parts = re.split(r'\s+', line) - if len(parts) >= 3: - return parts[2] - print("datacenter-gpu-manager is not installed.", file=sys.stderr) - sys.exit(0) - except subprocess.CalledProcessError as e: - print("Error running dpkg:", e, file=sys.stderr) - sys.exit(0) - - -def get_cuda_version(): - try: - result = subprocess.run( - ["nvidia-smi"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - check=True - ) - for line in result.stdout.splitlines(): - match = re.search(r"CUDA Version:\s*([\d\.]+)", line) - if match: - return match.group(1) - print("CUDA version not found in nvidia-smi output.", file=sys.stderr) - sys.exit(0) - except subprocess.CalledProcessError as e: - print("Error running nvidia-smi:", e, file=sys.stderr) - sys.exit(0) - - -def remove_dcgm(): - try: - subprocess.run( - ["apt", "purge", "--yes", "datacenter-gpu-manager"], - check=True - ) - subprocess.run( - ["apt", "purge", "--yes", "datacenter-gpu-manager-config"], - check=True - ) - print("datacenter-gpu-manager and its config have been removed.") - except subprocess.CalledProcessError as e: - print("Error removing datacenter-gpu-manager:", e, file=sys.stderr) - - - -def install_latest_dcgm(): - try: - subprocess.run( - ["apt", "update"], - check=True - ) - subprocess.run( - [ - "apt-get", "install", "--yes", - f"datacenter-gpu-manager-4-cuda12={DCGM_TARGET_VERSION}", - f"datacenter-gpu-manager-4-core={DCGM_TARGET_VERSION}", - f"datacenter-gpu-manager-4-proprietary-cuda12={DCGM_TARGET_VERSION}" - ], - check=True - ) - print("Latest datacenter-gpu-manager-4-cuda12 has been installed.") - except subprocess.CalledProcessError as e: - print("Error installing datacenter-gpu-manager-4-cuda12:", e, file=sys.stderr) - sys.exit(1) - - -def update_nvidia_exporter_path(file_path): - - old_line = "sys.path.append('/usr/local/dcgm/bindings/python3')" - new_line = "sys.path.append('/usr/share/datacenter-gpu-manager-4/bindings/python3')" - - replaced = False - for line in fileinput.input(file_path, inplace=True): - if old_line in line: - print(line.replace(old_line, new_line), end='') - replaced = True - else: - print(line, end='') - if replaced: - print(f"Updated sys.path in {file_path}") - else: - print(f"No matching sys.path line found in {file_path}") - -if __name__ == "__main__": - version = get_dcgm_version() - print(f"Current DCGM version: {version}") - cuda_version = get_cuda_version() - print(f"Current CUDA version: {cuda_version}") - - if version.startswith("1:3.") and float(cuda_version) >= 12.8: - remove_dcgm() - install_latest_dcgm() - update_nvidia_exporter_path("/Moneo/src/worker/exporters/nvidia_exporter.py") - else: - print("no dcgm update") \ No newline at end of file diff --git a/src/job-exporter/src/Moneo/src/worker/install/amd.sh b/src/job-exporter/src/Moneo/src/worker/install/amd.sh deleted file mode 100644 index cc39c286..00000000 --- a/src/job-exporter/src/Moneo/src/worker/install/amd.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -set -e - -# install dependencies -source ./$(dirname "${BASH_SOURCE[0]}")/common.sh -apt-get install -y automake make g++ unzip build-essential autoconf libtool pkg-config libgflags-dev libgtest-dev libc++-dev curl libcap-dev - -# install grpc -export GRPC_ROOT=/opt/grpc - -# Check if the directory exists and is not empty -if [ -d "$GRPC_ROOT" ] && [ "$(ls -A $GRPC_ROOT)" ]; then - cd "$GRPC_ROOT" - git pull -else - git clone -b v1.61.0 https://github.com/grpc/grpc --depth=1 --shallow-submodules --recurse-submodules "$GRPC_ROOT" - cd "$GRPC_ROOT" -fi -cmake -B build \ - -DgRPC_INSTALL=ON \ - -DgRPC_BUILD_TESTS=OFF \ - -DBUILD_SHARED_LIBS=ON \ - -DCMAKE_INSTALL_PREFIX="$GRPC_ROOT" \ - -DCMAKE_INSTALL_LIBDIR=lib \ - -DCMAKE_BUILD_TYPE=Release -make -C build -j $(nproc) -make -C build install -echo "$GRPC_ROOT" | sudo tee /etc/ld.so.conf.d/grpc.conf - -# install rdc -export RDC_ROOT=/opt/rdc -# Check if the directory exists and is not empty -if [ -d "$RDC_ROOT" ] && [ "$(ls -A $RDC_ROOT)" ]; then - cd "$RDC_ROOT" - git pull -else - git clone --depth 1 --branch rocm-6.2.2 https://github.com/RadeonOpenCompute/rdc "$RDC_ROOT" - cd "$RDC_ROOT" -fi - -git fetch origin amd-staging -git config user.email "Moneo@local.host" -git config user.name "Moneo" -git cherry-pick 660c5afaf49630781c1059ba6d30bae21743c32f - -# default installation location is /opt/rocm, specify with -DROCM_DIR or -DCMAKE_INSTALL_PREFIX -cmake -B build -DGRPC_ROOT="$GRPC_ROOT" -DROCM_DIR="/opt/rocm" -DCMAKE_INSTALL_PREFIX="/opt/rocm" -make -C build -j $(nproc) -make -C build install - -# Update ldconfig -export RDC_LIB_DIR=/opt/rocm/lib/rdc -export GRPC_LIB_DIR=/opt/grpc/lib -echo -e "${GRPC_LIB_DIR}\n${GRPC_LIB_DIR}64" | sudo tee /etc/ld.so.conf.d/x86_64-librdc_client.conf -echo -e "${RDC_LIB_DIR}\n${RDC_LIB_DIR}64" | sudo tee -a /etc/ld.so.conf.d/x86_64-librdc_client.conf -ldconfig diff --git a/src/job-exporter/src/Moneo/src/worker/install/common.sh b/src/job-exporter/src/Moneo/src/worker/install/common.sh deleted file mode 100644 index 938887b8..00000000 --- a/src/job-exporter/src/Moneo/src/worker/install/common.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -set -e - -# install dependencies -# install DCGM -distro=`awk -F= '/^NAME/{print $2}' /etc/os-release` -if [[ $distro =~ "Ubuntu" ]]; then - apt-get install -y python3-dev -elif [[ $distro =~ "AlmaLinux" ]]; then - yum install -y python3-devel -else - echo "OS version is not supported" -fi - -command -v pip3 >/dev/null 2>&1 || python3 <(curl -s https://bootstrap.pypa.io/get-pip.py) -python3 -m pip -qqq install prometheus_client psutil filelock diff --git a/src/job-exporter/src/Moneo/src/worker/install/nvidia.sh b/src/job-exporter/src/Moneo/src/worker/install/nvidia.sh deleted file mode 100755 index 5346bb76..00000000 --- a/src/job-exporter/src/Moneo/src/worker/install/nvidia.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -set -e - -# install dependencies -source $(dirname "${BASH_SOURCE[0]}")/common.sh - - -distro=`awk -F= '/^NAME/{print $2}' /etc/os-release` -echo $distro - -ubuntu_dcgm_install () { - echo "Installing Dcgm" - apt-get update \ - && sudo apt-get install -y datacenter-gpu-manager - systemctl --now enable nvidia-dcgm - systemctl start nvidia-dcgm -} - -alma_dcgm_install () { - echo "Installing Dcgm" - DCGM_VERSION=2.4.4 - DCGM_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm - wget --retry-connrefused --tries=3 --waitretry=5 $DCGM_URL - FILE_NAME=$(basename $DCGM_URL) - RLINK=$(readlink -f $FILE_NAME) - Check="1d8fbe97797fada8048a7832bfac4bc7d3ad661bb24163d21324965ae7e7817d" - checksum=`sha256sum $RLINK | awk '{print $1}'` - if [[ $checksum != $Check ]] - then - echo "*** Error - Checksum verification failed" - echo "*** Error - Checksum verification failed" > dcgm_fail.log - exit -1 - fi - rpm -i datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm - rm -f datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm - systemctl --now enable nvidia-dcgm - systemctl start nvidia-dcgm -} - -check_min_dcgm_ver(){ - DCGM_VER=`dcgmi --version |grep version | awk -F ': ' '{print $2}'` - REQ_VER=$2 - if [ "$(printf '%s\n' "$REQ_VER" "$DCGM_VER" | sort -V | head -n1)" = "$REQ_VER" ]; then - echo "A suitable version of Dcgm is already installed" - else - echo "removing old DCGM" - # remove old version - if [[ $distro =~ "Ubuntu" ]]; then - apt -y remove datacenter-gpu-manager - elif [[ $distro =~ "AlmaLinux" ]]; then - yum -y remove datacenter-gpu-manager - fi - $1 - fi - -} - - -# install DCGM -if [[ $distro =~ "Ubuntu" ]]; then - dcgm_check=`sudo dpkg-query -l` - if [[ $dcgm_check =~ "datacenter-gpu-manager" ]]; then - check_min_dcgm_ver ubuntu_dcgm_install "3.1.6" - else - ubuntu_dcgm_install - systemctl --now enable nvidia-dcgm - systemctl start nvidia-dcgm - fi -elif [[ $distro =~ "AlmaLinux" ]]; then - dcgm_check=`rpm -qa` - if [[ $dcgm_check =~ "datacenter-gpu-manager" ]]; then - check_min_dcgm_ver alma_dcgm_install "2.4.4" - else - alma_dcgm_install - systemctl --now enable nvidia-dcgm - systemctl start nvidia-dcgm - fi -else - echo "OS version is not supported" -fi - -exit 0 From c7c79c9e9bc971b5796e7c6f4832dbb7d13b47c7 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Tue, 20 Jan 2026 05:22:13 +0000 Subject: [PATCH 2/9] update job exporter dockerfile to reduce its size --- .../build/job-exporter.common.dockerfile | 146 ++++++++++++------ 1 file changed, 103 insertions(+), 43 deletions(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index 1004495c..0fcd2dbc 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -16,60 +16,120 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +############################ +# builder: only for compiling python wheels +############################ +FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04 AS builder + +ARG TARGETARCH + +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + python3-pip \ + python3-dev \ + build-essential \ + gcc; \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /w + +# build wheels once +COPY requirements.txt /w/requirements.txt +RUN python3 -m pip install --no-cache-dir -U pip wheel && \ + python3 -m pip wheel --no-cache-dir --wheel-dir /w/wheels \ + -r /w/requirements.txt \ + prometheus_client psutil filelock + + +############################ +# runtime: final image +############################ FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04 ARG TARGETARCH ARG ROCM_VERSION=6.2.2 ARG AMDGPU_VERSION=6.2.2 -ARG DCGM_TARGET_VERSION = "1:4.4.1-1" - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - bash \ - curl \ - gnupg \ - wget \ - ca-certificates \ - python3-pip \ - python3-dev \ - sudo && \ - if [ "$TARGETARCH" = "amd64" ]; then \ - printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" | tee /etc/apt/preferences.d/rocm-pin-600 && \ - curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ - echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \ - echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \ - apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-dev rdc; \ - fi - -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \ - datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \ - datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \ - datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION} +ARG DCGM_TARGET_VERSION=1:4.4.1-1 -ENV NERDCTL_VERSION=2.2.1 -RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz && \ - mkdir -p /tmp/nerdctl && \ - tar -xzvf /tmp/nerdctl.tar.gz -C /tmp/nerdctl && \ - mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl && \ - mkdir -p /job_exporter && \ - rm -rf /tmp/nerdctl* - -RUN python3 -m pip install prometheus_client psutil filelock +# -------------------------- +# base + REQUIRED apt upgrade +# -------------------------- +RUN set -eux; \ + apt-get update; \ + apt-get upgrade -y; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + bash \ + ca-certificates \ + curl \ + gnupg \ + wget \ + python3 \ + python3-pip; \ + apt-get clean; \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/* -COPY src/Moneo /Moneo +# -------------------------- +# ROCm (runtime only) +# -------------------------- +RUN set -eux; \ + if [ "$TARGETARCH" = "amd64" ]; then \ + printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" \ + > /etc/apt/preferences.d/rocm-pin-600; \ + curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -; \ + echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" \ + > /etc/apt/sources.list.d/rocm.list; \ + echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" \ + > /etc/apt/sources.list.d/amdgpu.list; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc; \ + rm -rf /var/lib/apt/lists/*; \ + fi -ENV PATH="${PATH}:/opt/rocm/bin" -COPY build/moneo-*-exporter_entrypoint.sh ./ +# -------------------------- +# DCGM (runtime only, same layer clean) +# -------------------------- +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \ + datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \ + datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION}; \ + apt-get clean; \ + rm -rf /var/lib/apt/lists/* -RUN [ -d /opt/rocm/lib ] && echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf; \ - [ -d /opt/rocm/rdc/lib ] && echo "/opt/rocm/rdc/lib" >> /etc/ld.so.conf.d/rocm.conf; \ - [ -d /opt/rocm/llvm/lib ] && echo "/opt/rocm/llvm/lib" >> /etc/ld.so.conf.d/rocm.conf +# -------------------------- +# nerdctl +# -------------------------- +ENV NERDCTL_VERSION=2.2.1 +RUN set -eux; \ + wget -O /tmp/nerdctl.tar.gz \ + https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz; \ + mkdir -p /tmp/nerdctl; \ + tar -xzf /tmp/nerdctl.tar.gz -C /tmp/nerdctl; \ + mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl; \ + rm -rf /tmp/nerdctl* /tmp/nerdctl.tar.gz -RUN ldconfig +# -------------------------- +# python runtime deps (from wheels) +# -------------------------- -COPY requirements.txt /job_exporter/ -RUN pip3 install -r /job_exporter/requirements.txt +COPY --from=builder /w/wheels /wheels +COPY requirements.txt /job_exporter/requirements.txt -RUN apt update && apt upgrade -y && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN python3 -m pip install --no-cache-dir -U pip && \ + python3 -m pip install --no-cache-dir \ + --no-index --find-links=/wheels \ + -r /job_exporter/requirements.txt && \ + python3 -m pip install --no-cache-dir \ + --no-index --find-links=/wheels \ + prometheus_client psutil filelock && \ + rm -rf /wheels +# -------------------------- +# app files +# -------------------------- +COPY src/Moneo /Moneo COPY src/*.py /job_exporter/ +COPY build/moneo-*-exporter_entrypoint.sh ./ From 4742ed0f4b42f29571c658f69ae463a9adc04ef2 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Fri, 6 Feb 2026 07:52:46 +0000 Subject: [PATCH 3/9] update job-exporter to make it smaller --- .../build/job-exporter.common.dockerfile | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index 0fcd2dbc..f9618b6e 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -19,7 +19,7 @@ ############################ # builder: only for compiling python wheels ############################ -FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04 AS builder +FROM ubuntu:22.04 AS builder ARG TARGETARCH @@ -44,9 +44,9 @@ RUN python3 -m pip install --no-cache-dir -U pip wheel && \ ############################ -# runtime: final image +# runtime: use minimal CUDA base (includes nvidia-smi and CUDA libs) ############################ -FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04 +FROM nvcr.io/nvidia/cuda:12.0.1-base-ubuntu22.04 ARG TARGETARCH ARG ROCM_VERSION=6.2.2 @@ -62,19 +62,18 @@ RUN set -eux; \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ bash \ ca-certificates \ - curl \ - gnupg \ - wget \ python3 \ - python3-pip; \ + kmod; \ apt-get clean; \ - rm -rf /var/lib/apt/lists/* /var/cache/apt/* + rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* # -------------------------- # ROCm (runtime only) # -------------------------- RUN set -eux; \ if [ "$TARGETARCH" = "amd64" ]; then \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl gnupg; \ printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" \ > /etc/apt/preferences.d/rocm-pin-600; \ curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -; \ @@ -84,11 +83,15 @@ RUN set -eux; \ > /etc/apt/sources.list.d/amdgpu.list; \ apt-get update; \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc; \ - rm -rf /var/lib/apt/lists/*; \ + apt-get remove -y curl gnupg; \ + apt-get autoremove -y; \ + apt-get clean; \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/*; \ fi # -------------------------- -# DCGM (runtime only, same layer clean) +# DCGM (minimal runtime, monitoring only) +# Note: CUDA base image already provides nvidia-smi and CUDA libraries # -------------------------- RUN set -eux; \ apt-get update; \ @@ -97,19 +100,23 @@ RUN set -eux; \ datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \ datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION}; \ apt-get clean; \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* # -------------------------- # nerdctl # -------------------------- ENV NERDCTL_VERSION=2.2.1 RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends wget; \ wget -O /tmp/nerdctl.tar.gz \ https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz; \ - mkdir -p /tmp/nerdctl; \ - tar -xzf /tmp/nerdctl.tar.gz -C /tmp/nerdctl; \ - mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl; \ - rm -rf /tmp/nerdctl* /tmp/nerdctl.tar.gz + tar -xzf /tmp/nerdctl.tar.gz -C /usr/local/bin nerdctl; \ + chmod +x /usr/local/bin/nerdctl; \ + apt-get remove -y wget; \ + apt-get autoremove -y; \ + apt-get clean; \ + rm -rf /tmp/* /var/lib/apt/lists/* /var/cache/apt/* # -------------------------- # python runtime deps (from wheels) @@ -118,14 +125,20 @@ RUN set -eux; \ COPY --from=builder /w/wheels /wheels COPY requirements.txt /job_exporter/requirements.txt -RUN python3 -m pip install --no-cache-dir -U pip && \ +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip; \ + python3 -m pip install --no-cache-dir -U pip && \ python3 -m pip install --no-cache-dir \ --no-index --find-links=/wheels \ -r /job_exporter/requirements.txt && \ python3 -m pip install --no-cache-dir \ --no-index --find-links=/wheels \ prometheus_client psutil filelock && \ - rm -rf /wheels + apt-get remove -y python3-pip; \ + apt-get autoremove -y; \ + apt-get clean; \ + rm -rf /wheels /root/.cache /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* # -------------------------- # app files From 4d8ecc17da876e6608e8b4d6950bb8e63a060e80 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Fri, 6 Feb 2026 08:21:51 +0000 Subject: [PATCH 4/9] fix the build errors --- src/job-exporter/build/job-exporter.common.dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index f9618b6e..0f07a279 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -135,8 +135,11 @@ RUN set -eux; \ python3 -m pip install --no-cache-dir \ --no-index --find-links=/wheels \ prometheus_client psutil filelock && \ + # Mark important packages as manual to prevent removal + apt-mark manual rdc amd-smi-lib 2>/dev/null || true; \ apt-get remove -y python3-pip; \ - apt-get autoremove -y; \ + # Set environment variable to allow sudo removal during autoremove + SUDO_FORCE_REMOVE=yes apt-get autoremove -y; \ apt-get clean; \ rm -rf /wheels /root/.cache /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* From 96be80b1200cc0477de6eac1397fd6a36060efc3 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Tue, 10 Feb 2026 06:30:38 +0000 Subject: [PATCH 5/9] add cuda utils in the docker file --- .../build/job-exporter.common.dockerfile | 55 +++++++++++++++++-- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index 0f07a279..e445d5f2 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -44,9 +44,9 @@ RUN python3 -m pip install --no-cache-dir -U pip wheel && \ ############################ -# runtime: use minimal CUDA base (includes nvidia-smi and CUDA libs) +# runtime: minimal Ubuntu base with only essential NVIDIA components ############################ -FROM nvcr.io/nvidia/cuda:12.0.1-base-ubuntu22.04 +FROM ubuntu:22.04 ARG TARGETARCH ARG ROCM_VERSION=6.2.2 @@ -54,7 +54,7 @@ ARG AMDGPU_VERSION=6.2.2 ARG DCGM_TARGET_VERSION=1:4.4.1-1 # -------------------------- -# base + REQUIRED apt upgrade +# base + NVIDIA CUDA repository setup # -------------------------- RUN set -eux; \ apt-get update; \ @@ -62,8 +62,27 @@ RUN set -eux; \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ bash \ ca-certificates \ + curl \ + gnupg \ python3 \ kmod; \ + # Add NVIDIA CUDA repository + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg; \ + echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/cuda.list; \ + apt-get clean; \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* + +# -------------------------- +# Install minimal CUDA components (only what's needed for nvidia-smi and DCGM) +# -------------------------- +RUN set -eux; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + cuda-cudart-12-0 \ + cuda-compat-12-0; \ + # Remove curl and gnupg after CUDA setup + apt-get remove -y curl gnupg; \ + apt-get autoremove -y; \ apt-get clean; \ rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* @@ -91,7 +110,7 @@ RUN set -eux; \ # -------------------------- # DCGM (minimal runtime, monitoring only) -# Note: CUDA base image already provides nvidia-smi and CUDA libraries +# Note: DCGM packages will pull in necessary CUDA dependencies including nvidia-smi # -------------------------- RUN set -eux; \ apt-get update; \ @@ -99,8 +118,12 @@ RUN set -eux; \ datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \ datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \ datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION}; \ + # Clean up any extra CUDA packages we don't need apt-get clean; \ - rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* + rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* \ + /usr/local/cuda-12.0/targets/x86_64-linux/lib/*.a \ + /usr/local/cuda-12.0/nsight* \ + /usr/local/cuda-12.0/libnvvp # -------------------------- # nerdctl @@ -149,3 +172,25 @@ RUN set -eux; \ COPY src/Moneo /Moneo COPY src/*.py /job_exporter/ COPY build/moneo-*-exporter_entrypoint.sh ./ + +# -------------------------- +# Final cleanup: remove unnecessary CUDA files to reduce image size +# -------------------------- +RUN set -eux; \ + # Remove CUDA static libraries (we only need shared libs for runtime) + find /usr/local/cuda-12.0 -name "*.a" -delete 2>/dev/null || true; \ + find /usr/local/cuda-12.0 -name "*.la" -delete 2>/dev/null || true; \ + # Remove CUDA development tools and samples + rm -rf /usr/local/cuda-12.0/nsight* \ + /usr/local/cuda-12.0/libnvvp \ + /usr/local/cuda-12.0/doc \ + /usr/local/cuda-12.0/samples \ + /usr/local/cuda-12.0/extras \ + 2>/dev/null || true; \ + # Remove documentation and man pages + rm -rf /usr/share/doc/* \ + /usr/share/man/* \ + /usr/share/info/* \ + 2>/dev/null || true; \ + # Final cache cleanup + rm -rf /var/cache/* /tmp/* /var/tmp/* 2>/dev/null || true \ No newline at end of file From a4399ec3abad52f88e72850484934d0c39e1bf16 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Tue, 10 Feb 2026 07:32:37 +0000 Subject: [PATCH 6/9] install nvidia-smi and utils --- src/job-exporter/build/job-exporter.common.dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index e445d5f2..6d8ccede 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -80,6 +80,9 @@ RUN set -eux; \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ cuda-cudart-12-0 \ cuda-compat-12-0; \ + # Install nvidia-smi and utils (automatically match the version from CUDA repo) + NVIDIA_UTILS_VERSION=$(apt-cache search --names-only '^nvidia-utils-[0-9]+$' | sort -V | tail -1 | awk '{print $1}'); \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ${NVIDIA_UTILS_VERSION}; \ # Remove curl and gnupg after CUDA setup apt-get remove -y curl gnupg; \ apt-get autoremove -y; \ @@ -193,4 +196,4 @@ RUN set -eux; \ /usr/share/info/* \ 2>/dev/null || true; \ # Final cache cleanup - rm -rf /var/cache/* /tmp/* /var/tmp/* 2>/dev/null || true \ No newline at end of file + rm -rf /var/cache/* /tmp/* /var/tmp/* 2>/dev/null || true From 6d3364c18d8d75c9e8935bc217570131a09ae647 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Wed, 11 Feb 2026 05:09:54 +0000 Subject: [PATCH 7/9] use own build version to replace nerdctl --- .../build/job-exporter.common.dockerfile | 96 +++++++------------ 1 file changed, 32 insertions(+), 64 deletions(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index 6d8ccede..7a561cb9 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -44,9 +44,27 @@ RUN python3 -m pip install --no-cache-dir -U pip wheel && \ ############################ -# runtime: minimal Ubuntu base with only essential NVIDIA components +# nerdctl-builder: build nerdctl from source ############################ -FROM ubuntu:22.04 +FROM golang:1.25.6 AS nerdctl-builder + +ARG TARGETARCH +ARG NERDCTL_VERSION=2.2.1 + +WORKDIR /build + +RUN set -eux; \ + git clone --depth 1 --branch v${NERDCTL_VERSION} https://github.com/containerd/nerdctl.git .; \ + make binaries; \ + mkdir -p /opt/nerdctl; \ + cp _output/nerdctl /opt/nerdctl/nerdctl; \ + chmod +x /opt/nerdctl/nerdctl + + +############################ +# runtime: minimal CUDA base with only essential components +############################ +FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-base-ubuntu22.04 ARG TARGETARCH ARG ROCM_VERSION=6.2.2 @@ -54,11 +72,11 @@ ARG AMDGPU_VERSION=6.2.2 ARG DCGM_TARGET_VERSION=1:4.4.1-1 # -------------------------- -# base + NVIDIA CUDA repository setup +# Install all components in single layer for size optimization # -------------------------- RUN set -eux; \ + # Base setup apt-get update; \ - apt-get upgrade -y; \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ bash \ ca-certificates \ @@ -66,36 +84,8 @@ RUN set -eux; \ gnupg \ python3 \ kmod; \ - # Add NVIDIA CUDA repository - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg; \ - echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/cuda.list; \ - apt-get clean; \ - rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* - -# -------------------------- -# Install minimal CUDA components (only what's needed for nvidia-smi and DCGM) -# -------------------------- -RUN set -eux; \ - apt-get update; \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - cuda-cudart-12-0 \ - cuda-compat-12-0; \ - # Install nvidia-smi and utils (automatically match the version from CUDA repo) - NVIDIA_UTILS_VERSION=$(apt-cache search --names-only '^nvidia-utils-[0-9]+$' | sort -V | tail -1 | awk '{print $1}'); \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ${NVIDIA_UTILS_VERSION}; \ - # Remove curl and gnupg after CUDA setup - apt-get remove -y curl gnupg; \ - apt-get autoremove -y; \ - apt-get clean; \ - rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* - -# -------------------------- -# ROCm (runtime only) -# -------------------------- -RUN set -eux; \ + # ROCm (runtime only) for AMD GPUs if [ "$TARGETARCH" = "amd64" ]; then \ - apt-get update; \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl gnupg; \ printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" \ > /etc/apt/preferences.d/rocm-pin-600; \ curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -; \ @@ -105,44 +95,22 @@ RUN set -eux; \ > /etc/apt/sources.list.d/amdgpu.list; \ apt-get update; \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc; \ - apt-get remove -y curl gnupg; \ - apt-get autoremove -y; \ - apt-get clean; \ - rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/*; \ - fi - -# -------------------------- -# DCGM (minimal runtime, monitoring only) -# Note: DCGM packages will pull in necessary CUDA dependencies including nvidia-smi -# -------------------------- -RUN set -eux; \ - apt-get update; \ + fi; \ + # DCGM for GPU monitoring (NVIDIA) DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \ datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \ + datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \ datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION}; \ - # Clean up any extra CUDA packages we don't need + # Clean up everything in single layer + apt-get remove -y curl gnupg; \ + apt-get autoremove -y; \ apt-get clean; \ - rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* \ - /usr/local/cuda-12.0/targets/x86_64-linux/lib/*.a \ - /usr/local/cuda-12.0/nsight* \ - /usr/local/cuda-12.0/libnvvp + rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/* # -------------------------- -# nerdctl +# nerdctl (copy from nerdctl-builder) # -------------------------- -ENV NERDCTL_VERSION=2.2.1 -RUN set -eux; \ - apt-get update; \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends wget; \ - wget -O /tmp/nerdctl.tar.gz \ - https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz; \ - tar -xzf /tmp/nerdctl.tar.gz -C /usr/local/bin nerdctl; \ - chmod +x /usr/local/bin/nerdctl; \ - apt-get remove -y wget; \ - apt-get autoremove -y; \ - apt-get clean; \ - rm -rf /tmp/* /var/lib/apt/lists/* /var/cache/apt/* +COPY --from=nerdctl-builder /opt/nerdctl/nerdctl /usr/local/bin/nerdctl # -------------------------- # python runtime deps (from wheels) From f64ea85230eea0aa628d0d19eb7dda053a870f1d Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Wed, 25 Feb 2026 07:21:12 +0000 Subject: [PATCH 8/9] update go version and add AMD python library --- src/job-exporter/build/job-exporter.common.dockerfile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index 7a561cb9..18f587c4 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -46,7 +46,7 @@ RUN python3 -m pip install --no-cache-dir -U pip wheel && \ ############################ # nerdctl-builder: build nerdctl from source ############################ -FROM golang:1.25.6 AS nerdctl-builder +FROM golang:1.25.7 AS nerdctl-builder ARG TARGETARCH ARG NERDCTL_VERSION=2.2.1 @@ -94,7 +94,7 @@ RUN set -eux; \ echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" \ > /etc/apt/sources.list.d/amdgpu.list; \ apt-get update; \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc amd-smi-lib; \ fi; \ # DCGM for GPU monitoring (NVIDIA) DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ @@ -129,9 +129,6 @@ RUN set -eux; \ python3 -m pip install --no-cache-dir \ --no-index --find-links=/wheels \ prometheus_client psutil filelock && \ - # Mark important packages as manual to prevent removal - apt-mark manual rdc amd-smi-lib 2>/dev/null || true; \ - apt-get remove -y python3-pip; \ # Set environment variable to allow sudo removal during autoremove SUDO_FORCE_REMOVE=yes apt-get autoremove -y; \ apt-get clean; \ From 04022d18bd43bfd6dc4fa1727d6c673c93bac523 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Wed, 25 Feb 2026 08:23:04 +0000 Subject: [PATCH 9/9] add missing tools --- src/job-exporter/build/job-exporter.common.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile index 18f587c4..c19a12e9 100644 --- a/src/job-exporter/build/job-exporter.common.dockerfile +++ b/src/job-exporter/build/job-exporter.common.dockerfile @@ -121,7 +121,7 @@ COPY requirements.txt /job_exporter/requirements.txt RUN set -eux; \ apt-get update; \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip; \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip numactl; \ python3 -m pip install --no-cache-dir -U pip && \ python3 -m pip install --no-cache-dir \ --no-index --find-links=/wheels \