diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 2a483a5..ed8d5ef 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -47,7 +47,7 @@ jobs: needs: build-base # always() forces job run even if the dependant is skipped (but not if it failed) if: always() && (needs.build-base.result == 'success' || needs.build-base.result == 'skipped') - runs-on: blacksmith-8vcpu-ubuntu-2204 + runs-on: blacksmith-16vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v4 @@ -64,7 +64,7 @@ jobs: - 'official-templates/pytorch/**' - name: Setup Docker - if: steps.changes.outputs.pytorch_any_changed == 'true' + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' uses: ./.github/actions/docker-setup id: setup with: @@ -72,7 +72,7 @@ jobs: dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }} - name: Build pytorch images - if: steps.changes.outputs.pytorch_any_changed == 'true' + if: github.event_name == 'workflow_dispatch' || steps.changes.outputs.pytorch_any_changed == 'true' uses: docker/bake-action@v6 env: BUILDX_BAKE_ENTITLEMENTS_FS: 0 diff --git a/.github/workflows/nvidia.yml b/.github/workflows/nvidia.yml new file mode 100644 index 0000000..3a77029 --- /dev/null +++ b/.github/workflows/nvidia.yml @@ -0,0 +1,41 @@ +name: Nvidia Image Build + +on: + push: + paths: + - ".github/workflows/nvidia.yml" + - "official-templates/nvidia-*/**" + workflow_dispatch: + +permissions: + contents: read + packages: write + +jobs: + build-nvidia: + runs-on: blacksmith-16vcpu-ubuntu-2404 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Docker + uses: ./.github/actions/docker-setup + id: setup + with: + dockerhub-username: ${{ secrets.DOCKERHUB_USERNAME }} + dockerhub-token: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build nvidia images + uses: docker/bake-action@v6 + env: + BUILDX_BAKE_ENTITLEMENTS_FS: 0 + RELEASE_SUFFIX: ${{ steps.setup.outputs.release-suffix }} + with: + source: . + files: | + official-templates/shared/versions.hcl + official-templates/nvidia-pytorch/docker-bake.hcl + push: true + diff --git a/container-template/start.sh b/container-template/start.sh index d39f7f3..1d70e3f 100644 --- a/container-template/start.sh +++ b/container-template/start.sh @@ -99,4 +99,6 @@ export_env_vars echo "Start script(s) finished, Pod is ready to use." +execute_script "/post_start.sh" "Running post-start script..." + sleep infinity diff --git a/official-templates/base/Dockerfile b/official-templates/base/Dockerfile index 5b2c487..bd2d8eb 100644 --- a/official-templates/base/Dockerfile +++ b/official-templates/base/Dockerfile @@ -1,6 +1,9 @@ ARG BASE_IMAGE=non-existing FROM ${BASE_IMAGE} +ARG RP_SKIP_PYTHON +ARG RP_SKIP_JUPYTER + SHELL ["/bin/bash", "-o", "pipefail", "-c"] ENV SHELL=/bin/bash @@ -44,53 +47,54 @@ RUN apt-get update --yes && \ libsm6 libssl-dev libswscale-dev libtiff-dev libv4l-dev libx264-dev libxrender-dev \ libxvidcore-dev lsof make mtr nano nfs-common nginx openssh-server rsync slurm-wlm \ software-properties-common sudo tmux unzip vim wget zip zstd - -# Add the Python PPA -RUN add-apt-repository ppa:deadsnakes/ppa -y - -# Install Python -RUN apt-get install --yes --no-install-recommends \ - python3.9-dev python3.9-venv python3.9-distutils \ - python3.10-dev python3.10-venv python3.10-distutils \ - python3.11-dev python3.11-venv python3.11-distutils \ - python3.12-dev python3.12-venv \ - python3.13-dev python3.13-venv && \ + +# Install Python versions +RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ + add-apt-repository ppa:deadsnakes/ppa -y && \ + apt-get install --yes --no-install-recommends \ + python3.9-dev python3.9-venv python3.9-distutils \ + python3.10-dev python3.10-venv python3.10-distutils \ + python3.11-dev python3.11-venv python3.11-distutils \ + python3.12-dev python3.12-venv \ + python3.13-dev python3.13-venv && \ apt-get autoremove -y && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Install pip if we are not ROCm -RUN if [ -z "${ROCM_PATH}" ]; then \ - curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ - python3.9 get-pip.py && \ - python3.10 get-pip.py && \ - python3.11 get-pip.py && \ - python3.12 get-pip.py && \ - python3.13 get-pip.py && \ - rm get-pip.py; \ - fi - -# ensurepip (it's there, we just installed it^) and install virtualenv -RUN if [ -z "${ROCM_PATH}" ]; then \ - python3.9 -m pip install --upgrade pip virtualenv && \ - python3.10 -m pip install --upgrade pip virtualenv && \ - python3.11 -m pip install --upgrade pip virtualenv && \ - python3.12 -m pip install --upgrade pip virtualenv && \ - python3.13 -m pip install --upgrade pip virtualenv; \ - fi - -RUN ln -sf /usr/bin/python3.12 /usr/local/bin/python -RUN ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip -RUN ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip3 + rm -rf /var/lib/apt/lists/*; + +# Install pip +RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ + curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3.9 get-pip.py && \ + python3.10 get-pip.py && \ + python3.11 get-pip.py && \ + python3.12 get-pip.py && \ + python3.13 get-pip.py && \ + rm get-pip.py; + +# Install virtualenv +RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ + python3.9 -m pip install --upgrade pip virtualenv && \ + python3.10 -m pip install --upgrade pip virtualenv && \ + python3.11 -m pip install --upgrade pip virtualenv && \ + python3.12 -m pip install --upgrade pip virtualenv && \ + python3.13 -m pip install --upgrade pip virtualenv; + +# Symlink default python/pip +RUN [[ -n $RP_SKIP_PYTHON ]] && exit 0; \ + ln -sf /usr/bin/python3.12 /usr/local/bin/python && \ + ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip && \ + ln -sf /usr/local/bin/pip3.12 /usr/local/bin/pip3; COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ -# Install Jupyter and related packages -RUN python -m pip install --upgrade --no-cache-dir \ - jupyterlab \ - ipywidgets \ - jupyter-archive \ - notebook==7.4.2 +# Install Jupyter and hf_transfer packages +RUN [[ -n $RP_SKIP_JUPYTER ]] && exit 0; \ + python -m pip install --upgrade --no-cache-dir \ + hf_transfer \ + jupyterlab \ + ipywidgets \ + jupyter-archive \ + notebook==7.4.2; # Install filebrowser RUN curl -LsSf https://raw.githubusercontent.com/filebrowser/get/master/get.sh | bash diff --git a/official-templates/nvidia-pytorch/README.md b/official-templates/nvidia-pytorch/README.md new file mode 100644 index 0000000..960beda --- /dev/null +++ b/official-templates/nvidia-pytorch/README.md @@ -0,0 +1,16 @@ +# NVIDIA PyTorch Base Image + +NVIDIA's PyTorch NGC Container (`nvcr.io/nvidia/pytorch`) built for easy deployment on Runpod. + +For more information on the NGC images visit https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch. + +## Deployment + +Please use `runpod/nvidia-pytorch:1.0.3-25.11` + +## Build + +```bash +./bake.sh nvidia-pytorch +``` + diff --git a/official-templates/nvidia-pytorch/docker-bake.hcl b/official-templates/nvidia-pytorch/docker-bake.hcl new file mode 100644 index 0000000..10bb396 --- /dev/null +++ b/official-templates/nvidia-pytorch/docker-bake.hcl @@ -0,0 +1,28 @@ +group "default" { + targets = ["pytorch-2511"] +} + +target "nvidia-base" { + context = "official-templates/base" + dockerfile = "Dockerfile" + platforms = ["linux/amd64"] + contexts = { + scripts = "container-template" + proxy = "container-template/proxy" + logo = "container-template" + } + args = { + RP_SKIP_PYTHON = "1" + RP_SKIP_JUPYTER = "1" + } +} + +target "pytorch-2511" { + inherits = ["nvidia-base"] + tags = [ + "runpod/nvidia-pytorch:${RELEASE_VERSION}${RELEASE_SUFFIX}-25.11", + ] + args = { + BASE_IMAGE = "nvcr.io/nvidia/pytorch:25.11-py3" + } +} diff --git a/official-templates/pytorch/docker-bake.hcl b/official-templates/pytorch/docker-bake.hcl index bb03c6e..3a04650 100644 --- a/official-templates/pytorch/docker-bake.hcl +++ b/official-templates/pytorch/docker-bake.hcl @@ -2,6 +2,10 @@ variable "TORCH_META" { default = { + "2.9.1" = {} + "2.9.0" = { + torchvision = "0.24.0" + } "2.8.0" = { torchvision = "0.23.0" } @@ -21,14 +25,20 @@ variable "CUDA_TORCH_COMBINATIONS" { { cuda_version = "12.8.1", torch = "2.6.0", whl_src = "126" }, { cuda_version = "12.8.1", torch = "2.7.1", whl_src = "128" }, { cuda_version = "12.8.1", torch = "2.8.0", whl_src = "128" }, + { cuda_version = "12.8.1", torch = "2.9.0", whl_src = "128" }, + { cuda_version = "12.8.1", torch = "2.9.1", whl_src = "128" }, { cuda_version = "12.9.0", torch = "2.6.0", whl_src = "126" }, { cuda_version = "12.9.0", torch = "2.7.1", whl_src = "128" }, { cuda_version = "12.9.0", torch = "2.8.0", whl_src = "129" }, + { cuda_version = "12.9.0", torch = "2.9.0", whl_src = "129" }, + { cuda_version = "12.9.0", torch = "2.9.1", whl_src = "129" }, { cuda_version = "13.0.0", torch = "2.6.0", whl_src = "126" }, { cuda_version = "13.0.0", torch = "2.7.1", whl_src = "128" }, - { cuda_version = "13.0.0", torch = "2.8.0", whl_src = "129" } + { cuda_version = "13.0.0", torch = "2.8.0", whl_src = "129" }, + { cuda_version = "13.0.0", torch = "2.9.0", whl_src = "130" }, + { cuda_version = "13.0.0", torch = "2.9.1", whl_src = "130" }, ] } @@ -44,7 +54,7 @@ variable "COMPATIBLE_BUILDS" { wheel_src = combo.whl_src torch = combo.torch torch_code = replace(combo.torch, ".", "") - torch_vision = TORCH_META[combo.torch].torchvision + torch_vision = lookup(TORCH_META[combo.torch], "torchvision", "") } if cuda.version == combo.cuda_version && contains(cuda.ubuntu, ubuntu.version) ] ] @@ -80,7 +90,7 @@ target "pytorch-matrix" { args = { BASE_IMAGE = "runpod/base:${RELEASE_VERSION}${RELEASE_SUFFIX}-cuda${build.cuda_code}-${build.ubuntu_name}" WHEEL_SRC = build.wheel_src - TORCH = "torch==${build.torch} torchvision==${build.torch_vision} torchaudio==${build.torch}" + TORCH = "torch==${build.torch}${build.torch_vision != "" ? " torchvision==${build.torch_vision}" : ""} torchaudio==${build.torch}" } tags = [ diff --git a/official-templates/rocm/docker-bake.hcl b/official-templates/rocm/docker-bake.hcl index 3535a5a..13d7914 100644 --- a/official-templates/rocm/docker-bake.hcl +++ b/official-templates/rocm/docker-bake.hcl @@ -16,6 +16,9 @@ target "rocm-base" { proxy = "container-template/proxy" logo = "container-template" } + args = { + RP_SKIP_PYTHON = "1" + } } target "rocm644-ubuntu2204-pytorch251" { diff --git a/official-templates/shared/versions.hcl b/official-templates/shared/versions.hcl index 1af3891..bd522f5 100644 --- a/official-templates/shared/versions.hcl +++ b/official-templates/shared/versions.hcl @@ -1,4 +1,4 @@ -RELEASE_VERSION = "1.0.2" +RELEASE_VERSION = "1.0.3" variable "RELEASE_SUFFIX" { default = "" # Set by CI, not used by humans.