Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 39 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,39 @@ jobs:
name: python-wheels-${{ runner.os }}-${{ matrix.arch }}
path: python/wheelhouse

# Builds the ROCm Python wheels with cibuildwheel, once per runner OS in the
# matrix below, and uploads the resulting wheelhouse as a build artifact.
build-python-wheels-rocm:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-24.04, windows-2025]

steps:
# Submodules are checked out recursively together with the repository.
- uses: actions/checkout@v6
with:
submodules: recursive

- name: Build wheels
uses: pypa/cibuildwheel@v3.2.1
with:
package-dir: python
output-dir: python/wheelhouse
env:
# Per-platform environment passed to the build: the ROCm location on Linux,
# the CTranslate2 install prefix inside the workspace on Windows.
CIBW_ENVIRONMENT_LINUX: ROCM_PATH=/opt/rocm LD_LIBRARY_PATH=/opt/rocm/lib/llvm/lib:$LD_LIBRARY_PATH
CIBW_ENVIRONMENT_WINDOWS: CTRANSLATE2_ROOT='${{ github.workspace }}\install'
# Platform-specific scripts that prepare the ROCm build environment.
CIBW_BEFORE_ALL_LINUX: python/tools/prepare_build_environment_linux_rocm.sh
CIBW_BEFORE_ALL_WINDOWS: bash python/tools/prepare_build_environment_windows_rocm.sh
CIBW_BEFORE_BUILD: pip install -r python/install_requirements.txt
CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
CIBW_ARCHS: auto64
# musllinux wheels are not built.
CIBW_SKIP: "*-musllinux_*"
# Libraries matching /opt/rocm/lib/lib* are excluded from the repaired wheel.
CIBW_REPAIR_WHEEL_COMMAND_LINUX: 'auditwheel repair -w {dest_dir} --exclude "/opt/rocm/lib/lib*" {wheel}'

# The artifact name includes the runner OS, so each matrix entry uploads
# under its own name.
- name: Upload Python wheels
uses: actions/upload-artifact@v6
with:
name: rocm-python-wheels-${{ runner.os }}
path: python/wheelhouse


# We could test the Python wheels using cibuildwheel but we prefer to run the tests outside
# the build environment to ensure wheels correctly embed all dependencies.
Expand Down Expand Up @@ -334,6 +367,10 @@ jobs:

build-and-push-docker-images:
runs-on: ubuntu-22.04
strategy:
matrix:
gpu: [cuda, rocm]

steps:
- uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -364,7 +401,7 @@ jobs:

- name: Build Docker images
run: |
./docker/build_all.sh
./docker/build_all.sh latest 0 ${{ matrix.gpu }}

- name: Login to DockerHub
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
Expand All @@ -377,7 +414,7 @@ jobs:
- name: Push Docker images
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
run: |
./docker/build_all.sh ${GITHUB_REF##*/v} 1
./docker/build_all.sh ${GITHUB_REF##*/v} 1 ${{ matrix.gpu }}


build-and-deploy-docs:
Expand Down
92 changes: 92 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ option(WITH_OPENBLAS "Compile with OpenBLAS backend" OFF)
option(WITH_RUY "Compile with Ruy backend" OFF)
option(WITH_CUDA "Compile with CUDA backend" OFF)
option(WITH_CUDNN "Compile with cuDNN backend" OFF)
option(WITH_HIP "Compile with HIP backend" OFF)
option(CUDA_DYNAMIC_LOADING "Dynamically load CUDA libraries at runtime" OFF)
option(ENABLE_CPU_DISPATCH "Compile CPU kernels for multiple ISA and dispatch at runtime" ON)
option(ENABLE_PROFILING "Compile with profiling support" OFF)
Expand Down Expand Up @@ -491,6 +492,9 @@ ELSEIF (ENABLE_ADDRESS_SANITIZER)
ENDIF ()

if (WITH_CUDA)
if(WITH_HIP)
message(FATAL_ERROR "WITH_CUDA=ON incompatible with WITH_HIP=ON")
endif()
find_package(CUDA 11.0 REQUIRED)
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
if (WITH_TENSOR_PARALLEL)
Expand Down Expand Up @@ -679,6 +683,94 @@ if (WITH_CUDA)
)


elseif(WITH_HIP)
  # HIP (AMD ROCm) backend: the sources listed in CUDA_SOURCES are compiled as
  # HIP and the library is built with both CT2_WITH_CUDA and CT2_USE_HIP
  # defined.

  # Reject unsupported option combinations first, so a configuration mistake is
  # reported before the HIP toolchain and the ROCm packages are probed.
  if(WITH_TENSOR_PARALLEL)
    message(FATAL_ERROR "WITH_HIP=ON incompatible with WITH_TENSOR_PARALLEL=ON")
  endif()
  if(WITH_FLASH_ATTN)
    message(FATAL_ERROR "WITH_HIP=ON incompatible with WITH_FLASH_ATTN=ON")
  endif()

  enable_language(HIP)
  set(CMAKE_CXX_STANDARD_REQUIRED ON)
  message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
  message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")

  # Locate the ROCm installation: the ROCM_PATH environment variable wins,
  # otherwise the default /opt/rocm prefix is used. An empty environment value
  # is treated as unset so it cannot produce root-relative paths below.
  if(DEFINED ENV{ROCM_PATH} AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
    set(ROCM_PATH "$ENV{ROCM_PATH}")
  else()
    set(ROCM_PATH /opt/rocm)
  endif()
  list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")

  find_package(hiprand REQUIRED)
  find_package(hipblas REQUIRED)
  find_package(rocprim REQUIRED)
  find_package(rocthrust REQUIRED)
  find_package(hipcub REQUIRED)

  # The AWQ ops are left out of the HIP build: drop both their host sources
  # and their GPU kernels.
  list(REMOVE_ITEM SOURCES
    src/ops/awq/dequantize.cc
    src/ops/awq/dequantize_cpu.cc
    src/ops/awq/gemm.cc
    src/ops/awq/gemm_cpu.cc
    src/ops/awq/gemv.cc
    src/ops/awq/gemv_cpu.cc
  )
  list(REMOVE_ITEM CUDA_SOURCES
    src/ops/awq/gemm_gpu.cu
    src/ops/awq/gemv_gpu.cu
    src/ops/awq/dequantize_gpu.cu
  )

  # Compile the GPU sources with the HIP compiler, and explicitly mark the
  # CPU-only translation units below as plain C++.
  set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP)
  set_source_files_properties(
    src/cpu/allocator.cc
    src/cpu/backend.cc
    src/cpu/cpu_info.cc
    src/cpu/cpu_isa.cc
    src/cpu/kernels.cc
    src/cpu/parallel.cc
    src/cpu/primitives.cc
    src/ops/alibi_add_cpu.cc
    src/ops/bias_add_cpu.cc
    src/ops/concat_split_slide_cpu.cc
    src/ops/conv1d_cpu.cc
    src/ops/dequantize_cpu.cc
    src/ops/gather_cpu.cc
    src/ops/gumbel_max_cpu.cc
    src/ops/layer_norm_cpu.cc
    src/ops/mean_cpu.cc
    src/ops/median_filter_cpu.cc
    src/ops/multinomial_cpu.cc
    src/ops/quantize_cpu.cc
    src/ops/rms_norm_cpu.cc
    src/ops/rotary_cpu.cc
    src/ops/softmax_cpu.cc
    src/ops/tile_cpu.cc
    src/ops/topk_cpu.cc
    src/ops/topp_mask_cpu.cc
    src/ops/nccl_ops_cpu.cc
    PROPERTIES LANGUAGE CXX
  )

  # NOTE(review): kept at directory scope as in the original; the imported
  # targets linked below should already carry full library paths, so this may
  # be removable once that is confirmed on all platforms.
  link_directories("${ROCM_PATH}/lib")

  # Directory-scoped on purpose: CT2_USE_HIP is also tested by the public
  # header include/ctranslate2/ops/ops.h, so targets other than the library
  # may need to see it (TODO confirm which ones do).
  add_definitions(-DCT2_WITH_CUDA)
  add_definitions(-DCT2_USE_HIP)

  add_library(${PROJECT_NAME}
    SHARED
    ${SOURCES}
    ${CUDA_SOURCES}
  )

  add_compile_definitions(
    __HIP_PLATFORM_AMD__
    __HIP_PLATFORM_HCC__
  )

  # PROJECT_SOURCE_DIR (rather than CMAKE_SOURCE_DIR) keeps these paths valid
  # when the project is added as a subdirectory of another build.
  target_include_directories(${PROJECT_NAME} PRIVATE
    "${PROJECT_SOURCE_DIR}"
    "${PROJECT_SOURCE_DIR}/include"
    "${ROCM_PATH}/include"
  )

  # Link every ROCm dependency through the imported target exported by its
  # package, so include paths and transitive dependencies come with it.
  target_link_libraries(${PROJECT_NAME} PRIVATE
    hip::hiprand
    roc::hipblas
    roc::rocprim
    roc::rocthrust
    hip::hipcub
  )

  # The final link is driven by the C++ compiler rather than the HIP one.
  set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX)


elseif(WITH_CUDNN)
message(FATAL_ERROR "WITH_CUDNN=ON requires WITH_CUDA=ON")
else()
Expand Down
110 changes: 110 additions & 0 deletions docker/Dockerfile_rocm
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Builder stage: installs the ROCm development packages and CPU dependencies,
# compiles CTranslate2 with the HIP backend, and builds the Python wheel.
FROM rocm/dev-ubuntu-22.04:7.2 AS builder

RUN apt-get update && \
apt-get install -y --no-install-recommends \
rocm-hip-runtime-dev \
hipblas-common-dev \
hipblas-dev \
hipcub-dev \
hiprand-dev \
rocprim-dev \
rocrand-dev \
rocthrust-dev \
python3-dev \
python3-pip \
wget \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

WORKDIR /root

# Intel oneAPI MKL, installed from Intel's apt repository at a pinned version.
ENV ONEAPI_VERSION=2025.3
RUN wget -q https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \
apt-key add *.PUB && \
rm *.PUB && \
echo "deb https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/oneAPI.list && \
apt-get update && \
apt-get install -y --no-install-recommends \
intel-oneapi-mkl-devel-$ONEAPI_VERSION \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# CMake is installed from PyPI, pinned to the 3.22 series.
RUN python3 -m pip --no-cache-dir install cmake==3.22.*

# oneDNN built from source as a static library, restricted to the inference
# workload and the CONVOLUTION and REORDER primitives.
ENV ONEDNN_VERSION=3.10.2
RUN wget -q https://github.com/oneapi-src/oneDNN/archive/refs/tags/v${ONEDNN_VERSION}.tar.gz && \
tar xf *.tar.gz && \
rm *.tar.gz && \
cd oneDNN-* && \
cmake -DCMAKE_BUILD_TYPE=Release -DONEDNN_LIBRARY_TYPE=STATIC -DONEDNN_BUILD_EXAMPLES=OFF -DONEDNN_BUILD_TESTS=OFF -DONEDNN_ENABLE_WORKLOAD=INFERENCE -DONEDNN_ENABLE_PRIMITIVE="CONVOLUTION;REORDER" -DONEDNN_BUILD_GRAPH=OFF . && \
make -j$(nproc) install && \
cd .. && \
rm -r oneDNN-*

# Open MPI built and installed from source.
# NOTE(review): CMakeLists.txt rejects WITH_TENSOR_PARALLEL together with
# WITH_HIP -- confirm whether Open MPI is still needed in this image.
ENV OPENMPI_VERSION=4.1.6
RUN wget -q https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.bz2 && \
tar xf *.tar.bz2 && \
rm *.tar.bz2 && \
cd openmpi-* && \
./configure && \
make -j$(nproc) install && \
cd .. && \
rm -r openmpi-*

# Copy only the parts of the repository needed to build the library and wheel.
COPY third_party third_party
COPY cli cli
COPY include include
COPY src src
COPY cmake cmake
COPY python python
COPY CMakeLists.txt .

# Compiler flags and target GPU architectures can be overridden with
# --build-arg; the defaults below apply when an argument is not supplied.
ARG CXX_FLAGS
ENV CXX_FLAGS=${CXX_FLAGS:-"-msse4.1 -O3 -Wno-deprecated-literal-operator"}
ARG HIP_FLAGS
ENV HIP_FLAGS=${HIP_FLAGS:-"-O3 -Wno-deprecated-literal-operator"}
ARG HIP_ARCHITECTURES
ENV HIP_ARCHITECTURES=${HIP_ARCHITECTURES:-"gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"}
ENV CTRANSLATE2_ROOT=/opt/ctranslate2
ENV LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}

# Configure with the HIP, MKL and oneDNN backends using the ROCm clang
# compilers, then build and install into CTRANSLATE2_ROOT. The architecture
# list is passed as both CMAKE_HIP_ARCHITECTURES and GPU_TARGETS.
RUN mkdir build_tmp && \
cd build_tmp && \
cmake -DCMAKE_INSTALL_PREFIX=${CTRANSLATE2_ROOT} -DCMAKE_C_COMPILER=amdclang -DCMAKE_CXX_COMPILER=amdclang++ \
-DWITH_HIP=ON -DWITH_MKL=ON -DWITH_DNNL=ON -DOPENMP_RUNTIME=COMP \
-DCMAKE_HIP_ARCHITECTURES="${HIP_ARCHITECTURES}" \
-DGPU_TARGETS="${HIP_ARCHITECTURES}" -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DCMAKE_HIP_FLAGS="${HIP_FLAGS}" \
.. && \
VERBOSE=1 make -j$(nproc) install

ENV LANG=en_US.UTF-8
COPY README.md .

# Build the Python wheel and place it in CTRANSLATE2_ROOT so the runtime stage
# receives it together with the installed library.
RUN cd python && \
python3 -m pip --no-cache-dir install -r install_requirements.txt && \
python3 setup.py bdist_wheel --dist-dir $CTRANSLATE2_ROOT

# Runtime stage: starts again from the ROCm base image, installs runtime
# packages, and takes the CTranslate2 install tree and wheel from the builder.
FROM rocm/dev-ubuntu-22.04:7.2

RUN apt-get update && \
apt-get install -y --no-install-recommends \
rocm-hip-libraries \
openmpi-bin \
libgomp1 \
python3-pip \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Put the CTranslate2 libraries and the LLVM libraries shipped under the ROCm
# prefix on the dynamic loader search path.
ENV CTRANSLATE2_ROOT=/opt/ctranslate2
ENV ROCM_ROOT=/opt/rocm
ENV LD_LIBRARY_PATH=$CTRANSLATE2_ROOT/lib:$ROCM_ROOT/lib/llvm/lib:$LD_LIBRARY_PATH

# Copy the install prefix (which also holds the wheel) from the builder, install
# the wheel, then delete the wheel file from the image.
COPY --from=builder $CTRANSLATE2_ROOT $CTRANSLATE2_ROOT
RUN python3 -m pip --no-cache-dir install $CTRANSLATE2_ROOT/*.whl && \
rm $CTRANSLATE2_ROOT/*.whl

# The container runs the ct2-translator command-line binary by default.
ENTRYPOINT ["/opt/ctranslate2/bin/ct2-translator"]
7 changes: 6 additions & 1 deletion docker/build_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ cd $ROOT_DIR

VERSION=${1:-latest}
PUSH=${2:-0}
GPU=${3:-cuda}
IMAGE=ghcr.io/opennmt/ctranslate2

build()
Expand All @@ -42,4 +43,8 @@ build()
fi
}

build Dockerfile ubuntu22.04-cuda12.2
# Pick the Dockerfile and image tag suffix for the requested GPU backend:
# "rocm" selects the ROCm image, any other value builds the CUDA image.
case "$GPU" in
    rocm)
        build Dockerfile_rocm ubuntu22.04-rocm7.2
        ;;
    *)
        build Dockerfile ubuntu22.04-cuda12.8
        ;;
esac
2 changes: 2 additions & 0 deletions include/ctranslate2/ops/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
#include "slide.h"
#include "nccl_ops.h"
#include "flash_attention.h"
// The AWQ ops are excluded from HIP builds (their sources are removed from the
// build when WITH_HIP is enabled), so their headers are only pulled in when
// CT2_USE_HIP is not defined.
#ifndef CT2_USE_HIP
#include "awq/gemm.h"
#include "awq/gemv.h"
#include "awq/dequantize_awq.h"
#endif
#include "sum.h"
2 changes: 2 additions & 0 deletions python/ctranslate2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
add_dll_directory = getattr(os, "add_dll_directory", None)
if add_dll_directory is not None:
    add_dll_directory(package_dir)
    # ROCm builds look for their runtime DLLs in the "bin" folder of sibling
    # packages installed next to this one. Those packages may be missing
    # (presumably on non-ROCm installs), and os.add_dll_directory raises when
    # given a directory that does not exist, so only the directories that are
    # actually present are registered. The path is normalised so the ".."
    # component is resolved before it is handed to the OS.
    for rocm_package in ("_rocm_sdk_core", "_rocm_sdk_libraries_custom"):
        rocm_bin_dir = os.path.normpath(
            os.path.join(package_dir, os.pardir, rocm_package, "bin")
        )
        if os.path.isdir(rocm_bin_dir):
            add_dll_directory(rocm_bin_dir)

for library in glob.glob(os.path.join(package_dir, "*.dll")):
ctypes.CDLL(library)
Expand Down
Loading