From 1a1ddff3840c0dd7c8d22302052ed84348d440e4 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Fri, 12 Jun 2026 06:06:27 +0000 Subject: [PATCH 1/2] [ROCm] Add AMD GPU support via HIP Build the GPU solver for AMD GPUs on ROCm in addition to NVIDIA CUDA, from a single source tree. A new -DUSE_HIP=ON CMake option compiles the .cu kernels with the HIP language (parallel to the existing -DBUILD_CUDA=ON), selects the target architecture via -DCMAKE_HIP_ARCHITECTURES, and links hipBLAS/hipSPARSE (rocBLAS/rocSPARSE backends). Review order: start with cupdlp/cuda/cuda_to_hip.h, a compatibility shim that aliases the CUDA runtime, cuBLAS, and cuSPARSE symbols used by the project to their HIP equivalents under __HIP_PLATFORM_AMD__, and is a no-op on NVIDIA. Then FindHIPConf.cmake (ROCm toolchain/library discovery) and the CMakeLists wiring. Finally cupdlp_cuda_kernels.cu, where the reduction kernels are made warp-size-independent so the same code runs correctly on wave64 (CDNA, gfx90a) and wave32 (RDNA, gfx1100/gfx1201). The Windows shared-library build needs the HIP runtime linked into the wrapper targets and the kernel symbols exported. The CUDA build path is unchanged: when USE_HIP is OFF the HIP code is not compiled and no ROCm dependency is introduced. Test Plan: Linux (gfx90a, gfx1100), from a build dir with HiGHS installed: export HIGHS_HOME=$PWD/../install cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_BUILD_TYPE=Release cmake --build . --target plc --target testcudalin --target testcublas LD_LIBRARY_PATH=$HIGHS_HOME/lib:$PWD/lib ./bin/testcudalin LD_LIBRARY_PATH=$HIGHS_HOME/lib:$PWD/lib ./bin/testcublas LD_LIBRARY_PATH=$HIGHS_HOME/lib:$PWD/lib ./bin/plc -fname ../example/afiro.mps -nIterLim 5000 Validated on AMD Instinct MI250X (gfx90a, wave64), AMD Radeon Pro W7800 (gfx1100, wave32), and AMD Radeon RX 9070 XT (gfx1201, wave32, Windows). This work was authored with assistance from Claude (Anthropic). --- CMakeLists.txt | 9 +- FindHIPConf.cmake | 69 ++++++++ README.md | 14 ++ apps/CMakeLists.txt | 9 +- cupdlp/CMakeLists.txt | 13 +- cupdlp/cuda/CMakeLists.txt | 79 +++++++--- cupdlp/cuda/cuda_to_hip.h | 111 +++++++++++++ cupdlp/cuda/cupdlp_cuda_kernels.cu | 233 ++++++++++++++++++++++------ cupdlp/cuda/cupdlp_cuda_kernels.cuh | 4 + cupdlp/cuda/cupdlp_cudalinalg.cu | 8 +- cupdlp/cuda/cupdlp_cudalinalg.cuh | 6 +- interface/CMakeLists.txt | 16 +- 12 files changed, 484 insertions(+), 87 deletions(-) create mode 100644 FindHIPConf.cmake create mode 100644 cupdlp/cuda/cuda_to_hip.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 543c89d..f354886 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.24) project(CUPDLP VERSION 0.0.1) option(BUILD_CUDA "" OFF) +option(USE_HIP "Build with HIP for AMD GPUs" OFF) option(BUILD_APPS "" OFF) option(BUILD_PYTHON "" OFF) message(NOTICE "----------------------- cuPDLP-C ------------------------") @@ -25,8 +26,14 @@ message("reset release flags: ${CMAKE_C_FLAGS_RELEASE}") message(NOTICE "--------------------- cuPDLP CPU/GPU CONFIG -----------------------") list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) message(NOTICE "-- Sets build with CUDA ${BUILD_CUDA}") -if (${BUILD_CUDA} STREQUAL "ON") +message(NOTICE "-- Sets build with HIP ${USE_HIP}") +if (${USE_HIP} STREQUAL "ON") + include(FindHIPConf.cmake) + set(CUDA_LIBRARY-NOTFOUND false) + set(GPU_LIBRARY ${HIP_LIBRARY}) +elseif (${BUILD_CUDA} STREQUAL "ON") include(FindCUDAConf.cmake) + set(GPU_LIBRARY ${CUDA_LIBRARY}) else () set(CUDA_LIBRARY-NOTFOUND true) endif () diff --git a/FindHIPConf.cmake b/FindHIPConf.cmake new file mode 100644 index 0000000..d8023c8 --- /dev/null +++ b/FindHIPConf.cmake @@ -0,0 +1,69 @@ +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# Author: Jeff Daily +# +# Locate the ROCm/HIP toolchain and the hipBLAS/hipSPARSE (rocBLAS/rocSPARSE) +# libraries used by the AMD GPU build. + +message(NOTICE "Finding HIP/ROCm environment") +message(NOTICE " - ROCM_PATH: $ENV{ROCM_PATH}") + +# Default ROCM_PATH if not set +if(NOT DEFINED ENV{ROCM_PATH}) + set(ENV{ROCM_PATH} "/opt/rocm") +endif() + +set(ROCM_PATH $ENV{ROCM_PATH}) + +# Enable HIP language +enable_language(HIP) + +# Set default architectures if not specified +if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "") + set(CMAKE_HIP_ARCHITECTURES "gfx90a") +endif() +message(NOTICE " - CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") + +# Find hipBLAS +find_library(HIP_LIBRARY_BLAS + NAMES hipblas + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find hipSPARSE +find_library(HIP_LIBRARY_SPARSE + NAMES hipsparse + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find amdhip64 runtime +find_library(HIP_LIBRARY_RT + NAMES amdhip64 + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find rocBLAS (hipBLAS backend) +find_library(HIP_LIBRARY_ROCBLAS + NAMES rocblas + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find rocSPARSE (hipSPARSE backend) +find_library(HIP_LIBRARY_ROCSPARSE + NAMES rocsparse + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +set(HIP_LIBRARY ${HIP_LIBRARY_RT} ${HIP_LIBRARY_BLAS} ${HIP_LIBRARY_SPARSE} ${HIP_LIBRARY_ROCBLAS} ${HIP_LIBRARY_ROCSPARSE}) +message(NOTICE " - HIP Libraries: ${HIP_LIBRARY}") + +# Set include directories +set(HIP_INCLUDE_DIRS + "${ROCM_PATH}/include" + "${ROCM_PATH}/include/hipblas" + "${ROCM_PATH}/include/hipsparse" +) diff --git a/README.md b/README.md index 6946c67..7bff816 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,20 @@ cmake -DBUILD_CUDA=ON \ -DCMAKE_CUDA_FLAGS_RELEASE="-O2 -DNDEBUG" .. ``` +### Building for AMD GPUs (ROCm/HIP) + +The GPU solver also runs on AMD GPUs through ROCm. Install ROCm (which provides hipBLAS, hipSPARSE, rocBLAS, and rocSPARSE) and HiGHS as above, then configure with `-DUSE_HIP=ON` in place of `-DBUILD_CUDA=ON` and select the target GPU architecture with `-DCMAKE_HIP_ARCHITECTURES`: + +```shell +export HIGHS_HOME=/path-to-highs +mkdir build +cd build +cmake -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_BUILD_TYPE=Release .. +cmake --build . --target plc +``` + +Set `CMAKE_HIP_ARCHITECTURES` to match your GPU, for example `gfx90a` (MI200 series), `gfx1100` (RDNA3), or `gfx1201` (RDNA4). `ROCM_PATH` defaults to `/opt/rocm`; set it if ROCm is installed elsewhere. The same source builds for NVIDIA (`-DBUILD_CUDA=ON`) and AMD (`-DUSE_HIP=ON`) GPUs. + ## Alternative Interfaces ### The Python Interface If you wish to use the Python interface, use the following steps: diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 33b863c..f1bae97 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -20,10 +20,11 @@ set_target_properties(onlinelp ) if (${CUDA_LIBRARY-NOTFOUND}) target_link_libraries(onlinelp PUBLIC cupdlp m) + target_compile_definitions(onlinelp PUBLIC -DCUPDLP_CPU=1) +elseif (${USE_HIP} STREQUAL "ON") + target_link_libraries(onlinelp PUBLIC cupdlp ${HIP_LIBRARY} m) + target_compile_definitions(onlinelp PUBLIC USE_HIP) + target_include_directories(onlinelp PUBLIC ${HIP_INCLUDE_DIRS}) else () - target_compile_definitions(wrapper_clp - PUBLIC - -DCUPDLP_CPU=1 - ) target_link_libraries(onlinelp PUBLIC cupdlp ${CUDA_LIBRARY} m) endif () \ No newline at end of file diff --git a/cupdlp/CMakeLists.txt b/cupdlp/CMakeLists.txt index 725f9c8..8207ecd 100644 --- a/cupdlp/CMakeLists.txt +++ b/cupdlp/CMakeLists.txt @@ -12,6 +12,7 @@ add_library(cupdlp SHARED ${CUPDLP_INCLUDE_HEADERS} ${CUPDLP_SOURCE_FILES} ) +set_target_properties(cupdlp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_compile_definitions(cupdlp PUBLIC # If the debug configuration pass the DEBUG define to the compiler @@ -24,11 +25,17 @@ if (${CUDA_LIBRARY-NOTFOUND}) PUBLIC -DCUPDLP_CPU=1 ) - target_link_libraries(cupdlp m) + target_link_libraries(cupdlp $<$>:m>) +elseif (${USE_HIP} STREQUAL "ON") + add_subdirectory(cuda) + message(NOTICE "- GPU version PDLP (HIP/ROCm)") + target_include_directories(cupdlp PUBLIC ${HIP_INCLUDE_DIRS}) + target_link_libraries(cupdlp PRIVATE cudalin ${HIP_LIBRARY} $<$>:m>) + set_target_properties(cupdlp PROPERTIES HIP_SEPARABLE_COMPILATION ON) else() add_subdirectory(cuda) - message(NOTICE "- GPU version PDLP") + message(NOTICE "- GPU version PDLP (CUDA)") target_include_directories(cupdlp PUBLIC "/usr/local/cuda/include") - target_link_libraries(cupdlp PRIVATE cudalin ${CUDA_LIBRARY} m) + target_link_libraries(cupdlp PRIVATE cudalin ${CUDA_LIBRARY} $<$>:m>) set_target_properties(cupdlp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif () diff --git a/cupdlp/cuda/CMakeLists.txt b/cupdlp/cuda/CMakeLists.txt index 3bdcc28..9408683 100644 --- a/cupdlp/cuda/CMakeLists.txt +++ b/cupdlp/cuda/CMakeLists.txt @@ -1,29 +1,64 @@ -enable_language(CXX CUDA) - -add_library(cudalin SHARED - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu +set(CUDA_SOURCES + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu ) -set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_include_directories(cudalin PUBLIC "/usr/local/cuda/include") -target_compile_definitions(cudalin - PUBLIC - # If the debug configuration pass the DEBUG define to the compiler - $<$:-DCUPDLP_DEBUG=1> + +set(CUDA_HEADERS + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh + ${CUPDLP_INCLUDE_DIR}/cuda/cuda_to_hip.h ) -target_link_libraries(cudalin ${CUDA_LIBRARY} m) +if (${USE_HIP} STREQUAL "ON") + # HIP/ROCm build + enable_language(HIP) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP) -# add a test -add_executable(testcudalin test_cuda_linalg.c) -add_executable(testcublas test_cublas.c) + add_library(cudalin SHARED + ${CUDA_SOURCES} + ${CUDA_HEADERS} + ) + target_compile_definitions(cudalin PUBLIC USE_HIP) + set_target_properties(cudalin PROPERTIES + HIP_SEPARABLE_COMPILATION ON + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + WINDOWS_EXPORT_ALL_SYMBOLS ON + ) + target_include_directories(cudalin PUBLIC ${HIP_INCLUDE_DIRS}) + target_compile_definitions(cudalin + PUBLIC + $<$:-DCUPDLP_DEBUG=1> + ) + target_link_libraries(cudalin ${HIP_LIBRARY} $<$>:m>) +else() + # CUDA build + enable_language(CXX CUDA) -set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -#target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda) -target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY}) + add_library(cudalin SHARED + ${CUDA_SOURCES} + ${CUDA_HEADERS} + ) + set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_include_directories(cudalin PUBLIC "/usr/local/cuda/include") + target_compile_definitions(cudalin + PUBLIC + $<$:-DCUPDLP_DEBUG=1> + ) + target_link_libraries(cudalin ${CUDA_LIBRARY} $<$>:m>) +endif() -set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY}) +# add tests +add_executable(testcudalin test_cuda_linalg.c) +add_executable(testcublas test_cublas.c) +if (${USE_HIP} STREQUAL "ON") + set_target_properties(testcudalin PROPERTIES HIP_SEPARABLE_COMPILATION ON) + target_link_libraries(testcudalin PRIVATE cudalin ${HIP_LIBRARY}) + set_target_properties(testcublas PROPERTIES HIP_SEPARABLE_COMPILATION ON) + target_link_libraries(testcublas PRIVATE cudalin ${HIP_LIBRARY}) +else() + set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY}) + set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY}) +endif() diff --git a/cupdlp/cuda/cuda_to_hip.h b/cupdlp/cuda/cuda_to_hip.h new file mode 100644 index 0000000..3f2e12a --- /dev/null +++ b/cupdlp/cuda/cuda_to_hip.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * Author: Jeff Daily + * + * CUDA-to-HIP compatibility header for cuPDLP-C + * + * On AMD (USE_HIP / __HIP_PLATFORM_AMD__): aliases CUDA symbols to HIP equivalents. + * On NVIDIA: no-op include of CUDA headers. + */ +#pragma once + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + +// Ensure AMD platform is defined before including HIP headers +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) +#define __HIP_PLATFORM_AMD__ +#endif + +#include +#include +#include + +// Runtime API +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemset hipMemset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaRuntimeGetVersion hipRuntimeGetVersion +#define cudaDriverGetVersion hipDriverGetVersion +#define cudaDeviceReset hipDeviceReset + +// Memory copy kinds +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDefault hipMemcpyDefault + +// Error types +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess + +// Device properties +#define cudaDeviceProp hipDeviceProp_t +#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize + +// cuBLAS -> hipBLAS +#define cublasHandle_t hipblasHandle_t +#define cublasStatus_t hipblasStatus_t +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasDaxpy hipblasDaxpy +#define cublasSaxpy hipblasSaxpy +#define cublasDdot hipblasDdot +#define cublasSdot hipblasSdot +#define cublasDnrm2 hipblasDnrm2 +#define cublasSnrm2 hipblasSnrm2 +#define cublasDscal hipblasDscal +#define cublasSscal hipblasSscal +#define cublasGetStatusString hipblasStatusToString + +// cuSPARSE -> hipSPARSE +#define cusparseHandle_t hipsparseHandle_t +#define cusparseStatus_t hipsparseStatus_t +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define cusparseCreate hipsparseCreate +#define cusparseDestroy hipsparseDestroy +#define cusparseGetVersion hipsparseGetVersion +#define cusparseGetErrorString hipsparseGetErrorString + +// Sparse matrix/vector descriptors +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseDnVecDescr_t hipsparseDnVecDescr_t +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseCreateCsc hipsparseCreateCsc +#define cusparseCreateDnVec hipsparseCreateDnVec +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDestroyDnVec hipsparseDestroyDnVec + +// SpMV operations +#define cusparseSpMV hipsparseSpMV +#define cusparseSpMV_bufferSize hipsparseSpMV_bufferSize +#define cusparseSpMVAlg_t hipsparseSpMVAlg_t +#define cusparseOperation_t hipsparseOperation_t +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_SPMV_CSR_ALG2 HIPSPARSE_SPMV_CSR_ALG2 + +// Compute type +#define CUDA_R_64F HIP_R_64F +#define CUDA_R_32F HIP_R_32F + +// Index base +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I + +#else // NVIDIA CUDA + +#include +#include +#include + +#endif // USE_HIP diff --git a/cupdlp/cuda/cupdlp_cuda_kernels.cu b/cupdlp/cuda/cupdlp_cuda_kernels.cu index 58580b3..cae21ad 100644 --- a/cupdlp/cuda/cupdlp_cuda_kernels.cu +++ b/cupdlp/cuda/cupdlp_cuda_kernels.cu @@ -1,4 +1,11 @@ +/* + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * Author: Jeff Daily + * + * AMD GPU (HIP/ROCm) support for the cuPDLP-C CUDA kernels. + */ #include "cupdlp_cuda_kernels.cuh" +#include "cuda_to_hip.h" @@ -199,7 +206,132 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x, */ -#define QUARTER_WARP_REDUCE_2(val1, val2) { \ +// Warp size constant for device code +// CDNA (gfx90a, gfx94x): 64-lane wavefront +// RDNA (gfx10xx, gfx11xx): 32-lane wavefront +// CUDA: always 32 +#if defined(__HIP_PLATFORM_AMD__) + #if defined(__GFX9__) + #define CUPDLP_WARP_SIZE 64 + #else + #define CUPDLP_WARP_SIZE 32 + #endif +#else + #define CUPDLP_WARP_SIZE 32 +#endif + +// Warp reduction macros: arch-unified for wave32 (CUDA/RDNA) and wave64 (CDNA) +// HIP uses __shfl_down (no sync needed, wavefronts execute in lockstep) +// CUDA uses __shfl_down_sync with a 32-bit mask + +#if defined(__HIP_PLATFORM_AMD__) +// HIP: All AMD GPUs use __shfl_down (no sync needed) +// Wave64 (gfx9): reduce over 64 lanes +// Wave32 (RDNA): reduce over 32 lanes (offset>32 is no-op) + +#if defined(__GFX9__) +// Wave64: CDNA (gfx90a, gfx94x) +#define FULL_WARP_REDUCE_2(val1, val2) { \ + val1 += __shfl_down(val1, 32); \ + val2 += __shfl_down(val2, 32); \ + val1 += __shfl_down(val1, 16); \ + val2 += __shfl_down(val2, 16); \ + val1 += __shfl_down(val1, 8); \ + val2 += __shfl_down(val2, 8); \ + val1 += __shfl_down(val1, 4); \ + val2 += __shfl_down(val2, 4); \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +#define FULL_WARP_REDUCE(val) { \ + val += __shfl_down(val, 32); \ + val += __shfl_down(val, 16); \ + val += __shfl_down(val, 8); \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +// 256 threads / 64 lanes = 4 warps; final reduction of 4 elements +#define FINAL_REDUCE_2_256(val1, val2) { \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +// 512 threads / 64 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_512(val) { \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +#define FINAL_REDUCE_256(val) { \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +#else // HIP RDNA (wave32) + +#define FULL_WARP_REDUCE_2(val1, val2) { \ + val1 += __shfl_down(val1, 16); \ + val2 += __shfl_down(val2, 16); \ + val1 += __shfl_down(val1, 8); \ + val2 += __shfl_down(val2, 8); \ + val1 += __shfl_down(val1, 4); \ + val2 += __shfl_down(val2, 4); \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +#define FULL_WARP_REDUCE(val) { \ + val += __shfl_down(val, 16); \ + val += __shfl_down(val, 8); \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_2_256(val1, val2) { \ + val1 += __shfl_down(val1, 4); \ + val2 += __shfl_down(val2, 4); \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +// 512 threads / 32 lanes = 16 warps; final reduction of 16 elements +#define FINAL_REDUCE_512(val) { \ + val += __shfl_down(val, 8); \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_256(val) { \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} +#endif // __GFX9__ + +#else // CUDA + +// CUDA: Wave32, use __shfl_down_sync with 32-bit mask +#define FULL_WARP_REDUCE_2(val1, val2) { \ + val1 += __shfl_down_sync(0xFFFFFFFF, val1, 16); \ + val2 += __shfl_down_sync(0xFFFFFFFF, val2, 16); \ + val1 += __shfl_down_sync(0xFFFFFFFF, val1, 8); \ + val2 += __shfl_down_sync(0xFFFFFFFF, val2, 8); \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 4); \ val2 += __shfl_down_sync(0xFFFFFFFF, val2, 4); \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 2); \ @@ -208,11 +340,16 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x, val2 += __shfl_down_sync(0xFFFFFFFF, val2, 1); \ } -#define FULL_WARP_REDUCE_2(val1, val2) { \ - val1 += __shfl_down_sync(0xFFFFFFFF, val1, 16); \ - val2 += __shfl_down_sync(0xFFFFFFFF, val2, 16); \ - val1 += __shfl_down_sync(0xFFFFFFFF, val1, 8); \ - val2 += __shfl_down_sync(0xFFFFFFFF, val2, 8); \ +#define FULL_WARP_REDUCE(val) { \ + val += __shfl_down_sync(0xFFFFFFFF, val, 16); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_2_256(val1, val2) { \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 4); \ val2 += __shfl_down_sync(0xFFFFFFFF, val2, 4); \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 2); \ @@ -221,14 +358,34 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x, val2 += __shfl_down_sync(0xFFFFFFFF, val2, 1); \ } -// assumes block size = 256, warp size = 32 +// 512 threads / 32 lanes = 16 warps; final reduction of 16 elements +#define FINAL_REDUCE_512(val) { \ + val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_256(val) { \ + val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ +} +#endif // __HIP_PLATFORM_AMD__ + +// Max warps per block: 256/32=8 (wave32) or 512/32=16 (wave32), 256/64=4 or 512/64=8 (wave64) +// Upper bound is 16, use that for static shared memory sizing. +static constexpr int kMaxWarpsPerBlock = 16; + +// assumes block size = 256, warp size = 32 or 64 __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_float * __restrict__ res_y, const cupdlp_float * __restrict__ xUpdate, const cupdlp_float * __restrict__ x, const cupdlp_float * __restrict__ atyUpdate, const cupdlp_float * __restrict__ aty, int nCols) { - __shared__ cupdlp_float shared_x[32]; - __shared__ cupdlp_float shared_y[32]; + __shared__ cupdlp_float shared_x[kMaxWarpsPerBlock]; + __shared__ cupdlp_float shared_y[kMaxWarpsPerBlock]; cupdlp_float val_x = 0.0; cupdlp_float val_y = 0.0; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nCols; i += blockDim.x * gridDim.x) { @@ -238,8 +395,9 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa val_y = cupdlp_fma_rn(day, dx, val_y); } - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; + int lane = threadIdx.x % CUPDLP_WARP_SIZE; + int wid = threadIdx.x / CUPDLP_WARP_SIZE; + int nWarps = blockDim.x / CUPDLP_WARP_SIZE; FULL_WARP_REDUCE_2(val_x, val_y) if (lane == 0) { @@ -249,9 +407,9 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa __syncthreads(); if (wid == 0) { - val_x = (threadIdx.x < blockDim.x / 32) ? shared_x[lane] : 0.0; - val_y = (threadIdx.x < blockDim.x / 32) ? shared_y[lane] : 0.0; - QUARTER_WARP_REDUCE_2(val_x, val_y) + val_x = (lane < nWarps) ? shared_x[lane] : 0.0; + val_y = (lane < nWarps) ? shared_y[lane] : 0.0; + FINAL_REDUCE_2_256(val_x, val_y) if (threadIdx.x == 0) { res_x[blockIdx.x] = val_x; res_y[blockIdx.x] = val_y; @@ -259,41 +417,21 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa } } -#define QUARTER_WARP_REDUCE(val) { \ - val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ -} - -#define HALF_WARP_REDUCE(val) { \ - val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ -} - -#define FULL_WARP_REDUCE(val) { \ - val += __shfl_down_sync(0xFFFFFFFF, val, 16); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ -} - -// assumes: block size = 256, warp size = 32 +// assumes: block size = 256, warp size = 32 or 64 __global__ void movement_2_kernel(cupdlp_float * __restrict__ res, const cupdlp_float * __restrict__ yUpdate, const cupdlp_float * __restrict__ y, int nRows) { - __shared__ cupdlp_float shared[32]; + __shared__ cupdlp_float shared[kMaxWarpsPerBlock]; cupdlp_float val = 0.0; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nRows; i += blockDim.x * gridDim.x) { cupdlp_float d = yUpdate[i] - y[i]; val = cupdlp_fma_rn(d, d, val); } - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; + int lane = threadIdx.x % CUPDLP_WARP_SIZE; + int wid = threadIdx.x / CUPDLP_WARP_SIZE; + int nWarps = blockDim.x / CUPDLP_WARP_SIZE; FULL_WARP_REDUCE(val) if (lane == 0) { @@ -302,25 +440,26 @@ __global__ void movement_2_kernel(cupdlp_float * __restrict__ res, __syncthreads(); if (wid == 0) { - val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0; - QUARTER_WARP_REDUCE(val) + val = (lane < nWarps) ? shared[lane] : 0.0; + FINAL_REDUCE_256(val) if (threadIdx.x == 0) { res[blockIdx.x] = val; } } } -// assumes: block size = 512, warp size = 32 +// assumes: block size = 512, warp size = 32 or 64 __global__ void sum_kernel(cupdlp_float * __restrict__ res, const cupdlp_float * __restrict__ x, int n) { - __shared__ cupdlp_float shared[32]; + __shared__ cupdlp_float shared[kMaxWarpsPerBlock]; cupdlp_float val = 0.0; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { val += x[i]; } - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; + int lane = threadIdx.x % CUPDLP_WARP_SIZE; + int wid = threadIdx.x / CUPDLP_WARP_SIZE; + int nWarps = blockDim.x / CUPDLP_WARP_SIZE; FULL_WARP_REDUCE(val) if (lane == 0) { @@ -329,8 +468,8 @@ __global__ void sum_kernel(cupdlp_float * __restrict__ res, const cupdlp_float * __syncthreads(); if (wid == 0) { - val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0; - HALF_WARP_REDUCE(val) + val = (lane < nWarps) ? shared[lane] : 0.0; + FINAL_REDUCE_512(val) if (threadIdx.x == 0) { res[blockIdx.x] = val; } diff --git a/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/cupdlp/cuda/cupdlp_cuda_kernels.cuh index f04aac0..fe84d38 100644 --- a/cupdlp/cuda/cupdlp_cuda_kernels.cuh +++ b/cupdlp/cuda/cupdlp_cuda_kernels.cuh @@ -3,9 +3,13 @@ #include #include /* EXIT_FAILURE */ +#include "cuda_to_hip.h" + +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include #include #include +#endif //#define CUPDLP_BLOCK_SIZE 512 diff --git a/cupdlp/cuda/cupdlp_cudalinalg.cu b/cupdlp/cuda/cupdlp_cudalinalg.cu index e124065..e7b01ca 100644 --- a/cupdlp/cuda/cupdlp_cudalinalg.cu +++ b/cupdlp/cuda/cupdlp_cudalinalg.cu @@ -1,6 +1,7 @@ #include // printf #include // EXIT_FAILURE +#include "cuda_to_hip.h" #include "cupdlp_cudalinalg.cuh" inline int nBlocks256(int n) { @@ -257,12 +258,7 @@ void cupdlp_movement_interaction_cuda( const cupdlp_float *atyUpdate, const cupdlp_float *aty, int nRows, int nCols) { - int warpSize; - CHECK_CUDA_IGNORE(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, 0)) - if (warpSize != 32) { - printf("warpSize\n"); - exit(1); - } + // Warp size query removed: reduction kernels now support both wave32 and wave64 constexpr int RED_BLOCK_SIZE = 256; constexpr int RED_ELS_PER_THREAD = 4; diff --git a/cupdlp/cuda/cupdlp_cudalinalg.cuh b/cupdlp/cuda/cupdlp_cudalinalg.cuh index f4e4498..3442fcc 100644 --- a/cupdlp/cuda/cupdlp_cudalinalg.cuh +++ b/cupdlp/cuda/cupdlp_cudalinalg.cuh @@ -1,11 +1,13 @@ #ifndef CUPDLP_CUDA_LINALG_H #define CUPDLP_CUDA_LINALG_H +#include "cupdlp_cuda_kernels.cuh" + +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include // cublas #include // cudaMalloc, cudaMemcpy, etc. #include // cusparseSpMV - -#include "cupdlp_cuda_kernels.cuh" +#endif #define PRINT_CUDA_INFO (1) #define PRINT_DETAILED_CUDA_INFO (0) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 664d8a9..69c9c0b 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -5,6 +5,7 @@ enable_language(CXX) add_library(wrapper_lp SHARED mps_lp.h mps_lp.c ) +set_target_properties(wrapper_lp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_link_libraries( wrapper_lp PUBLIC cupdlp ) @@ -14,6 +15,10 @@ if (${CUDA_LIBRARY-NOTFOUND}) PUBLIC -DCUPDLP_CPU=1 ) +elseif (${USE_HIP} STREQUAL "ON") + target_compile_definitions(wrapper_lp PUBLIC USE_HIP) + target_include_directories(wrapper_lp PUBLIC ${HIP_INCLUDE_DIRS}) + target_link_libraries(wrapper_lp PRIVATE ${HIP_LIBRARY}) endif() ############################################# @@ -26,6 +31,7 @@ add_library(wrapper_highs SHARED mps_lp.h mps_lp.c wrapper_highs.cpp ${HiGHS_HEADER_FILES} wrapper_highs.h) +set_target_properties(wrapper_highs PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories( wrapper_highs PUBLIC @@ -41,6 +47,10 @@ if (${CUDA_LIBRARY-NOTFOUND}) PUBLIC -DCUPDLP_CPU=1 ) +elseif (${USE_HIP} STREQUAL "ON") + target_compile_definitions(wrapper_highs PUBLIC USE_HIP) + target_include_directories(wrapper_highs PUBLIC ${HIP_INCLUDE_DIRS}) + target_link_libraries(wrapper_highs PRIVATE ${HIP_LIBRARY}) endif() @@ -63,9 +73,11 @@ target_link_libraries( if (${CUDA_LIBRARY-NOTFOUND}) - target_link_libraries(plc PRIVATE cupdlp m) + target_link_libraries(plc PRIVATE cupdlp $<$>:m>) +elseif (${USE_HIP} STREQUAL "ON") + target_link_libraries(plc PRIVATE cupdlp ${HIP_LIBRARY} $<$>:m>) else () - target_link_libraries(plc PRIVATE cupdlp ${CUDA_LIBRARY} m) + target_link_libraries(plc PRIVATE cupdlp ${CUDA_LIBRARY} $<$>:m>) endif () if (${CUDA_LIBRARY-NOTFOUND}) add_custom_target( From d380bcab9f7bf75bbaa7b14366f6c7902c2a8f91 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Tue, 23 Jun 2026 22:20:57 +0000 Subject: [PATCH 2/2] [ROCm] Rely on HIP arch auto-detect instead of pinning gfx90a The gfx90a pin sat after enable_language(HIP), so its if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) guard was always false and the block was dead -- enable_language(HIP) has already detected the host arch (or errored). Removing it makes intent clear and keeps the build honoring -DCMAKE_HIP_ARCHITECTURES, auto-detecting the host GPU, or erroring on a no-GPU host, rather than risking a silently wrong gfx90a default if file order ever changed. This change was authored with the assistance of the Claude AI assistant. --- FindHIPConf.cmake | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/FindHIPConf.cmake b/FindHIPConf.cmake index d8023c8..06e1e85 100644 --- a/FindHIPConf.cmake +++ b/FindHIPConf.cmake @@ -14,13 +14,9 @@ endif() set(ROCM_PATH $ENV{ROCM_PATH}) -# Enable HIP language +# enable_language(HIP) auto-detects the host GPU arch (and errors on a +# no-GPU build host); pass -DCMAKE_HIP_ARCHITECTURES=... to override. enable_language(HIP) - -# Set default architectures if not specified -if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "") - set(CMAKE_HIP_ARCHITECTURES "gfx90a") -endif() message(NOTICE " - CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") # Find hipBLAS