diff --git a/CMakeLists.txt b/CMakeLists.txt index 543c89d..f354886 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.24) project(CUPDLP VERSION 0.0.1) option(BUILD_CUDA "" OFF) +option(USE_HIP "Build with HIP for AMD GPUs" OFF) option(BUILD_APPS "" OFF) option(BUILD_PYTHON "" OFF) message(NOTICE "----------------------- cuPDLP-C ------------------------") @@ -25,8 +26,14 @@ message("reset release flags: ${CMAKE_C_FLAGS_RELEASE}") message(NOTICE "--------------------- cuPDLP CPU/GPU CONFIG -----------------------") list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) message(NOTICE "-- Sets build with CUDA ${BUILD_CUDA}") -if (${BUILD_CUDA} STREQUAL "ON") +message(NOTICE "-- Sets build with HIP ${USE_HIP}") +if (${USE_HIP} STREQUAL "ON") + include(FindHIPConf.cmake) + set(CUDA_LIBRARY-NOTFOUND false) + set(GPU_LIBRARY ${HIP_LIBRARY}) +elseif (${BUILD_CUDA} STREQUAL "ON") include(FindCUDAConf.cmake) + set(GPU_LIBRARY ${CUDA_LIBRARY}) else () set(CUDA_LIBRARY-NOTFOUND true) endif () diff --git a/FindHIPConf.cmake b/FindHIPConf.cmake new file mode 100644 index 0000000..06e1e85 --- /dev/null +++ b/FindHIPConf.cmake @@ -0,0 +1,65 @@ +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# Author: Jeff Daily +# +# Locate the ROCm/HIP toolchain and the hipBLAS/hipSPARSE (rocBLAS/rocSPARSE) +# libraries used by the AMD GPU build. + +message(NOTICE "Finding HIP/ROCm environment") +message(NOTICE " - ROCM_PATH: $ENV{ROCM_PATH}") + +# Default ROCM_PATH if not set +if(NOT DEFINED ENV{ROCM_PATH}) + set(ENV{ROCM_PATH} "/opt/rocm") +endif() + +set(ROCM_PATH $ENV{ROCM_PATH}) + +# enable_language(HIP) auto-detects the host GPU arch (and errors on a +# no-GPU build host); pass -DCMAKE_HIP_ARCHITECTURES=... to override. +enable_language(HIP) +message(NOTICE " - CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") + +# Find hipBLAS +find_library(HIP_LIBRARY_BLAS + NAMES hipblas + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find hipSPARSE +find_library(HIP_LIBRARY_SPARSE + NAMES hipsparse + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find amdhip64 runtime +find_library(HIP_LIBRARY_RT + NAMES amdhip64 + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find rocBLAS (hipBLAS backend) +find_library(HIP_LIBRARY_ROCBLAS + NAMES rocblas + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +# Find rocSPARSE (hipSPARSE backend) +find_library(HIP_LIBRARY_ROCSPARSE + NAMES rocsparse + HINTS "${ROCM_PATH}/lib" + REQUIRED +) + +set(HIP_LIBRARY ${HIP_LIBRARY_RT} ${HIP_LIBRARY_BLAS} ${HIP_LIBRARY_SPARSE} ${HIP_LIBRARY_ROCBLAS} ${HIP_LIBRARY_ROCSPARSE}) +message(NOTICE " - HIP Libraries: ${HIP_LIBRARY}") + +# Set include directories +set(HIP_INCLUDE_DIRS + "${ROCM_PATH}/include" + "${ROCM_PATH}/include/hipblas" + "${ROCM_PATH}/include/hipsparse" +) diff --git a/README.md b/README.md index 6946c67..7bff816 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,20 @@ cmake -DBUILD_CUDA=ON \ -DCMAKE_CUDA_FLAGS_RELEASE="-O2 -DNDEBUG" .. ``` +### Building for AMD GPUs (ROCm/HIP) + +The GPU solver also runs on AMD GPUs through ROCm. Install ROCm (which provides hipBLAS, hipSPARSE, rocBLAS, and rocSPARSE) and HiGHS as above, then configure with `-DUSE_HIP=ON` in place of `-DBUILD_CUDA=ON` and select the target GPU architecture with `-DCMAKE_HIP_ARCHITECTURES`: + +```shell +export HIGHS_HOME=/path-to-highs +mkdir build +cd build +cmake -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_BUILD_TYPE=Release .. +cmake --build . --target plc +``` + +Set `CMAKE_HIP_ARCHITECTURES` to match your GPU, for example `gfx90a` (MI200 series), `gfx1100` (RDNA3), or `gfx1201` (RDNA4). `ROCM_PATH` defaults to `/opt/rocm`; set it if ROCm is installed elsewhere. The same source builds for NVIDIA (`-DBUILD_CUDA=ON`) and AMD (`-DUSE_HIP=ON`) GPUs. + ## Alternative Interfaces ### The Python Interface If you wish to use the Python interface, use the following steps: diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 33b863c..f1bae97 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -20,10 +20,11 @@ set_target_properties(onlinelp ) if (${CUDA_LIBRARY-NOTFOUND}) target_link_libraries(onlinelp PUBLIC cupdlp m) + target_compile_definitions(onlinelp PUBLIC -DCUPDLP_CPU=1) +elseif (${USE_HIP} STREQUAL "ON") + target_link_libraries(onlinelp PUBLIC cupdlp ${HIP_LIBRARY} m) + target_compile_definitions(onlinelp PUBLIC USE_HIP) + target_include_directories(onlinelp PUBLIC ${HIP_INCLUDE_DIRS}) else () - target_compile_definitions(wrapper_clp - PUBLIC - -DCUPDLP_CPU=1 - ) target_link_libraries(onlinelp PUBLIC cupdlp ${CUDA_LIBRARY} m) endif () \ No newline at end of file diff --git a/cupdlp/CMakeLists.txt b/cupdlp/CMakeLists.txt index 725f9c8..8207ecd 100644 --- a/cupdlp/CMakeLists.txt +++ b/cupdlp/CMakeLists.txt @@ -12,6 +12,7 @@ add_library(cupdlp SHARED ${CUPDLP_INCLUDE_HEADERS} ${CUPDLP_SOURCE_FILES} ) +set_target_properties(cupdlp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_compile_definitions(cupdlp PUBLIC # If the debug configuration pass the DEBUG define to the compiler @@ -24,11 +25,17 @@ if (${CUDA_LIBRARY-NOTFOUND}) PUBLIC -DCUPDLP_CPU=1 ) - target_link_libraries(cupdlp m) + target_link_libraries(cupdlp $<$>:m>) +elseif (${USE_HIP} STREQUAL "ON") + add_subdirectory(cuda) + message(NOTICE "- GPU version PDLP (HIP/ROCm)") + target_include_directories(cupdlp PUBLIC ${HIP_INCLUDE_DIRS}) + target_link_libraries(cupdlp PRIVATE cudalin ${HIP_LIBRARY} $<$>:m>) + set_target_properties(cupdlp PROPERTIES HIP_SEPARABLE_COMPILATION ON) else() add_subdirectory(cuda) - message(NOTICE "- GPU version PDLP") + message(NOTICE "- GPU version PDLP (CUDA)") target_include_directories(cupdlp PUBLIC "/usr/local/cuda/include") - target_link_libraries(cupdlp PRIVATE cudalin ${CUDA_LIBRARY} m) + target_link_libraries(cupdlp PRIVATE cudalin ${CUDA_LIBRARY} $<$>:m>) set_target_properties(cupdlp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif () diff --git a/cupdlp/cuda/CMakeLists.txt b/cupdlp/cuda/CMakeLists.txt index 3bdcc28..9408683 100644 --- a/cupdlp/cuda/CMakeLists.txt +++ b/cupdlp/cuda/CMakeLists.txt @@ -1,29 +1,64 @@ -enable_language(CXX CUDA) - -add_library(cudalin SHARED - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh - ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu +set(CUDA_SOURCES + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu ) -set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_include_directories(cudalin PUBLIC "/usr/local/cuda/include") -target_compile_definitions(cudalin - PUBLIC - # If the debug configuration pass the DEBUG define to the compiler - $<$:-DCUPDLP_DEBUG=1> + +set(CUDA_HEADERS + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh + ${CUPDLP_INCLUDE_DIR}/cuda/cuda_to_hip.h ) -target_link_libraries(cudalin ${CUDA_LIBRARY} m) +if (${USE_HIP} STREQUAL "ON") + # HIP/ROCm build + enable_language(HIP) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP) -# add a test -add_executable(testcudalin test_cuda_linalg.c) -add_executable(testcublas test_cublas.c) + add_library(cudalin SHARED + ${CUDA_SOURCES} + ${CUDA_HEADERS} + ) + target_compile_definitions(cudalin PUBLIC USE_HIP) + set_target_properties(cudalin PROPERTIES + HIP_SEPARABLE_COMPILATION ON + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + WINDOWS_EXPORT_ALL_SYMBOLS ON + ) + target_include_directories(cudalin PUBLIC ${HIP_INCLUDE_DIRS}) + target_compile_definitions(cudalin + PUBLIC + $<$:-DCUPDLP_DEBUG=1> + ) + target_link_libraries(cudalin ${HIP_LIBRARY} $<$>:m>) +else() + # CUDA build + enable_language(CXX CUDA) -set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -#target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda) -target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY}) + add_library(cudalin SHARED + ${CUDA_SOURCES} + ${CUDA_HEADERS} + ) + set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_include_directories(cudalin PUBLIC "/usr/local/cuda/include") + target_compile_definitions(cudalin + PUBLIC + $<$:-DCUPDLP_DEBUG=1> + ) + target_link_libraries(cudalin ${CUDA_LIBRARY} $<$>:m>) +endif() -set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY}) +# add tests +add_executable(testcudalin test_cuda_linalg.c) +add_executable(testcublas test_cublas.c) +if (${USE_HIP} STREQUAL "ON") + set_target_properties(testcudalin PROPERTIES HIP_SEPARABLE_COMPILATION ON) + target_link_libraries(testcudalin PRIVATE cudalin ${HIP_LIBRARY}) + set_target_properties(testcublas PROPERTIES HIP_SEPARABLE_COMPILATION ON) + target_link_libraries(testcublas PRIVATE cudalin ${HIP_LIBRARY}) +else() + set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY}) + set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY}) +endif() diff --git a/cupdlp/cuda/cuda_to_hip.h b/cupdlp/cuda/cuda_to_hip.h new file mode 100644 index 0000000..3f2e12a --- /dev/null +++ b/cupdlp/cuda/cuda_to_hip.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * Author: Jeff Daily + * + * CUDA-to-HIP compatibility header for cuPDLP-C + * + * On AMD (USE_HIP / __HIP_PLATFORM_AMD__): aliases CUDA symbols to HIP equivalents. + * On NVIDIA: no-op include of CUDA headers. + */ +#pragma once + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + +// Ensure AMD platform is defined before including HIP headers +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) +#define __HIP_PLATFORM_AMD__ +#endif + +#include +#include +#include + +// Runtime API +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemset hipMemset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaRuntimeGetVersion hipRuntimeGetVersion +#define cudaDriverGetVersion hipDriverGetVersion +#define cudaDeviceReset hipDeviceReset + +// Memory copy kinds +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDefault hipMemcpyDefault + +// Error types +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess + +// Device properties +#define cudaDeviceProp hipDeviceProp_t +#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize + +// cuBLAS -> hipBLAS +#define cublasHandle_t hipblasHandle_t +#define cublasStatus_t hipblasStatus_t +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasDaxpy hipblasDaxpy +#define cublasSaxpy hipblasSaxpy +#define cublasDdot hipblasDdot +#define cublasSdot hipblasSdot +#define cublasDnrm2 hipblasDnrm2 +#define cublasSnrm2 hipblasSnrm2 +#define cublasDscal hipblasDscal +#define cublasSscal hipblasSscal +#define cublasGetStatusString hipblasStatusToString + +// cuSPARSE -> hipSPARSE +#define cusparseHandle_t hipsparseHandle_t +#define cusparseStatus_t hipsparseStatus_t +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define cusparseCreate hipsparseCreate +#define cusparseDestroy hipsparseDestroy +#define cusparseGetVersion hipsparseGetVersion +#define cusparseGetErrorString hipsparseGetErrorString + +// Sparse matrix/vector descriptors +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseDnVecDescr_t hipsparseDnVecDescr_t +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseCreateCsc hipsparseCreateCsc +#define cusparseCreateDnVec hipsparseCreateDnVec +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDestroyDnVec hipsparseDestroyDnVec + +// SpMV operations +#define cusparseSpMV hipsparseSpMV +#define cusparseSpMV_bufferSize hipsparseSpMV_bufferSize +#define cusparseSpMVAlg_t hipsparseSpMVAlg_t +#define cusparseOperation_t hipsparseOperation_t +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_SPMV_CSR_ALG2 HIPSPARSE_SPMV_CSR_ALG2 + +// Compute type +#define CUDA_R_64F HIP_R_64F +#define CUDA_R_32F HIP_R_32F + +// Index base +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I + +#else // NVIDIA CUDA + +#include +#include +#include + +#endif // USE_HIP diff --git a/cupdlp/cuda/cupdlp_cuda_kernels.cu b/cupdlp/cuda/cupdlp_cuda_kernels.cu index 58580b3..cae21ad 100644 --- a/cupdlp/cuda/cupdlp_cuda_kernels.cu +++ b/cupdlp/cuda/cupdlp_cuda_kernels.cu @@ -1,4 +1,11 @@ +/* + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * Author: Jeff Daily + * + * AMD GPU (HIP/ROCm) support for the cuPDLP-C CUDA kernels. + */ #include "cupdlp_cuda_kernels.cuh" +#include "cuda_to_hip.h" @@ -199,7 +206,132 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x, */ -#define QUARTER_WARP_REDUCE_2(val1, val2) { \ +// Warp size constant for device code +// CDNA (gfx90a, gfx94x): 64-lane wavefront +// RDNA (gfx10xx, gfx11xx): 32-lane wavefront +// CUDA: always 32 +#if defined(__HIP_PLATFORM_AMD__) + #if defined(__GFX9__) + #define CUPDLP_WARP_SIZE 64 + #else + #define CUPDLP_WARP_SIZE 32 + #endif +#else + #define CUPDLP_WARP_SIZE 32 +#endif + +// Warp reduction macros: arch-unified for wave32 (CUDA/RDNA) and wave64 (CDNA) +// HIP uses __shfl_down (no sync needed, wavefronts execute in lockstep) +// CUDA uses __shfl_down_sync with a 32-bit mask + +#if defined(__HIP_PLATFORM_AMD__) +// HIP: All AMD GPUs use __shfl_down (no sync needed) +// Wave64 (gfx9): reduce over 64 lanes +// Wave32 (RDNA): reduce over 32 lanes (offset>32 is no-op) + +#if defined(__GFX9__) +// Wave64: CDNA (gfx90a, gfx94x) +#define FULL_WARP_REDUCE_2(val1, val2) { \ + val1 += __shfl_down(val1, 32); \ + val2 += __shfl_down(val2, 32); \ + val1 += __shfl_down(val1, 16); \ + val2 += __shfl_down(val2, 16); \ + val1 += __shfl_down(val1, 8); \ + val2 += __shfl_down(val2, 8); \ + val1 += __shfl_down(val1, 4); \ + val2 += __shfl_down(val2, 4); \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +#define FULL_WARP_REDUCE(val) { \ + val += __shfl_down(val, 32); \ + val += __shfl_down(val, 16); \ + val += __shfl_down(val, 8); \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +// 256 threads / 64 lanes = 4 warps; final reduction of 4 elements +#define FINAL_REDUCE_2_256(val1, val2) { \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +// 512 threads / 64 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_512(val) { \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +#define FINAL_REDUCE_256(val) { \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +#else // HIP RDNA (wave32) + +#define FULL_WARP_REDUCE_2(val1, val2) { \ + val1 += __shfl_down(val1, 16); \ + val2 += __shfl_down(val2, 16); \ + val1 += __shfl_down(val1, 8); \ + val2 += __shfl_down(val2, 8); \ + val1 += __shfl_down(val1, 4); \ + val2 += __shfl_down(val2, 4); \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +#define FULL_WARP_REDUCE(val) { \ + val += __shfl_down(val, 16); \ + val += __shfl_down(val, 8); \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_2_256(val1, val2) { \ + val1 += __shfl_down(val1, 4); \ + val2 += __shfl_down(val2, 4); \ + val1 += __shfl_down(val1, 2); \ + val2 += __shfl_down(val2, 2); \ + val1 += __shfl_down(val1, 1); \ + val2 += __shfl_down(val2, 1); \ +} + +// 512 threads / 32 lanes = 16 warps; final reduction of 16 elements +#define FINAL_REDUCE_512(val) { \ + val += __shfl_down(val, 8); \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_256(val) { \ + val += __shfl_down(val, 4); \ + val += __shfl_down(val, 2); \ + val += __shfl_down(val, 1); \ +} +#endif // __GFX9__ + +#else // CUDA + +// CUDA: Wave32, use __shfl_down_sync with 32-bit mask +#define FULL_WARP_REDUCE_2(val1, val2) { \ + val1 += __shfl_down_sync(0xFFFFFFFF, val1, 16); \ + val2 += __shfl_down_sync(0xFFFFFFFF, val2, 16); \ + val1 += __shfl_down_sync(0xFFFFFFFF, val1, 8); \ + val2 += __shfl_down_sync(0xFFFFFFFF, val2, 8); \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 4); \ val2 += __shfl_down_sync(0xFFFFFFFF, val2, 4); \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 2); \ @@ -208,11 +340,16 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x, val2 += __shfl_down_sync(0xFFFFFFFF, val2, 1); \ } -#define FULL_WARP_REDUCE_2(val1, val2) { \ - val1 += __shfl_down_sync(0xFFFFFFFF, val1, 16); \ - val2 += __shfl_down_sync(0xFFFFFFFF, val2, 16); \ - val1 += __shfl_down_sync(0xFFFFFFFF, val1, 8); \ - val2 += __shfl_down_sync(0xFFFFFFFF, val2, 8); \ +#define FULL_WARP_REDUCE(val) { \ + val += __shfl_down_sync(0xFFFFFFFF, val, 16); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_2_256(val1, val2) { \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 4); \ val2 += __shfl_down_sync(0xFFFFFFFF, val2, 4); \ val1 += __shfl_down_sync(0xFFFFFFFF, val1, 2); \ @@ -221,14 +358,34 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x, val2 += __shfl_down_sync(0xFFFFFFFF, val2, 1); \ } -// assumes block size = 256, warp size = 32 +// 512 threads / 32 lanes = 16 warps; final reduction of 16 elements +#define FINAL_REDUCE_512(val) { \ + val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ +} + +// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements +#define FINAL_REDUCE_256(val) { \ + val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ + val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ +} +#endif // __HIP_PLATFORM_AMD__ + +// Max warps per block: 256/32=8 (wave32) or 512/32=16 (wave32), 256/64=4 or 512/64=8 (wave64) +// Upper bound is 16, use that for static shared memory sizing. +static constexpr int kMaxWarpsPerBlock = 16; + +// assumes block size = 256, warp size = 32 or 64 __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_float * __restrict__ res_y, const cupdlp_float * __restrict__ xUpdate, const cupdlp_float * __restrict__ x, const cupdlp_float * __restrict__ atyUpdate, const cupdlp_float * __restrict__ aty, int nCols) { - __shared__ cupdlp_float shared_x[32]; - __shared__ cupdlp_float shared_y[32]; + __shared__ cupdlp_float shared_x[kMaxWarpsPerBlock]; + __shared__ cupdlp_float shared_y[kMaxWarpsPerBlock]; cupdlp_float val_x = 0.0; cupdlp_float val_y = 0.0; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nCols; i += blockDim.x * gridDim.x) { @@ -238,8 +395,9 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa val_y = cupdlp_fma_rn(day, dx, val_y); } - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; + int lane = threadIdx.x % CUPDLP_WARP_SIZE; + int wid = threadIdx.x / CUPDLP_WARP_SIZE; + int nWarps = blockDim.x / CUPDLP_WARP_SIZE; FULL_WARP_REDUCE_2(val_x, val_y) if (lane == 0) { @@ -249,9 +407,9 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa __syncthreads(); if (wid == 0) { - val_x = (threadIdx.x < blockDim.x / 32) ? shared_x[lane] : 0.0; - val_y = (threadIdx.x < blockDim.x / 32) ? shared_y[lane] : 0.0; - QUARTER_WARP_REDUCE_2(val_x, val_y) + val_x = (lane < nWarps) ? shared_x[lane] : 0.0; + val_y = (lane < nWarps) ? shared_y[lane] : 0.0; + FINAL_REDUCE_2_256(val_x, val_y) if (threadIdx.x == 0) { res_x[blockIdx.x] = val_x; res_y[blockIdx.x] = val_y; @@ -259,41 +417,21 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa } } -#define QUARTER_WARP_REDUCE(val) { \ - val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ -} - -#define HALF_WARP_REDUCE(val) { \ - val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ -} - -#define FULL_WARP_REDUCE(val) { \ - val += __shfl_down_sync(0xFFFFFFFF, val, 16); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ -} - -// assumes: block size = 256, warp size = 32 +// assumes: block size = 256, warp size = 32 or 64 __global__ void movement_2_kernel(cupdlp_float * __restrict__ res, const cupdlp_float * __restrict__ yUpdate, const cupdlp_float * __restrict__ y, int nRows) { - __shared__ cupdlp_float shared[32]; + __shared__ cupdlp_float shared[kMaxWarpsPerBlock]; cupdlp_float val = 0.0; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nRows; i += blockDim.x * gridDim.x) { cupdlp_float d = yUpdate[i] - y[i]; val = cupdlp_fma_rn(d, d, val); } - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; + int lane = threadIdx.x % CUPDLP_WARP_SIZE; + int wid = threadIdx.x / CUPDLP_WARP_SIZE; + int nWarps = blockDim.x / CUPDLP_WARP_SIZE; FULL_WARP_REDUCE(val) if (lane == 0) { @@ -302,25 +440,26 @@ __global__ void movement_2_kernel(cupdlp_float * __restrict__ res, __syncthreads(); if (wid == 0) { - val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0; - QUARTER_WARP_REDUCE(val) + val = (lane < nWarps) ? shared[lane] : 0.0; + FINAL_REDUCE_256(val) if (threadIdx.x == 0) { res[blockIdx.x] = val; } } } -// assumes: block size = 512, warp size = 32 +// assumes: block size = 512, warp size = 32 or 64 __global__ void sum_kernel(cupdlp_float * __restrict__ res, const cupdlp_float * __restrict__ x, int n) { - __shared__ cupdlp_float shared[32]; + __shared__ cupdlp_float shared[kMaxWarpsPerBlock]; cupdlp_float val = 0.0; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { val += x[i]; } - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; + int lane = threadIdx.x % CUPDLP_WARP_SIZE; + int wid = threadIdx.x / CUPDLP_WARP_SIZE; + int nWarps = blockDim.x / CUPDLP_WARP_SIZE; FULL_WARP_REDUCE(val) if (lane == 0) { @@ -329,8 +468,8 @@ __global__ void sum_kernel(cupdlp_float * __restrict__ res, const cupdlp_float * __syncthreads(); if (wid == 0) { - val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0; - HALF_WARP_REDUCE(val) + val = (lane < nWarps) ? shared[lane] : 0.0; + FINAL_REDUCE_512(val) if (threadIdx.x == 0) { res[blockIdx.x] = val; } diff --git a/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/cupdlp/cuda/cupdlp_cuda_kernels.cuh index f04aac0..fe84d38 100644 --- a/cupdlp/cuda/cupdlp_cuda_kernels.cuh +++ b/cupdlp/cuda/cupdlp_cuda_kernels.cuh @@ -3,9 +3,13 @@ #include #include /* EXIT_FAILURE */ +#include "cuda_to_hip.h" + +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include #include #include +#endif //#define CUPDLP_BLOCK_SIZE 512 diff --git a/cupdlp/cuda/cupdlp_cudalinalg.cu b/cupdlp/cuda/cupdlp_cudalinalg.cu index e124065..e7b01ca 100644 --- a/cupdlp/cuda/cupdlp_cudalinalg.cu +++ b/cupdlp/cuda/cupdlp_cudalinalg.cu @@ -1,6 +1,7 @@ #include // printf #include // EXIT_FAILURE +#include "cuda_to_hip.h" #include "cupdlp_cudalinalg.cuh" inline int nBlocks256(int n) { @@ -257,12 +258,7 @@ void cupdlp_movement_interaction_cuda( const cupdlp_float *atyUpdate, const cupdlp_float *aty, int nRows, int nCols) { - int warpSize; - CHECK_CUDA_IGNORE(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, 0)) - if (warpSize != 32) { - printf("warpSize\n"); - exit(1); - } + // Warp size query removed: reduction kernels now support both wave32 and wave64 constexpr int RED_BLOCK_SIZE = 256; constexpr int RED_ELS_PER_THREAD = 4; diff --git a/cupdlp/cuda/cupdlp_cudalinalg.cuh b/cupdlp/cuda/cupdlp_cudalinalg.cuh index f4e4498..3442fcc 100644 --- a/cupdlp/cuda/cupdlp_cudalinalg.cuh +++ b/cupdlp/cuda/cupdlp_cudalinalg.cuh @@ -1,11 +1,13 @@ #ifndef CUPDLP_CUDA_LINALG_H #define CUPDLP_CUDA_LINALG_H +#include "cupdlp_cuda_kernels.cuh" + +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include // cublas #include // cudaMalloc, cudaMemcpy, etc. #include // cusparseSpMV - -#include "cupdlp_cuda_kernels.cuh" +#endif #define PRINT_CUDA_INFO (1) #define PRINT_DETAILED_CUDA_INFO (0) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 664d8a9..69c9c0b 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -5,6 +5,7 @@ enable_language(CXX) add_library(wrapper_lp SHARED mps_lp.h mps_lp.c ) +set_target_properties(wrapper_lp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_link_libraries( wrapper_lp PUBLIC cupdlp ) @@ -14,6 +15,10 @@ if (${CUDA_LIBRARY-NOTFOUND}) PUBLIC -DCUPDLP_CPU=1 ) +elseif (${USE_HIP} STREQUAL "ON") + target_compile_definitions(wrapper_lp PUBLIC USE_HIP) + target_include_directories(wrapper_lp PUBLIC ${HIP_INCLUDE_DIRS}) + target_link_libraries(wrapper_lp PRIVATE ${HIP_LIBRARY}) endif() ############################################# @@ -26,6 +31,7 @@ add_library(wrapper_highs SHARED mps_lp.h mps_lp.c wrapper_highs.cpp ${HiGHS_HEADER_FILES} wrapper_highs.h) +set_target_properties(wrapper_highs PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories( wrapper_highs PUBLIC @@ -41,6 +47,10 @@ if (${CUDA_LIBRARY-NOTFOUND}) PUBLIC -DCUPDLP_CPU=1 ) +elseif (${USE_HIP} STREQUAL "ON") + target_compile_definitions(wrapper_highs PUBLIC USE_HIP) + target_include_directories(wrapper_highs PUBLIC ${HIP_INCLUDE_DIRS}) + target_link_libraries(wrapper_highs PRIVATE ${HIP_LIBRARY}) endif() @@ -63,9 +73,11 @@ target_link_libraries( if (${CUDA_LIBRARY-NOTFOUND}) - target_link_libraries(plc PRIVATE cupdlp m) + target_link_libraries(plc PRIVATE cupdlp $<$>:m>) +elseif (${USE_HIP} STREQUAL "ON") + target_link_libraries(plc PRIVATE cupdlp ${HIP_LIBRARY} $<$>:m>) else () - target_link_libraries(plc PRIVATE cupdlp ${CUDA_LIBRARY} m) + target_link_libraries(plc PRIVATE cupdlp ${CUDA_LIBRARY} $<$>:m>) endif () if (${CUDA_LIBRARY-NOTFOUND}) add_custom_target(