From 1a1ddff3840c0dd7c8d22302052ed84348d440e4 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Fri, 12 Jun 2026 06:06:27 +0000
Subject: [PATCH 1/2] [ROCm] Add AMD GPU support via HIP

Build the GPU solver for AMD GPUs on ROCm in addition to NVIDIA CUDA, from a
single source tree. A new -DUSE_HIP=ON CMake option compiles the .cu kernels
with the HIP language (parallel to the existing -DBUILD_CUDA=ON), selects the
target architecture via -DCMAKE_HIP_ARCHITECTURES, and links hipBLAS/hipSPARSE
(rocBLAS/rocSPARSE backends).

Review order: start with cupdlp/cuda/cuda_to_hip.h, a compatibility shim that
aliases the CUDA runtime, cuBLAS, and cuSPARSE symbols used by the project to
their HIP equivalents under __HIP_PLATFORM_AMD__, and is a no-op on NVIDIA.
Then FindHIPConf.cmake (ROCm toolchain/library discovery) and the CMakeLists
wiring. Finally cupdlp_cuda_kernels.cu, where the reduction kernels are made
warp-size-independent so the same code runs correctly on wave64 (CDNA, gfx90a)
and wave32 (RDNA, gfx1100/gfx1201). The Windows shared-library build needs the
HIP runtime linked into the wrapper targets and the kernel symbols exported.

The CUDA build path is unchanged: when USE_HIP is OFF the HIP code is not
compiled and no ROCm dependency is introduced.

Test Plan:
Linux (gfx90a, gfx1100), from a build dir with HiGHS installed:

  export HIGHS_HOME=$PWD/../install
  cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_BUILD_TYPE=Release
  cmake --build . --target plc --target testcudalin --target testcublas
  LD_LIBRARY_PATH=$HIGHS_HOME/lib:$PWD/lib ./bin/testcudalin
  LD_LIBRARY_PATH=$HIGHS_HOME/lib:$PWD/lib ./bin/testcublas
  LD_LIBRARY_PATH=$HIGHS_HOME/lib:$PWD/lib ./bin/plc -fname ../example/afiro.mps -nIterLim 5000

Validated on AMD Instinct MI250X (gfx90a, wave64), AMD Radeon Pro W7800
(gfx1100, wave32), and AMD Radeon RX 9070 XT (gfx1201, wave32, Windows).

This work was authored with assistance from Claude (Anthropic).
---
 CMakeLists.txt                      |   9 +-
 FindHIPConf.cmake                   |  69 ++++++++
 README.md                           |  14 ++
 apps/CMakeLists.txt                 |   9 +-
 cupdlp/CMakeLists.txt               |  13 +-
 cupdlp/cuda/CMakeLists.txt          |  79 +++++++---
 cupdlp/cuda/cuda_to_hip.h           | 111 +++++++++++++
 cupdlp/cuda/cupdlp_cuda_kernels.cu  | 233 ++++++++++++++++++++++------
 cupdlp/cuda/cupdlp_cuda_kernels.cuh |   4 +
 cupdlp/cuda/cupdlp_cudalinalg.cu    |   8 +-
 cupdlp/cuda/cupdlp_cudalinalg.cuh   |   6 +-
 interface/CMakeLists.txt            |  16 +-
 12 files changed, 484 insertions(+), 87 deletions(-)
 create mode 100644 FindHIPConf.cmake
 create mode 100644 cupdlp/cuda/cuda_to_hip.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 543c89d..f354886 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.24)
 project(CUPDLP VERSION 0.0.1)
 
 option(BUILD_CUDA "<BUILD_CUDA_OR_NOT>" OFF)
+option(USE_HIP "Build with HIP for AMD GPUs" OFF)
 option(BUILD_APPS "<BUILD_APPS_OR_NOT>" OFF)
 option(BUILD_PYTHON "<BUILD_PYTHON_OR_NOT>" OFF)
 message(NOTICE "-----------------------      cuPDLP-C      ------------------------")
@@ -25,8 +26,14 @@ message("reset release flags: ${CMAKE_C_FLAGS_RELEASE}")
 message(NOTICE "--------------------- cuPDLP CPU/GPU CONFIG -----------------------")
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
 message(NOTICE "-- Sets build with CUDA ${BUILD_CUDA}")
-if (${BUILD_CUDA} STREQUAL "ON")
+message(NOTICE "-- Sets build with HIP ${USE_HIP}")
+if (${USE_HIP} STREQUAL "ON")
+    include(FindHIPConf.cmake)
+    set(CUDA_LIBRARY-NOTFOUND false)
+    set(GPU_LIBRARY ${HIP_LIBRARY})
+elseif (${BUILD_CUDA} STREQUAL "ON")
     include(FindCUDAConf.cmake)
+    set(GPU_LIBRARY ${CUDA_LIBRARY})
 else ()
     set(CUDA_LIBRARY-NOTFOUND true)
 endif ()
diff --git a/FindHIPConf.cmake b/FindHIPConf.cmake
new file mode 100644
index 0000000..d8023c8
--- /dev/null
+++ b/FindHIPConf.cmake
@@ -0,0 +1,69 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+# Author: Jeff Daily <jeff.daily@amd.com>
+#
+# Locate the ROCm/HIP toolchain and the hipBLAS/hipSPARSE (rocBLAS/rocSPARSE)
+# libraries used by the AMD GPU build.
+
+message(NOTICE "Finding HIP/ROCm environment")
+message(NOTICE "    - ROCM_PATH: $ENV{ROCM_PATH}")
+
+# Default ROCM_PATH if not set
+if(NOT DEFINED ENV{ROCM_PATH})
+  set(ENV{ROCM_PATH} "/opt/rocm")
+endif()
+
+set(ROCM_PATH $ENV{ROCM_PATH})
+
+# Enable HIP language
+enable_language(HIP)
+
+# Set default architectures if not specified
+if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
+  set(CMAKE_HIP_ARCHITECTURES "gfx90a")
+endif()
+message(NOTICE "    - CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
+
+# Find hipBLAS
+find_library(HIP_LIBRARY_BLAS
+    NAMES hipblas
+    HINTS "${ROCM_PATH}/lib"
+    REQUIRED
+)
+
+# Find hipSPARSE
+find_library(HIP_LIBRARY_SPARSE
+    NAMES hipsparse
+    HINTS "${ROCM_PATH}/lib"
+    REQUIRED
+)
+
+# Find amdhip64 runtime
+find_library(HIP_LIBRARY_RT
+    NAMES amdhip64
+    HINTS "${ROCM_PATH}/lib"
+    REQUIRED
+)
+
+# Find rocBLAS (hipBLAS backend)
+find_library(HIP_LIBRARY_ROCBLAS
+    NAMES rocblas
+    HINTS "${ROCM_PATH}/lib"
+    REQUIRED
+)
+
+# Find rocSPARSE (hipSPARSE backend)
+find_library(HIP_LIBRARY_ROCSPARSE
+    NAMES rocsparse
+    HINTS "${ROCM_PATH}/lib"
+    REQUIRED
+)
+
+set(HIP_LIBRARY ${HIP_LIBRARY_RT} ${HIP_LIBRARY_BLAS} ${HIP_LIBRARY_SPARSE} ${HIP_LIBRARY_ROCBLAS} ${HIP_LIBRARY_ROCSPARSE})
+message(NOTICE "    - HIP Libraries: ${HIP_LIBRARY}")
+
+# Set include directories
+set(HIP_INCLUDE_DIRS
+    "${ROCM_PATH}/include"
+    "${ROCM_PATH}/include/hipblas"
+    "${ROCM_PATH}/include/hipsparse"
+)
diff --git a/README.md b/README.md
index 6946c67..7bff816 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,20 @@ cmake -DBUILD_CUDA=ON \
 -DCMAKE_CUDA_FLAGS_RELEASE="-O2 -DNDEBUG" ..
 ```  
 
+### Building for AMD GPUs (ROCm/HIP)
+
+The GPU solver also runs on AMD GPUs through ROCm. Install ROCm (which provides hipBLAS, hipSPARSE, rocBLAS, and rocSPARSE) and HiGHS as above, then configure with `-DUSE_HIP=ON` in place of `-DBUILD_CUDA=ON` and select the target GPU architecture with `-DCMAKE_HIP_ARCHITECTURES`:
+
+```shell
+export HIGHS_HOME=/path-to-highs
+mkdir build
+cd build
+cmake -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_BUILD_TYPE=Release ..
+cmake --build . --target plc
+```
+
+Set `CMAKE_HIP_ARCHITECTURES` to match your GPU, for example `gfx90a` (MI200 series), `gfx1100` (RDNA3), or `gfx1201` (RDNA4). `ROCM_PATH` defaults to `/opt/rocm`; set it if ROCm is installed elsewhere. The same source builds for NVIDIA (`-DBUILD_CUDA=ON`) and AMD (`-DUSE_HIP=ON`) GPUs.
+
 ## Alternative Interfaces
 ### The Python Interface
 If you wish to use the Python interface, use the following steps:
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 33b863c..f1bae97 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -20,10 +20,11 @@ set_target_properties(onlinelp
 )
 if (${CUDA_LIBRARY-NOTFOUND})
     target_link_libraries(onlinelp PUBLIC cupdlp m)
+    target_compile_definitions(onlinelp PUBLIC -DCUPDLP_CPU=1)
+elseif (${USE_HIP} STREQUAL "ON")
+    target_link_libraries(onlinelp PUBLIC cupdlp ${HIP_LIBRARY} m)
+    target_compile_definitions(onlinelp PUBLIC USE_HIP)
+    target_include_directories(onlinelp PUBLIC ${HIP_INCLUDE_DIRS})
 else ()
-    target_compile_definitions(wrapper_clp
-            PUBLIC
-            -DCUPDLP_CPU=1
-    )
     target_link_libraries(onlinelp PUBLIC cupdlp ${CUDA_LIBRARY} m)
 endif ()
\ No newline at end of file
diff --git a/cupdlp/CMakeLists.txt b/cupdlp/CMakeLists.txt
index 725f9c8..8207ecd 100644
--- a/cupdlp/CMakeLists.txt
+++ b/cupdlp/CMakeLists.txt
@@ -12,6 +12,7 @@ add_library(cupdlp SHARED
         ${CUPDLP_INCLUDE_HEADERS}
         ${CUPDLP_SOURCE_FILES}
 )
+set_target_properties(cupdlp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_compile_definitions(cupdlp
         PUBLIC
         # If the debug configuration pass the DEBUG define to the compiler
@@ -24,11 +25,17 @@ if (${CUDA_LIBRARY-NOTFOUND})
             PUBLIC
             -DCUPDLP_CPU=1
     )
-    target_link_libraries(cupdlp m)
+    target_link_libraries(cupdlp $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
+elseif (${USE_HIP} STREQUAL "ON")
+    add_subdirectory(cuda)
+    message(NOTICE "- GPU version PDLP (HIP/ROCm)")
+    target_include_directories(cupdlp PUBLIC ${HIP_INCLUDE_DIRS})
+    target_link_libraries(cupdlp PRIVATE cudalin ${HIP_LIBRARY} $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
+    set_target_properties(cupdlp PROPERTIES HIP_SEPARABLE_COMPILATION ON)
 else()
     add_subdirectory(cuda)
-    message(NOTICE "- GPU version PDLP")
+    message(NOTICE "- GPU version PDLP (CUDA)")
     target_include_directories(cupdlp PUBLIC "/usr/local/cuda/include")
-    target_link_libraries(cupdlp PRIVATE cudalin ${CUDA_LIBRARY} m)
+    target_link_libraries(cupdlp PRIVATE cudalin ${CUDA_LIBRARY} $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
     set_target_properties(cupdlp PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 endif ()
diff --git a/cupdlp/cuda/CMakeLists.txt b/cupdlp/cuda/CMakeLists.txt
index 3bdcc28..9408683 100644
--- a/cupdlp/cuda/CMakeLists.txt
+++ b/cupdlp/cuda/CMakeLists.txt
@@ -1,29 +1,64 @@
-enable_language(CXX CUDA)
-
-add_library(cudalin SHARED
-       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu
-       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh
-       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh
-       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu
+set(CUDA_SOURCES
+    ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu
+    ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu
 )
-set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_include_directories(cudalin PUBLIC "/usr/local/cuda/include")
-target_compile_definitions(cudalin
-        PUBLIC
-        # If the debug configuration pass the DEBUG define to the compiler
-        $<$<CONFIG:Debug>:-DCUPDLP_DEBUG=1>
+
+set(CUDA_HEADERS
+    ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh
+    ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh
+    ${CUPDLP_INCLUDE_DIR}/cuda/cuda_to_hip.h
 )
 
-target_link_libraries(cudalin ${CUDA_LIBRARY} m)
+if (${USE_HIP} STREQUAL "ON")
+    # HIP/ROCm build
+    enable_language(HIP)
+    set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP)
 
-# add a test
-add_executable(testcudalin test_cuda_linalg.c)
-add_executable(testcublas test_cublas.c)
+    add_library(cudalin SHARED
+        ${CUDA_SOURCES}
+        ${CUDA_HEADERS}
+    )
+    target_compile_definitions(cudalin PUBLIC USE_HIP)
+    set_target_properties(cudalin PROPERTIES
+        HIP_SEPARABLE_COMPILATION ON
+        HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+        WINDOWS_EXPORT_ALL_SYMBOLS ON
+    )
+    target_include_directories(cudalin PUBLIC ${HIP_INCLUDE_DIRS})
+    target_compile_definitions(cudalin
+            PUBLIC
+            $<$<CONFIG:Debug>:-DCUPDLP_DEBUG=1>
+    )
+    target_link_libraries(cudalin ${HIP_LIBRARY} $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
+else()
+    # CUDA build
+    enable_language(CXX CUDA)
 
-set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-#target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda)
-target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY})
+    add_library(cudalin SHARED
+        ${CUDA_SOURCES}
+        ${CUDA_HEADERS}
+    )
+    set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_include_directories(cudalin PUBLIC "/usr/local/cuda/include")
+    target_compile_definitions(cudalin
+            PUBLIC
+            $<$<CONFIG:Debug>:-DCUPDLP_DEBUG=1>
+    )
+    target_link_libraries(cudalin ${CUDA_LIBRARY} $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
+endif()
 
-set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY})
+# add tests
+add_executable(testcudalin test_cuda_linalg.c)
+add_executable(testcublas test_cublas.c)
 
+if (${USE_HIP} STREQUAL "ON")
+    set_target_properties(testcudalin PROPERTIES HIP_SEPARABLE_COMPILATION ON)
+    target_link_libraries(testcudalin PRIVATE cudalin ${HIP_LIBRARY})
+    set_target_properties(testcublas PROPERTIES HIP_SEPARABLE_COMPILATION ON)
+    target_link_libraries(testcublas PRIVATE cudalin ${HIP_LIBRARY})
+else()
+    set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY})
+    set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY})
+endif()
diff --git a/cupdlp/cuda/cuda_to_hip.h b/cupdlp/cuda/cuda_to_hip.h
new file mode 100644
index 0000000..3f2e12a
--- /dev/null
+++ b/cupdlp/cuda/cuda_to_hip.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * CUDA-to-HIP compatibility header for cuPDLP-C
+ *
+ * On AMD (USE_HIP / __HIP_PLATFORM_AMD__): aliases CUDA symbols to HIP equivalents.
+ * On NVIDIA: no-op include of CUDA headers.
+ */
+#pragma once
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+// Ensure AMD platform is defined before including HIP headers
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+#define __HIP_PLATFORM_AMD__
+#endif
+
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hipsparse/hipsparse.h>
+
+// Runtime API
+#define cudaMalloc                hipMalloc
+#define cudaFree                  hipFree
+#define cudaMemcpy                hipMemcpy
+#define cudaMemcpyAsync           hipMemcpyAsync
+#define cudaMemset                hipMemset
+#define cudaDeviceSynchronize     hipDeviceSynchronize
+#define cudaGetLastError          hipGetLastError
+#define cudaGetErrorString        hipGetErrorString
+#define cudaGetDeviceCount        hipGetDeviceCount
+#define cudaGetDeviceProperties   hipGetDeviceProperties
+#define cudaDeviceGetAttribute    hipDeviceGetAttribute
+#define cudaRuntimeGetVersion     hipRuntimeGetVersion
+#define cudaDriverGetVersion      hipDriverGetVersion
+#define cudaDeviceReset           hipDeviceReset
+
+// Memory copy kinds
+#define cudaMemcpyDeviceToHost    hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice    hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToDevice  hipMemcpyDeviceToDevice
+#define cudaMemcpyDefault         hipMemcpyDefault
+
+// Error types
+#define cudaError_t               hipError_t
+#define cudaSuccess               hipSuccess
+
+// Device properties
+#define cudaDeviceProp            hipDeviceProp_t
+#define cudaDevAttrMultiProcessorCount  hipDeviceAttributeMultiprocessorCount
+#define cudaDevAttrWarpSize       hipDeviceAttributeWarpSize
+
+// cuBLAS -> hipBLAS
+#define cublasHandle_t            hipblasHandle_t
+#define cublasStatus_t            hipblasStatus_t
+#define CUBLAS_STATUS_SUCCESS     HIPBLAS_STATUS_SUCCESS
+#define cublasCreate              hipblasCreate
+#define cublasDestroy             hipblasDestroy
+#define cublasDaxpy               hipblasDaxpy
+#define cublasSaxpy               hipblasSaxpy
+#define cublasDdot                hipblasDdot
+#define cublasSdot                hipblasSdot
+#define cublasDnrm2               hipblasDnrm2
+#define cublasSnrm2               hipblasSnrm2
+#define cublasDscal               hipblasDscal
+#define cublasSscal               hipblasSscal
+#define cublasGetStatusString     hipblasStatusToString
+
+// cuSPARSE -> hipSPARSE
+#define cusparseHandle_t          hipsparseHandle_t
+#define cusparseStatus_t          hipsparseStatus_t
+#define CUSPARSE_STATUS_SUCCESS   HIPSPARSE_STATUS_SUCCESS
+#define cusparseCreate            hipsparseCreate
+#define cusparseDestroy           hipsparseDestroy
+#define cusparseGetVersion        hipsparseGetVersion
+#define cusparseGetErrorString    hipsparseGetErrorString
+
+// Sparse matrix/vector descriptors
+#define cusparseSpMatDescr_t      hipsparseSpMatDescr_t
+#define cusparseDnVecDescr_t      hipsparseDnVecDescr_t
+#define cusparseCreateCsr         hipsparseCreateCsr
+#define cusparseCreateCsc         hipsparseCreateCsc
+#define cusparseCreateDnVec       hipsparseCreateDnVec
+#define cusparseDestroySpMat      hipsparseDestroySpMat
+#define cusparseDestroyDnVec      hipsparseDestroyDnVec
+
+// SpMV operations
+#define cusparseSpMV              hipsparseSpMV
+#define cusparseSpMV_bufferSize   hipsparseSpMV_bufferSize
+#define cusparseSpMVAlg_t         hipsparseSpMVAlg_t
+#define cusparseOperation_t       hipsparseOperation_t
+#define CUSPARSE_OPERATION_NON_TRANSPOSE  HIPSPARSE_OPERATION_NON_TRANSPOSE
+#define CUSPARSE_OPERATION_TRANSPOSE      HIPSPARSE_OPERATION_TRANSPOSE
+#define CUSPARSE_SPMV_CSR_ALG2    HIPSPARSE_SPMV_CSR_ALG2
+
+// Compute type
+#define CUDA_R_64F                HIP_R_64F
+#define CUDA_R_32F                HIP_R_32F
+
+// Index base
+#define CUSPARSE_INDEX_BASE_ZERO  HIPSPARSE_INDEX_BASE_ZERO
+#define CUSPARSE_INDEX_32I        HIPSPARSE_INDEX_32I
+
+#else  // NVIDIA CUDA
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cusparse.h>
+
+#endif  // USE_HIP
diff --git a/cupdlp/cuda/cupdlp_cuda_kernels.cu b/cupdlp/cuda/cupdlp_cuda_kernels.cu
index 58580b3..cae21ad 100644
--- a/cupdlp/cuda/cupdlp_cuda_kernels.cu
+++ b/cupdlp/cuda/cupdlp_cuda_kernels.cu
@@ -1,4 +1,11 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * AMD GPU (HIP/ROCm) support for the cuPDLP-C CUDA kernels.
+ */
 #include "cupdlp_cuda_kernels.cuh"
+#include "cuda_to_hip.h"
 
 
 
@@ -199,7 +206,132 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x,
 */
 
 
-#define QUARTER_WARP_REDUCE_2(val1, val2) { \
+// Warp size constant for device code
+// CDNA (gfx90a, gfx94x): 64-lane wavefront
+// RDNA (gfx10xx, gfx11xx): 32-lane wavefront
+// CUDA: always 32
+#if defined(__HIP_PLATFORM_AMD__)
+  #if defined(__GFX9__)
+    #define CUPDLP_WARP_SIZE 64
+  #else
+    #define CUPDLP_WARP_SIZE 32
+  #endif
+#else
+  #define CUPDLP_WARP_SIZE 32
+#endif
+
+// Warp reduction macros: arch-unified for wave32 (CUDA/RDNA) and wave64 (CDNA)
+// HIP uses __shfl_down (no sync needed, wavefronts execute in lockstep)
+// CUDA uses __shfl_down_sync with a 32-bit mask
+
+#if defined(__HIP_PLATFORM_AMD__)
+// HIP: All AMD GPUs use __shfl_down (no sync needed)
+// Wave64 (gfx9): reduce over 64 lanes
+// Wave32 (RDNA): reduce over 32 lanes (offset>32 is no-op)
+
+#if defined(__GFX9__)
+// Wave64: CDNA (gfx90a, gfx94x)
+#define FULL_WARP_REDUCE_2(val1, val2) { \
+  val1 += __shfl_down(val1, 32); \
+  val2 += __shfl_down(val2, 32); \
+  val1 += __shfl_down(val1, 16); \
+  val2 += __shfl_down(val2, 16); \
+  val1 += __shfl_down(val1, 8); \
+  val2 += __shfl_down(val2, 8); \
+  val1 += __shfl_down(val1, 4); \
+  val2 += __shfl_down(val2, 4); \
+  val1 += __shfl_down(val1, 2); \
+  val2 += __shfl_down(val2, 2); \
+  val1 += __shfl_down(val1, 1); \
+  val2 += __shfl_down(val2, 1); \
+}
+
+#define FULL_WARP_REDUCE(val) { \
+  val += __shfl_down(val, 32); \
+  val += __shfl_down(val, 16); \
+  val += __shfl_down(val, 8); \
+  val += __shfl_down(val, 4); \
+  val += __shfl_down(val, 2); \
+  val += __shfl_down(val, 1); \
+}
+
+// 256 threads / 64 lanes = 4 warps; final reduction of 4 elements
+#define FINAL_REDUCE_2_256(val1, val2) { \
+  val1 += __shfl_down(val1, 2); \
+  val2 += __shfl_down(val2, 2); \
+  val1 += __shfl_down(val1, 1); \
+  val2 += __shfl_down(val2, 1); \
+}
+
+// 512 threads / 64 lanes = 8 warps; final reduction of 8 elements
+#define FINAL_REDUCE_512(val) { \
+  val += __shfl_down(val, 4); \
+  val += __shfl_down(val, 2); \
+  val += __shfl_down(val, 1); \
+}
+
+#define FINAL_REDUCE_256(val) { \
+  val += __shfl_down(val, 2); \
+  val += __shfl_down(val, 1); \
+}
+
+#else  // HIP RDNA (wave32)
+
+#define FULL_WARP_REDUCE_2(val1, val2) { \
+  val1 += __shfl_down(val1, 16); \
+  val2 += __shfl_down(val2, 16); \
+  val1 += __shfl_down(val1, 8); \
+  val2 += __shfl_down(val2, 8); \
+  val1 += __shfl_down(val1, 4); \
+  val2 += __shfl_down(val2, 4); \
+  val1 += __shfl_down(val1, 2); \
+  val2 += __shfl_down(val2, 2); \
+  val1 += __shfl_down(val1, 1); \
+  val2 += __shfl_down(val2, 1); \
+}
+
+#define FULL_WARP_REDUCE(val) { \
+  val += __shfl_down(val, 16); \
+  val += __shfl_down(val, 8); \
+  val += __shfl_down(val, 4); \
+  val += __shfl_down(val, 2); \
+  val += __shfl_down(val, 1); \
+}
+
+// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements
+#define FINAL_REDUCE_2_256(val1, val2) { \
+  val1 += __shfl_down(val1, 4); \
+  val2 += __shfl_down(val2, 4); \
+  val1 += __shfl_down(val1, 2); \
+  val2 += __shfl_down(val2, 2); \
+  val1 += __shfl_down(val1, 1); \
+  val2 += __shfl_down(val2, 1); \
+}
+
+// 512 threads / 32 lanes = 16 warps; final reduction of 16 elements
+#define FINAL_REDUCE_512(val) { \
+  val += __shfl_down(val, 8); \
+  val += __shfl_down(val, 4); \
+  val += __shfl_down(val, 2); \
+  val += __shfl_down(val, 1); \
+}
+
+// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements
+#define FINAL_REDUCE_256(val) { \
+  val += __shfl_down(val, 4); \
+  val += __shfl_down(val, 2); \
+  val += __shfl_down(val, 1); \
+}
+#endif  // __GFX9__
+
+#else  // CUDA
+
+// CUDA: Wave32, use __shfl_down_sync with 32-bit mask
+#define FULL_WARP_REDUCE_2(val1, val2) { \
+  val1 += __shfl_down_sync(0xFFFFFFFF, val1, 16); \
+  val2 += __shfl_down_sync(0xFFFFFFFF, val2, 16); \
+  val1 += __shfl_down_sync(0xFFFFFFFF, val1, 8); \
+  val2 += __shfl_down_sync(0xFFFFFFFF, val2, 8); \
   val1 += __shfl_down_sync(0xFFFFFFFF, val1, 4); \
   val2 += __shfl_down_sync(0xFFFFFFFF, val2, 4); \
   val1 += __shfl_down_sync(0xFFFFFFFF, val1, 2); \
@@ -208,11 +340,16 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x,
   val2 += __shfl_down_sync(0xFFFFFFFF, val2, 1); \
 }
 
-#define FULL_WARP_REDUCE_2(val1, val2) { \
-  val1 += __shfl_down_sync(0xFFFFFFFF, val1, 16); \
-  val2 += __shfl_down_sync(0xFFFFFFFF, val2, 16); \
-  val1 += __shfl_down_sync(0xFFFFFFFF, val1, 8); \
-  val2 += __shfl_down_sync(0xFFFFFFFF, val2, 8); \
+#define FULL_WARP_REDUCE(val) { \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 16); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 8); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 4); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 2); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 1); \
+}
+
+// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements
+#define FINAL_REDUCE_2_256(val1, val2) { \
   val1 += __shfl_down_sync(0xFFFFFFFF, val1, 4); \
   val2 += __shfl_down_sync(0xFFFFFFFF, val2, 4); \
   val1 += __shfl_down_sync(0xFFFFFFFF, val1, 2); \
@@ -221,14 +358,34 @@ __global__ void naive_sub_kernel(cupdlp_float *z, const cupdlp_float *x,
   val2 += __shfl_down_sync(0xFFFFFFFF, val2, 1); \
 }
 
-// assumes block size = 256, warp size = 32
+// 512 threads / 32 lanes = 16 warps; final reduction of 16 elements
+#define FINAL_REDUCE_512(val) { \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 8); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 4); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 2); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 1); \
+}
+
+// 256 threads / 32 lanes = 8 warps; final reduction of 8 elements
+#define FINAL_REDUCE_256(val) { \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 4); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 2); \
+  val += __shfl_down_sync(0xFFFFFFFF, val, 1); \
+}
+#endif  // __HIP_PLATFORM_AMD__
+
+// Max warps per block: 256/32=8 (wave32) or 512/32=16 (wave32), 256/64=4 or 512/64=8 (wave64)
+// Upper bound is 16, use that for static shared memory sizing.
+static constexpr int kMaxWarpsPerBlock = 16;
+
+// assumes block size = 256, warp size = 32 or 64
 __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_float * __restrict__ res_y,
                                   const cupdlp_float * __restrict__ xUpdate, const cupdlp_float * __restrict__ x,
                                   const cupdlp_float * __restrict__ atyUpdate, const cupdlp_float * __restrict__ aty,
                                   int nCols) {
 
-  __shared__ cupdlp_float shared_x[32];
-  __shared__ cupdlp_float shared_y[32];
+  __shared__ cupdlp_float shared_x[kMaxWarpsPerBlock];
+  __shared__ cupdlp_float shared_y[kMaxWarpsPerBlock];
   cupdlp_float val_x = 0.0;
   cupdlp_float val_y = 0.0;
   for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nCols; i += blockDim.x * gridDim.x) {
@@ -238,8 +395,9 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa
       val_y = cupdlp_fma_rn(day, dx, val_y);
   }
 
-  int lane = threadIdx.x % 32;
-  int wid = threadIdx.x / 32;
+  int lane = threadIdx.x % CUPDLP_WARP_SIZE;
+  int wid = threadIdx.x / CUPDLP_WARP_SIZE;
+  int nWarps = blockDim.x / CUPDLP_WARP_SIZE;
 
   FULL_WARP_REDUCE_2(val_x, val_y)
   if (lane == 0) {
@@ -249,9 +407,9 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa
   __syncthreads();
 
   if (wid == 0) {
-    val_x = (threadIdx.x < blockDim.x / 32) ? shared_x[lane] : 0.0;
-    val_y = (threadIdx.x < blockDim.x / 32) ? shared_y[lane] : 0.0;
-    QUARTER_WARP_REDUCE_2(val_x, val_y)
+    val_x = (lane < nWarps) ? shared_x[lane] : 0.0;
+    val_y = (lane < nWarps) ? shared_y[lane] : 0.0;
+    FINAL_REDUCE_2_256(val_x, val_y)
     if (threadIdx.x == 0) {
       res_x[blockIdx.x] = val_x;
       res_y[blockIdx.x] = val_y;
@@ -259,41 +417,21 @@ __global__ void movement_1_kernel(cupdlp_float * __restrict__ res_x, cupdlp_floa
   }
 }
 
-#define QUARTER_WARP_REDUCE(val) { \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 4); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 2); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 1); \
-}
-
-#define HALF_WARP_REDUCE(val) { \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 8); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 4); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 2); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 1); \
-}
-
-#define FULL_WARP_REDUCE(val) { \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 16); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 8); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 4); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 2); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 1); \
-}
-
-// assumes: block size = 256, warp size = 32
+// assumes: block size = 256, warp size = 32 or 64
 __global__ void movement_2_kernel(cupdlp_float * __restrict__ res,
                                   const cupdlp_float * __restrict__ yUpdate, const cupdlp_float * __restrict__ y,
                                   int nRows) {
 
-  __shared__ cupdlp_float shared[32];
+  __shared__ cupdlp_float shared[kMaxWarpsPerBlock];
   cupdlp_float val = 0.0;
   for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nRows; i += blockDim.x * gridDim.x) {
       cupdlp_float d = yUpdate[i] - y[i];
       val = cupdlp_fma_rn(d, d, val);
   }
 
-  int lane = threadIdx.x % 32;
-  int wid = threadIdx.x / 32;
+  int lane = threadIdx.x % CUPDLP_WARP_SIZE;
+  int wid = threadIdx.x / CUPDLP_WARP_SIZE;
+  int nWarps = blockDim.x / CUPDLP_WARP_SIZE;
 
   FULL_WARP_REDUCE(val)
   if (lane == 0) {
@@ -302,25 +440,26 @@ __global__ void movement_2_kernel(cupdlp_float * __restrict__ res,
   __syncthreads();
 
   if (wid == 0) {
-    val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0;
-    QUARTER_WARP_REDUCE(val)
+    val = (lane < nWarps) ? shared[lane] : 0.0;
+    FINAL_REDUCE_256(val)
     if (threadIdx.x == 0) {
       res[blockIdx.x] = val;
     }
   }
 }
 
-// assumes: block size = 512, warp size = 32
+// assumes: block size = 512, warp size = 32 or 64
 __global__ void sum_kernel(cupdlp_float * __restrict__ res, const cupdlp_float * __restrict__ x, int n) {
 
-  __shared__ cupdlp_float shared[32];
+  __shared__ cupdlp_float shared[kMaxWarpsPerBlock];
   cupdlp_float val = 0.0;
   for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
       val += x[i];
   }
 
-  int lane = threadIdx.x % 32;
-  int wid = threadIdx.x / 32;
+  int lane = threadIdx.x % CUPDLP_WARP_SIZE;
+  int wid = threadIdx.x / CUPDLP_WARP_SIZE;
+  int nWarps = blockDim.x / CUPDLP_WARP_SIZE;
 
   FULL_WARP_REDUCE(val)
   if (lane == 0) {
@@ -329,8 +468,8 @@ __global__ void sum_kernel(cupdlp_float * __restrict__ res, const cupdlp_float *
   __syncthreads();
 
   if (wid == 0) {
-    val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0;
-    HALF_WARP_REDUCE(val)
+    val = (lane < nWarps) ? shared[lane] : 0.0;
+    FINAL_REDUCE_512(val)
     if (threadIdx.x == 0) {
       res[blockIdx.x] = val;
     }
diff --git a/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/cupdlp/cuda/cupdlp_cuda_kernels.cuh
index f04aac0..fe84d38 100644
--- a/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+++ b/cupdlp/cuda/cupdlp_cuda_kernels.cuh
@@ -3,9 +3,13 @@
 
 #include <stdio.h>
 #include <stdlib.h>  /* EXIT_FAILURE */
+#include "cuda_to_hip.h"
+
+#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__)
 #include <cublas_v2.h>
 #include <cusparse.h>
 #include <cuda_runtime.h>
+#endif
 
 //#define CUPDLP_BLOCK_SIZE 512
 
diff --git a/cupdlp/cuda/cupdlp_cudalinalg.cu b/cupdlp/cuda/cupdlp_cudalinalg.cu
index e124065..e7b01ca 100644
--- a/cupdlp/cuda/cupdlp_cudalinalg.cu
+++ b/cupdlp/cuda/cupdlp_cudalinalg.cu
@@ -1,6 +1,7 @@
 #include <stdio.h>   // printf
 #include <stdlib.h>  // EXIT_FAILURE
 
+#include "cuda_to_hip.h"
 #include "cupdlp_cudalinalg.cuh"
 
 inline int nBlocks256(int n) {
@@ -257,12 +258,7 @@ void cupdlp_movement_interaction_cuda(
     const cupdlp_float *atyUpdate, const cupdlp_float *aty,
     int nRows, int nCols)
 {
-  int warpSize;
-  CHECK_CUDA_IGNORE(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, 0))
-  if (warpSize != 32) {
-    printf("warpSize\n");
-    exit(1);
-  }
+  // Warp size query removed: reduction kernels now support both wave32 and wave64
 
   constexpr int RED_BLOCK_SIZE = 256;
   constexpr int RED_ELS_PER_THREAD = 4;
diff --git a/cupdlp/cuda/cupdlp_cudalinalg.cuh b/cupdlp/cuda/cupdlp_cudalinalg.cuh
index f4e4498..3442fcc 100644
--- a/cupdlp/cuda/cupdlp_cudalinalg.cuh
+++ b/cupdlp/cuda/cupdlp_cudalinalg.cuh
@@ -1,11 +1,13 @@
 #ifndef CUPDLP_CUDA_LINALG_H
 #define CUPDLP_CUDA_LINALG_H
 
+#include "cupdlp_cuda_kernels.cuh"
+
+#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__)
 #include <cublas_v2.h>         // cublas
 #include <cuda_runtime_api.h>  // cudaMalloc, cudaMemcpy, etc.
 #include <cusparse.h>          // cusparseSpMV
-
-#include "cupdlp_cuda_kernels.cuh"
+#endif
 
 #define PRINT_CUDA_INFO (1)
 #define PRINT_DETAILED_CUDA_INFO (0)
diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt
index 664d8a9..69c9c0b 100644
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -5,6 +5,7 @@ enable_language(CXX)
 add_library(wrapper_lp SHARED
         mps_lp.h mps_lp.c
 )
+set_target_properties(wrapper_lp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_link_libraries(
         wrapper_lp PUBLIC cupdlp
 )
@@ -14,6 +15,10 @@ if (${CUDA_LIBRARY-NOTFOUND})
             PUBLIC
             -DCUPDLP_CPU=1
     )
+elseif (${USE_HIP} STREQUAL "ON")
+    target_compile_definitions(wrapper_lp PUBLIC USE_HIP)
+    target_include_directories(wrapper_lp PUBLIC ${HIP_INCLUDE_DIRS})
+    target_link_libraries(wrapper_lp PRIVATE ${HIP_LIBRARY})
 endif()
 
 #############################################
@@ -26,6 +31,7 @@ add_library(wrapper_highs SHARED
         mps_lp.h mps_lp.c
         wrapper_highs.cpp ${HiGHS_HEADER_FILES}
         wrapper_highs.h)
+set_target_properties(wrapper_highs PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(
         wrapper_highs PUBLIC
@@ -41,6 +47,10 @@ if (${CUDA_LIBRARY-NOTFOUND})
             PUBLIC
             -DCUPDLP_CPU=1
     )
+elseif (${USE_HIP} STREQUAL "ON")
+    target_compile_definitions(wrapper_highs PUBLIC USE_HIP)
+    target_include_directories(wrapper_highs PUBLIC ${HIP_INCLUDE_DIRS})
+    target_link_libraries(wrapper_highs PRIVATE ${HIP_LIBRARY})
 endif()
 
 
@@ -63,9 +73,11 @@ target_link_libraries(
 
 
 if (${CUDA_LIBRARY-NOTFOUND})
-    target_link_libraries(plc PRIVATE cupdlp m)
+    target_link_libraries(plc PRIVATE cupdlp $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
+elseif (${USE_HIP} STREQUAL "ON")
+    target_link_libraries(plc PRIVATE cupdlp ${HIP_LIBRARY} $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
 else ()
-    target_link_libraries(plc PRIVATE cupdlp ${CUDA_LIBRARY} m)
+    target_link_libraries(plc PRIVATE cupdlp ${CUDA_LIBRARY} $<$<NOT:$<PLATFORM_ID:Windows>>:m>)
 endif ()
 if (${CUDA_LIBRARY-NOTFOUND})
     add_custom_target(

From d380bcab9f7bf75bbaa7b14366f6c7902c2a8f91 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Tue, 23 Jun 2026 22:20:57 +0000
Subject: [PATCH 2/2] [ROCm] Rely on HIP arch auto-detect instead of pinning
 gfx90a

The gfx90a pin sat after enable_language(HIP), so its
if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) guard was always false and the
block was dead -- enable_language(HIP) has already detected the host arch
(or errored). Removing it makes intent clear and keeps the build honoring
-DCMAKE_HIP_ARCHITECTURES, auto-detecting the host GPU, or erroring on a
no-GPU host, rather than risking a silently wrong gfx90a default if file
order ever changed.

This change was authored with the assistance of the Claude AI assistant.
---
 FindHIPConf.cmake | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/FindHIPConf.cmake b/FindHIPConf.cmake
index d8023c8..06e1e85 100644
--- a/FindHIPConf.cmake
+++ b/FindHIPConf.cmake
@@ -14,13 +14,9 @@ endif()
 
 set(ROCM_PATH $ENV{ROCM_PATH})
 
-# Enable HIP language
+# enable_language(HIP) auto-detects the host GPU arch (and errors on a
+# no-GPU build host); pass -DCMAKE_HIP_ARCHITECTURES=... to override.
 enable_language(HIP)
-
-# Set default architectures if not specified
-if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
-  set(CMAKE_HIP_ARCHITECTURES "gfx90a")
-endif()
 message(NOTICE "    - CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
 
 # Find hipBLAS