Add Jacobi eigenvalue algorithm and eigSH via cuSOLVER

dmjio · claude · dmjio · commit ddabf14849ee · 2026-06-06T13:19:15.000-05:00
Adds two complementary paths for symmetric eigendecomposition:

* `ArrayFire.Jacobi` — pure-Haskell cyclic Jacobi method that runs
  entirely on the GPU via ArrayFire array ops, with only 3 scalar
  GPU→CPU reads per active off-diagonal pair (for the rotation angle)
  plus one per sweep (convergence check). Works on any AF backend.

* `eigSH` in `ArrayFire.LAPACK` — wraps a new C shim (`cbits/eigsh.c`)
  that calls `cusolverDnDsyevd` / `cusolverDnSsyevd` directly via
  dlopen at runtime (no link-time CUDA dependency). Gracefully returns
  AF_ERR_RUNTIME on non-CUDA backends. Eigenvalues returned in ascending
  order, matching the hmatrix `eigSH` convention.

Also expands the LAPACK test suite: fixes the `qr` test (was calling
`lu` by mistake), splits the determinant test into separate real and
complex cases with verified imaginary parts, uncomments and corrects the
inverse test, and adds rank and norm tests. New `eigSH (CUDA)` tests
skip gracefully on CPU/OpenCL via `pendingWith`.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@ result/
 cabal.project.local
 tags
 /.stack-work/
+.ghc*
diff --git a/arrayfire.cabal b/arrayfire.cabal
@@ -46,6 +46,7 @@ library
     ArrayFire.Graphics
     ArrayFire.Image
     ArrayFire.Index
+    ArrayFire.Jacobi
     ArrayFire.LAPACK
     ArrayFire.Random
     ArrayFire.Signal
@@ -87,6 +88,9 @@ library
     af
   c-sources:
     cbits/wrapper.c
+    cbits/eigsh.c
+  if os(linux)
+    extra-libraries: dl
   build-depends:
     base < 5, deepseq, filepath, vector
   hs-source-dirs:
@@ -176,6 +180,7 @@ test-suite test
     ArrayFire.GraphicsSpec
     ArrayFire.ImageSpec
     ArrayFire.IndexSpec
+    ArrayFire.JacobiSpec
     ArrayFire.LAPACKSpec
     ArrayFire.RandomSpec
     ArrayFire.SignalSpec
diff --git a/cbits/eigsh.c b/cbits/eigsh.c
@@ -0,0 +1,190 @@
+/*
+ * cbits/eigsh.c
+ *
+ * GPU-resident symmetric eigendecomposition via cuSOLVER.
+ *
+ * Supports f32 (cusolverDnSsyevd) and f64 (cusolverDnDsyevd).
+ * cuSOLVER is resolved at runtime through dlopen/dlsym — no link-time
+ * dependency on CUDA toolkit.  The only link-time requirements are
+ * libaf (ArrayFire) and libdl.
+ *
+ * Returns AF_ERR_RUNTIME when CUDA backend is not active or cuSOLVER
+ * cannot be found (graceful degradation on CPU/OpenCL builds).
+ *
+ * Ordering: cusolverDnDsyevd returns eigenvalues in ascending order,
+ * matching hmatrix's eigSH convention.
+ */
+
+#define _GNU_SOURCE
+#define AF_DEFINE_CUDA_TYPES   /* gives us cudaStream_t in af/cuda.h */
+#include "arrayfire.h"
+#include "af/cuda.h"
+#include <dlfcn.h>
+#include <stddef.h>
+
+/* ── minimal cuSOLVER types (avoids needing CUDA toolkit headers) ── */
+typedef void *cusolverDnHandle_t;     /* opaque handle; filled in through the pfn_Create pointer */
+typedef void *cudaStream_t_t;         /* NOTE(review): never referenced in this file; the code uses cudaStream_t from af/cuda.h */
+typedef int   cusolverStatus_t;       /* status code; only ever compared against CUSOLVER_STATUS_SUCCESS */
+/* Enum values restated locally — TODO confirm they match the cuSOLVER/cuBLAS headers of the library loaded at runtime. */
+#define CUSOLVER_STATUS_SUCCESS   0
+#define CUBLAS_FILL_MODE_LOWER    0
+#define CUSOLVER_EIG_MODE_VECTOR  1
+
+/* ── function pointer typedefs ── */
+typedef cusolverStatus_t (*pfn_Create)    (cusolverDnHandle_t *);             /* looked up as "cusolverDnCreate" */
+typedef cusolverStatus_t (*pfn_SetStream) (cusolverDnHandle_t, cudaStream_t); /* looked up as "cusolverDnSetStream" */
+/* f64 pair, called as (handle, eig mode, fill mode, n, A, lda, W, ...): size query writes lwork via the last int* */
+typedef cusolverStatus_t (*pfn_DsyevdBuf)(cusolverDnHandle_t, int, int,
+    int, const double *, int, const double *, int *);
+typedef cusolverStatus_t (*pfn_Dsyevd)   (cusolverDnHandle_t, int, int,
+    int, double *, int, double *, double *, int, int *);
+/* f32 pair: same argument layout as the two f64 typedefs, with float buffers */
+typedef cusolverStatus_t (*pfn_SsyevdBuf)(cusolverDnHandle_t, int, int,
+    int, const float *, int, const float *, int *);
+typedef cusolverStatus_t (*pfn_Ssyevd)   (cusolverDnHandle_t, int, int,
+    int, float *, int, float *, float *, int, int *);
+
+/* ── module-level state ── */
+static cusolverDnHandle_t g_handle    = NULL;  /* created once in load_and_init(); NULL means "not usable" */
+static pfn_Create         fn_Create   = NULL;  /* fn_* pointers are filled by dlsym in load_and_init() */
+static pfn_SetStream      fn_SetStr   = NULL;
+static pfn_DsyevdBuf      fn_DsyBuf  = NULL;  /* f64 workspace-size query */
+static pfn_Dsyevd         fn_Dsyevd  = NULL;  /* f64 solver */
+static pfn_SsyevdBuf      fn_SsyBuf  = NULL;  /* f32 workspace-size query */
+static pfn_Ssyevd         fn_Ssyevd  = NULL;  /* f32 solver */
+static int                g_init      = 0;  /* 0 = uninitialised; ensure_init() sets 1 before the first load attempt, even if it fails. NOTE(review): no lock guards this state */
+
+/* One-shot setup: resolve the cuSOLVER entry points, create g_handle and bind it to ArrayFire's
+ * stream. AF_ERR_RUNTIME = cuSOLVER or a CUDA stream is unavailable; AF_ERR_INTERNAL = a cuSOLVER call failed. */
+static af_err load_and_init(void)
+{
+    /* Prefer the copy the AF CUDA backend has already mapped (RTLD_NOLOAD),
+     * then try to load the versioned and finally the unversioned name.  */
+    void *lib = dlopen("libcusolver.so.11", RTLD_NOW | RTLD_NOLOAD);
+    if (!lib) lib = dlopen("libcusolver.so.11", RTLD_NOW | RTLD_GLOBAL);
+    if (!lib) lib = dlopen("libcusolver.so",    RTLD_NOW | RTLD_GLOBAL);
+    if (!lib) return AF_ERR_RUNTIME;
+
+    fn_Create  = (pfn_Create)     dlsym(lib, "cusolverDnCreate");
+    fn_SetStr  = (pfn_SetStream)  dlsym(lib, "cusolverDnSetStream");
+    fn_DsyBuf  = (pfn_DsyevdBuf) dlsym(lib, "cusolverDnDsyevd_bufferSize");
+    fn_Dsyevd  = (pfn_Dsyevd)    dlsym(lib, "cusolverDnDsyevd");
+    fn_SsyBuf  = (pfn_SsyevdBuf) dlsym(lib, "cusolverDnSsyevd_bufferSize");
+    fn_Ssyevd  = (pfn_Ssyevd)    dlsym(lib, "cusolverDnSsyevd");
+    if (!fn_Create || !fn_SetStr || !fn_DsyBuf || !fn_Dsyevd || !fn_SsyBuf || !fn_Ssyevd)
+        return AF_ERR_RUNTIME;
+
+    /* cuSOLVER work must be queued on the stream ArrayFire uses for the
+     * device that is active now (where the handle is created), not on
+     * device 0's; no stream at all means the CUDA backend is not active. */
+    int dev = 0;
+    cudaStream_t stream = NULL;
+    if (af_get_device(&dev) != AF_SUCCESS ||
+        afcu_get_stream(&stream, dev) != AF_SUCCESS) return AF_ERR_RUNTIME;
+    if (fn_Create(&g_handle) != CUSOLVER_STATUS_SUCCESS) { g_handle = NULL; return AF_ERR_INTERNAL; }
+    /* A handle that could not be bound would race with AF operations, so it is dropped (ensure_init() then keeps failing). */
+    if (stream && fn_SetStr(g_handle, stream) != CUSOLVER_STATUS_SUCCESS) { g_handle = NULL; return AF_ERR_INTERNAL; }
+    return AF_SUCCESS;
+}
+
+/* Run load_and_init() on the first call only; later calls just report whether a handle exists. */
+static af_err ensure_init(void)
+{
+    if (!g_init) { g_init = 1; return load_and_init(); }
+    return (g_handle != NULL) ? AF_SUCCESS : AF_ERR_RUNTIME;
+}
+
+/* ── core eigensolver: eigenvectors overwrite d_A (n×n), eigenvalues go to d_W (n); AF_SUCCESS only if devInfo == 0 ── */
+static af_err run_syevd(int is_double, int n, void *d_A, void *d_W)
+{
+    const int jobz = CUSOLVER_EIG_MODE_VECTOR, uplo = CUBLAS_FILL_MODE_LOWER;
+    int lwork = 0, h_info = 0;
+    cusolverStatus_t st;
+    /* Step 1: query the workspace size (counted in elements, not bytes). */
+    if (is_double)
+        st = fn_DsyBuf(g_handle, jobz, uplo, n, (const double *)d_A, n, (const double *)d_W, &lwork);
+    else
+        st = fn_SsyBuf(g_handle, jobz, uplo, n, (const float *)d_A, n, (const float *)d_W, &lwork);
+    if (st != CUSOLVER_STATUS_SUCCESS || lwork < 0) return AF_ERR_INTERNAL;
+    dim_t wsz = (dim_t)lwork * (is_double ? sizeof(double) : sizeof(float));
+
+    /* devInfo lives in a one-element s32 af_array so it can be copied back through ArrayFire (no CUDA runtime symbols are loaded here). */
+    af_array info = 0;
+    dim_t one = 1;
+    void *d_work = NULL, *d_info = NULL;
+    af_err err;
+    if ((err = af_constant(&info, 0.0, 1, &one, s32)) != AF_SUCCESS) return err;
+    if ((err = af_get_device_ptr(&d_info, info)) == AF_SUCCESS) {
+        if ((err = af_alloc_device_v2(&d_work, wsz)) == AF_SUCCESS) {
+            /* Step 2: decompose in place. */
+            if (is_double)
+                st = fn_Dsyevd(g_handle, jobz, uplo, n, (double *)d_A, n, (double *)d_W,
+                               (double *)d_work, lwork, (int *)d_info);
+            else
+                st = fn_Ssyevd(g_handle, jobz, uplo, n, (float *)d_A, n, (float *)d_W,
+                               (float *)d_work, lwork, (int *)d_info);
+            af_free_device_v2(d_work);
+            if (st != CUSOLVER_STATUS_SUCCESS) err = AF_ERR_INTERNAL;
+        }
+        af_unlock_array(info);
+    }
+    /* Step 3: the status alone does not cover the solver outcome; devInfo != 0 reports a rejected argument or non-convergence. */
+    if (err == AF_SUCCESS) err = af_get_data_ptr(&h_info, info);
+    af_release_array(info);
+    return (err == AF_SUCCESS && h_info != 0) ? AF_ERR_INTERNAL : err;
+}
+
+/* ── public entry point exposed to Haskell ──
+ * input must be one square f32/f64 matrix on the CUDA backend. On AF_SUCCESS
+ * *evals_out (n values) and *evecs_out (n×n) receive new arrays the caller
+ * must release; on any error neither output is written.                   */
+af_err af_eigsh(af_array *evals_out, af_array *evecs_out, const af_array input)
+{
+    af_err err;
+    if (!evals_out || !evecs_out) return AF_ERR_ARG;
+
+    /* The raw pointers handed to cuSOLVER must be CUDA device pointers, so
+     * other backends are rejected here, before ensure_init() is attempted. */
+    af_backend backend;
+    if ((err = af_get_backend_id(&backend, input)) != AF_SUCCESS) return err;
+    if (backend != AF_BACKEND_CUDA) return AF_ERR_RUNTIME;
+    if ((err = ensure_init()) != AF_SUCCESS) return err;
+
+    af_dtype dtype;
+    if ((err = af_get_type(&dtype, input)) != AF_SUCCESS) return err;
+    if (dtype != f64 && dtype != f32) return AF_ERR_TYPE;
+
+    /* run_syevd() addresses an n×n block with leading dimension n, so accept
+     * only a single, non-empty, square matrix whose order fits in an int.  */
+    dim_t d0, d1, d2, d3;
+    if ((err = af_get_dims(&d0, &d1, &d2, &d3, input)) != AF_SUCCESS) return err;
+    int n = (int)d0;
+    if (d0 < 1 || (dim_t)n != d0 || d1 != d0 || d2 != 1 || d3 != 1) return AF_ERR_SIZE;
+
+    /* Working copy: cuSOLVER overwrites A in-place with eigenvectors */
+    af_array evecs = 0, evals = 0;
+    if ((err = af_copy_array(&evecs, input)) != AF_SUCCESS) return err;
+
+    /* Eigenvalue output (n elements), then lock both arrays to obtain the raw
+     * device pointers; every lock taken is released before leaving.       */
+    void *d_A = NULL, *d_W = NULL;
+    if ((err = af_constant(&evals, 0.0, 1, &d0, dtype)) == AF_SUCCESS &&
+        (err = af_get_device_ptr(&d_A, evecs)) == AF_SUCCESS) {
+        if ((err = af_get_device_ptr(&d_W, evals)) == AF_SUCCESS) {
+            err = run_syevd(dtype == f64, n, d_A, d_W);
+            af_unlock_array(evals);
+        }
+        af_unlock_array(evecs);
+    }
+
+    if (err != AF_SUCCESS) {
+        if (evals) af_release_array(evals);
+        af_release_array(evecs);
+        return err;
+    }
+
+    /* Unlocked again, so ArrayFire owns the buffers cuSOLVER wrote into. */
+    *evals_out = evals;
+    *evecs_out = evecs;
+    return AF_SUCCESS;
+}
diff --git a/src/ArrayFire.hs b/src/ArrayFire.hs
@@ -45,6 +45,7 @@ module ArrayFire
   , module ArrayFire.Graphics
   , module ArrayFire.Image
   , module ArrayFire.Index
+  , module ArrayFire.Jacobi
   , module ArrayFire.LAPACK
   , module ArrayFire.Random
   , module ArrayFire.Signal
@@ -71,6 +72,7 @@ import ArrayFire.Features
 import ArrayFire.Graphics
 import ArrayFire.Image
 import ArrayFire.Index
+import ArrayFire.Jacobi
 import ArrayFire.LAPACK
 import ArrayFire.Random
 import ArrayFire.Signal
diff --git a/src/ArrayFire/Internal/LAPACK.hsc b/src/ArrayFire/Internal/LAPACK.hsc
@@ -37,3 +37,5 @@ foreign import ccall unsafe "af_norm"
     af_norm :: Ptr Double -> AFArray -> AFNormType -> Double -> Double -> IO AFErr
 foreign import ccall unsafe "af_is_lapack_available"
     af_is_lapack_available :: Ptr CBool -> IO AFErr
+foreign import ccall unsafe "af_eigsh"
+    af_eigsh :: Ptr AFArray -> Ptr AFArray -> AFArray -> IO AFErr
diff --git a/src/ArrayFire/Jacobi.hs b/src/ArrayFire/Jacobi.hs
@@ -0,0 +1,140 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeApplications    #-}
+--------------------------------------------------------------------------------
+-- |
+-- Module      : ArrayFire.Jacobi
+-- Copyright   : David Johnson (c) 2019-2026
+-- License     : BSD 3
+-- Maintainer  : David Johnson <code@dmj.io>
+-- Stability   : Experimental
+-- Portability : GHC
+--
+-- Jacobi eigenvalue algorithm for real symmetric matrices.
+--
+-- @
+-- >>> let a = matrix \@Double (3,3) [[4,2,2],[2,3,0],[2,0,3]]
+-- >>> let (evals, evecs) = jacobi a 100 1e-10
+-- @
+--
+--------------------------------------------------------------------------------
+module ArrayFire.Jacobi
+  ( jacobi
+  ) where
+--------------------------------------------------------------------------------
+import Foreign.Storable         (Storable)
+--------------------------------------------------------------------------------
+import ArrayFire.Algorithm      (sumAll)
+import ArrayFire.Arith          (add, sub, mul)
+import ArrayFire.Array          (getDims, getScalar, scalar)
+import ArrayFire.Data           (diagCreate, diagExtract, identity)
+import ArrayFire.Index          (index, assignSeq)
+import ArrayFire.Internal.Types (Array, AFType)
+import ArrayFire.Types          (Seq (..))
+--------------------------------------------------------------------------------
+-- | Eigendecomposition of a real symmetric n×n matrix by cyclic Jacobi sweeps.
+--
+-- A sweep visits every index pair (p,q) with p < q once, row by row, and
+-- hands it to @applyPair@. Sweeping stops as soon as @maxSweeps@ sweeps have
+-- run or the off-diagonal Frobenius norm has dropped below @tol@.
+--
+-- The result is @(eigenvalues, eigenvectors)@:
+--
+--   * @eigenvalues@ — the diagonal of the rotated matrix as a length-n
+--     vector; entry @i@ belongs to column @i@ of @eigenvectors@.
+--   * @eigenvectors@ — the n×n product of all applied rotations, one
+--     eigenvector per column.
+--
+-- Host round-trips are limited to the scalar reads made by @applyPair@ for
+-- each pair and one norm read per sweep; everything else is array operations.
+--
+-- Eigenvalues come back /unsorted/; use 'ArrayFire.Algorithm.sort' if needed.
+jacobi
+  :: forall a . (AFType a, RealFloat a, Storable a)
+  => Array a   -- ^ real symmetric n×n matrix
+  -> Int       -- ^ maximum number of sweeps
+  -> a         -- ^ convergence tolerance (off-diagonal Frobenius norm)
+  -> (Array a, Array a)
+  -- ^ (eigenvalues vector, eigenvectors matrix)
+jacobi mat maxSweeps tol = sweep 0 mat (identity @a [n, n])
+  where
+    (n, _, _, _) = getDims mat
+    -- strict upper-triangle positions (p < q), enumerated row by row
+    upper        = [(r, c) | r <- [0 .. n - 2], c <- [r + 1 .. n - 1]]
+
+    -- the sweep budget is tested before the norm, so an exhausted budget
+    -- never costs another norm read
+    sweep done cur vecs
+      | done >= maxSweeps || offDiagNorm cur < tol = (diagExtract cur 0, vecs)
+      | otherwise =
+          uncurry (sweep (done + 1))
+                  (foldl' (applyPair n tol) (cur, vecs) upper)
+--------------------------------------------------------------------------------
+-- | Apply one Givens rotation for the pair (p,q) to A and accumulate it in V.
+--
+-- No n×n rotation matrix is formed: rows p and q of A are recombined (left
+-- multiply by Gᵀ), then columns p and q of that result and of V (right
+-- multiply by G). c and s are computed on the host from three element reads
+-- (apq, app, aqq) and uploaded as one-element arrays that broadcast.
+--
+-- The pair is skipped, returning @(a, v)@ unchanged, when @apq@ is exactly
+-- zero or @abs apq < tol@. The exact-zero test matters for @tol <= 0@: a zero
+-- @apq@ makes @tau@ infinite or, when @app == aqq@, NaN, which would spread.
+applyPair
+  :: forall a . (AFType a, RealFloat a, Storable a)
+  => Int -> a -> (Array a, Array a) -> (Int, Int) -> (Array a, Array a)
+applyPair n tol (a, v) (p, q) =
+  let apq = getElem a p q
+  in if apq == 0 || abs apq < tol
+     then (a, v)
+     else
+       let app = getElem a p p
+           aqq = getElem a q q
+           tau = (aqq - app) / (2 * apq)
+           t   | tau >= 0  = 1 / (tau + sqrt (1 + tau * tau))
+               | otherwise = (-1) / (negate tau + sqrt (1 + tau * tau))
+           c   = 1 / sqrt (1 + t * t)
+           s   = t * c
+           -- GPU scalar arrays; broadcast over column/row vectors via af_mul batch=1
+           sc  = scalar c
+           ss  = scalar s
+           -- Left-multiply A by Gᵀ: update rows p and q
+           rowAp   = getRow a p
+           rowAq   = getRow a q
+           rowAp'  = (sc `mul` rowAp) `sub` (ss `mul` rowAq)
+           rowAq'  = (ss `mul` rowAp) `add` (sc `mul` rowAq)
+           a1      = setRow (setRow a p rowAp') q rowAq'
+           -- Right-multiply A by G: update cols p and q of the row-updated A
+           colA1p  = getCol a1 p
+           colA1q  = getCol a1 q
+           colAp'  = (sc `mul` colA1p) `sub` (ss `mul` colA1q)
+           colAq'  = (ss `mul` colA1p) `add` (sc `mul` colA1q)
+           a2      = setCol (setCol a1 p colAp') q colAq'
+           -- Right-multiply V by G: update cols p and q
+           vp      = getCol v p
+           vq      = getCol v q
+           vp'     = (sc `mul` vp) `sub` (ss `mul` vq)
+           vq'     = (ss `mul` vp) `add` (sc `mul` vq)
+           v'      = setCol (setCol v p vp') q vq'
+       in (a2, v')
+  where
+    n1           = fromIntegral (n - 1) :: Double
+    getRow arr i = index arr    [Seq (fromIntegral i) (fromIntegral i) 1, Seq 0 n1 1]
+    getCol arr j = index arr    [Seq 0 n1 1, Seq (fromIntegral j) (fromIntegral j) 1]
+    setRow arr i = assignSeq arr [Seq (fromIntegral i) (fromIntegral i) 1, Seq 0 n1 1]
+    setCol arr j = assignSeq arr [Seq 0 n1 1, Seq (fromIntegral j) (fromIntegral j) 1]
+--------------------------------------------------------------------------------
+-- Read entry (i, j) back to the host as a scalar (one GPU→CPU transfer per call).
+getElem :: (AFType a, Storable a) => Array a -> Int -> Int -> a
+getElem arr i j = getScalar (index arr [at i, at j])
+  where
+    -- single-position sequence: begin = end = k, step 1
+    at k = let k' = fromIntegral k in Seq k' k' 1
+--------------------------------------------------------------------------------
+-- Frobenius norm of A with its diagonal removed (one GPU→CPU read per call).
+offDiagNorm :: forall a . (AFType a, RealFloat a) => Array a -> a
+offDiagNorm m = sqrt (realToFrac total)
+  where
+    -- subtracting the diagonal, rebuilt as a matrix, leaves only off-diagonal entries
+    strict     = m `sub` diagCreate (diagExtract m 0) 0
+    (total, _) = sumAll (strict `mul` strict)
+--------------------------------------------------------------------------------
diff --git a/src/ArrayFire/LAPACK.hs b/src/ArrayFire/LAPACK.hs
diff --git a/test/ArrayFire/JacobiSpec.hs b/test/ArrayFire/JacobiSpec.hs
diff --git a/test/ArrayFire/LAPACKSpec.hs b/test/ArrayFire/LAPACKSpec.hs