Skip to content

Commit ddabf14

Browse files
dmjioclaude
andcommitted
Add Jacobi eigenvalue algorithm and eigSH via cuSOLVER
Adds two complementary paths for symmetric eigendecomposition: * `ArrayFire.Jacobi` — pure-Haskell cyclic Jacobi method that runs entirely on the GPU via ArrayFire array ops, with only 3 scalar GPU→CPU reads per active off-diagonal pair (for the rotation angle) plus one per sweep (convergence check). Works on any AF backend. * `eigSH` in `ArrayFire.LAPACK` — wraps a new C shim (`cbits/eigsh.c`) that calls `cusolverDnDsyevd` / `cusolverDnSsyevd` directly via dlopen at runtime (no link-time CUDA dependency). Gracefully returns AF_ERR_RUNTIME on non-CUDA backends. Eigenvalues returned in ascending order, matching the hmatrix `eigSH` convention. Also expands the LAPACK test suite: fixes the `qr` test (was calling `lu` by mistake), splits the determinant test into separate real and complex cases with verified imaginary parts, uncomments and corrects the inverse test, and adds rank and norm tests. New `eigSH (CUDA)` tests skip gracefully on CPU/OpenCL via `pendingWith`. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 8914928 commit ddabf14

9 files changed

Lines changed: 588 additions & 17 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ result/
77
cabal.project.local
88
tags
99
/.stack-work/
10+
.ghc*

arrayfire.cabal

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ library
4646
ArrayFire.Graphics
4747
ArrayFire.Image
4848
ArrayFire.Index
49+
ArrayFire.Jacobi
4950
ArrayFire.LAPACK
5051
ArrayFire.Random
5152
ArrayFire.Signal
@@ -87,6 +88,9 @@ library
8788
af
8889
c-sources:
8990
cbits/wrapper.c
91+
cbits/eigsh.c
92+
if os(linux)
93+
extra-libraries: dl
9094
build-depends:
9195
base < 5, deepseq, filepath, vector
9296
hs-source-dirs:
@@ -176,6 +180,7 @@ test-suite test
176180
ArrayFire.GraphicsSpec
177181
ArrayFire.ImageSpec
178182
ArrayFire.IndexSpec
183+
ArrayFire.JacobiSpec
179184
ArrayFire.LAPACKSpec
180185
ArrayFire.RandomSpec
181186
ArrayFire.SignalSpec

cbits/eigsh.c

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
/*
2+
* cbits/eigsh.c
3+
*
4+
* GPU-resident symmetric eigendecomposition via cuSOLVER.
5+
*
6+
* Supports f32 (cusolverDnSsyevd) and f64 (cusolverDnDsyevd).
7+
* cuSOLVER is resolved at runtime through dlopen/dlsym — no link-time
8+
* dependency on CUDA toolkit. The only link-time requirements are
9+
* libaf (ArrayFire) and libdl.
10+
*
11+
* Returns AF_ERR_RUNTIME when CUDA backend is not active or cuSOLVER
12+
* cannot be found (graceful degradation on CPU/OpenCL builds).
13+
*
14+
* Ordering: cusolverDnDsyevd returns eigenvalues in ascending order,
15+
* matching hmatrix's eigSH convention.
16+
*/
17+
18+
#define _GNU_SOURCE
19+
#define AF_DEFINE_CUDA_TYPES /* gives us cudaStream_t in af/cuda.h */
20+
#include "arrayfire.h"
21+
#include "af/cuda.h"
22+
#include <dlfcn.h>
23+
#include <stddef.h>
24+
25+
/* ── minimal cuSOLVER types (avoids needing CUDA toolkit headers) ── */
26+
typedef void *cusolverDnHandle_t;
27+
typedef void *cudaStream_t_t; /* distinct name to avoid redefinition */
28+
typedef int cusolverStatus_t;
29+
30+
#define CUSOLVER_STATUS_SUCCESS 0
31+
#define CUBLAS_FILL_MODE_LOWER 0
32+
#define CUSOLVER_EIG_MODE_VECTOR 1
33+
34+
/* ── function pointer typedefs ── */
35+
typedef cusolverStatus_t (*pfn_Create) (cusolverDnHandle_t *);
36+
typedef cusolverStatus_t (*pfn_SetStream) (cusolverDnHandle_t, cudaStream_t);
37+
38+
typedef cusolverStatus_t (*pfn_DsyevdBuf)(cusolverDnHandle_t, int, int,
39+
int, const double *, int, const double *, int *);
40+
typedef cusolverStatus_t (*pfn_Dsyevd) (cusolverDnHandle_t, int, int,
41+
int, double *, int, double *, double *, int, int *);
42+
43+
typedef cusolverStatus_t (*pfn_SsyevdBuf)(cusolverDnHandle_t, int, int,
44+
int, const float *, int, const float *, int *);
45+
typedef cusolverStatus_t (*pfn_Ssyevd) (cusolverDnHandle_t, int, int,
46+
int, float *, int, float *, float *, int, int *);
47+
48+
/* ── module-level state ── */
49+
static cusolverDnHandle_t g_handle = NULL;
50+
static pfn_Create fn_Create = NULL;
51+
static pfn_SetStream fn_SetStr = NULL;
52+
static pfn_DsyevdBuf fn_DsyBuf = NULL;
53+
static pfn_Dsyevd fn_Dsyevd = NULL;
54+
static pfn_SsyevdBuf fn_SsyBuf = NULL;
55+
static pfn_Ssyevd fn_Ssyevd = NULL;
56+
static int g_init = 0; /* 0 = uninitialised */
57+
58+
static af_err load_and_init(void)
59+
{
60+
/* Try the exact versioned name first (already loaded by AF CUDA backend),
61+
* then fall back to an unversioned symlink if present. */
62+
void *lib = dlopen("libcusolver.so.11", RTLD_NOW | RTLD_NOLOAD);
63+
if (!lib) lib = dlopen("libcusolver.so.11", RTLD_NOW | RTLD_GLOBAL);
64+
if (!lib) lib = dlopen("libcusolver.so", RTLD_NOW | RTLD_GLOBAL);
65+
if (!lib) return AF_ERR_RUNTIME;
66+
67+
fn_Create = (pfn_Create) dlsym(lib, "cusolverDnCreate");
68+
fn_SetStr = (pfn_SetStream) dlsym(lib, "cusolverDnSetStream");
69+
fn_DsyBuf = (pfn_DsyevdBuf) dlsym(lib, "cusolverDnDsyevd_bufferSize");
70+
fn_Dsyevd = (pfn_Dsyevd) dlsym(lib, "cusolverDnDsyevd");
71+
fn_SsyBuf = (pfn_SsyevdBuf) dlsym(lib, "cusolverDnSsyevd_bufferSize");
72+
fn_Ssyevd = (pfn_Ssyevd) dlsym(lib, "cusolverDnSsyevd");
73+
74+
if (!fn_Create || !fn_SetStr || !fn_DsyBuf || !fn_Dsyevd ||
75+
!fn_SsyBuf || !fn_Ssyevd)
76+
return AF_ERR_RUNTIME;
77+
78+
if (fn_Create(&g_handle) != CUSOLVER_STATUS_SUCCESS)
79+
return AF_ERR_INTERNAL;
80+
81+
/* Bind cuSOLVER to ArrayFire's CUDA stream (device 0) so that
82+
* cuSOLVER kernels are sequenced correctly with AF operations. */
83+
cudaStream_t stream = NULL;
84+
if (afcu_get_stream(&stream, 0) == AF_SUCCESS && stream)
85+
fn_SetStr(g_handle, stream);
86+
87+
return AF_SUCCESS;
88+
}
89+
90+
static af_err ensure_init(void)
91+
{
92+
if (g_init) return g_handle ? AF_SUCCESS : AF_ERR_RUNTIME;
93+
g_init = 1;
94+
return load_and_init();
95+
}
96+
97+
/* ── core eigensolver: writes eigenvectors into d_A, eigenvalues into d_W ── */
98+
static af_err run_syevd(int is_double, int n, void *d_A, void *d_W)
99+
{
100+
int lwork;
101+
cusolverStatus_t st;
102+
103+
if (is_double) {
104+
st = fn_DsyBuf(g_handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
105+
n, (const double *)d_A, n, (const double *)d_W, &lwork);
106+
} else {
107+
st = fn_SsyBuf(g_handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
108+
n, (const float *)d_A, n, (const float *)d_W, &lwork);
109+
}
110+
if (st != CUSOLVER_STATUS_SUCCESS) return AF_ERR_INTERNAL;
111+
112+
dim_t wsz = (dim_t)lwork * (is_double ? sizeof(double) : sizeof(float));
113+
114+
void *d_work = NULL, *d_info = NULL;
115+
af_err err;
116+
if ((err = af_alloc_device_v2(&d_work, wsz)) != AF_SUCCESS) return err;
117+
if ((err = af_alloc_device_v2(&d_info, sizeof(int))) != AF_SUCCESS) {
118+
af_free_device_v2(d_work);
119+
return err;
120+
}
121+
122+
if (is_double) {
123+
st = fn_Dsyevd(g_handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
124+
n, (double *)d_A, n, (double *)d_W,
125+
(double *)d_work, lwork, (int *)d_info);
126+
} else {
127+
st = fn_Ssyevd(g_handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
128+
n, (float *)d_A, n, (float *)d_W,
129+
(float *)d_work, lwork, (int *)d_info);
130+
}
131+
132+
af_free_device_v2(d_work);
133+
af_free_device_v2(d_info);
134+
return (st == CUSOLVER_STATUS_SUCCESS) ? AF_SUCCESS : AF_ERR_INTERNAL;
135+
}
136+
137+
/* ── public entry point exposed to Haskell ── */
138+
af_err af_eigsh(af_array *evals_out, af_array *evecs_out, const af_array input)
139+
{
140+
af_err err;
141+
142+
if ((err = ensure_init()) != AF_SUCCESS) return err;
143+
144+
af_dtype dtype;
145+
if ((err = af_get_type(&dtype, input)) != AF_SUCCESS) return err;
146+
if (dtype != f64 && dtype != f32) return AF_ERR_TYPE;
147+
148+
dim_t d0, d1, d2, d3;
149+
if ((err = af_get_dims(&d0, &d1, &d2, &d3, input)) != AF_SUCCESS) return err;
150+
int n = (int)d0;
151+
152+
/* Working copy: cuSOLVER overwrites A in-place with eigenvectors */
153+
af_array evecs;
154+
if ((err = af_copy_array(&evecs, input)) != AF_SUCCESS) return err;
155+
156+
/* Eigenvalue output: n-element array owned and managed by ArrayFire */
157+
af_array evals;
158+
dim_t n_dim = (dim_t)n;
159+
if ((err = af_constant(&evals, 0.0, 1, &n_dim, dtype)) != AF_SUCCESS) {
160+
af_release_array(evecs);
161+
return err;
162+
}
163+
164+
/* Lock both arrays and obtain raw device pointers for cuSOLVER */
165+
void *d_A = NULL, *d_W = NULL;
166+
if ((err = af_get_device_ptr(&d_A, evecs)) != AF_SUCCESS) {
167+
af_release_array(evecs); af_release_array(evals);
168+
return err;
169+
}
170+
if ((err = af_get_device_ptr(&d_W, evals)) != AF_SUCCESS) {
171+
af_unlock_array(evecs);
172+
af_release_array(evecs); af_release_array(evals);
173+
return err;
174+
}
175+
176+
err = run_syevd(dtype == f64, n, d_A, d_W);
177+
178+
/* Unlock: ArrayFire resumes ownership and sees the in-place modifications */
179+
af_unlock_array(evecs);
180+
af_unlock_array(evals);
181+
182+
if (err != AF_SUCCESS) {
183+
af_release_array(evecs); af_release_array(evals);
184+
return err;
185+
}
186+
187+
*evals_out = evals;
188+
*evecs_out = evecs;
189+
return AF_SUCCESS;
190+
}

src/ArrayFire.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ module ArrayFire
4545
, module ArrayFire.Graphics
4646
, module ArrayFire.Image
4747
, module ArrayFire.Index
48+
, module ArrayFire.Jacobi
4849
, module ArrayFire.LAPACK
4950
, module ArrayFire.Random
5051
, module ArrayFire.Signal
@@ -71,6 +72,7 @@ import ArrayFire.Features
7172
import ArrayFire.Graphics
7273
import ArrayFire.Image
7374
import ArrayFire.Index
75+
import ArrayFire.Jacobi
7476
import ArrayFire.LAPACK
7577
import ArrayFire.Random
7678
import ArrayFire.Signal

src/ArrayFire/Internal/LAPACK.hsc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,5 @@ foreign import ccall unsafe "af_norm"
3737
af_norm :: Ptr Double -> AFArray -> AFNormType -> Double -> Double -> IO AFErr
3838
foreign import ccall unsafe "af_is_lapack_available"
3939
af_is_lapack_available :: Ptr CBool -> IO AFErr
40+
foreign import ccall unsafe "af_eigsh"
41+
af_eigsh :: Ptr AFArray -> Ptr AFArray -> AFArray -> IO AFErr

src/ArrayFire/Jacobi.hs

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
{-# LANGUAGE ScopedTypeVariables #-}
2+
{-# LANGUAGE TypeApplications #-}
3+
--------------------------------------------------------------------------------
4+
-- |
5+
-- Module : ArrayFire.Jacobi
6+
-- Copyright : David Johnson (c) 2019-2026
7+
-- License : BSD 3
8+
-- Maintainer : David Johnson <code@dmj.io>
9+
-- Stability : Experimental
10+
-- Portability : GHC
11+
--
12+
-- Jacobi eigenvalue algorithm for real symmetric matrices.
13+
--
14+
-- @
15+
-- >>> let a = matrix \@Double (3,3) [[4,2,2],[2,3,0],[2,0,3]]
16+
-- >>> let (evals, evecs) = jacobi a 100 1e-10
17+
-- @
18+
--
19+
--------------------------------------------------------------------------------
20+
module ArrayFire.Jacobi
21+
( jacobi
22+
) where
23+
--------------------------------------------------------------------------------
24+
import Foreign.Storable (Storable)
25+
--------------------------------------------------------------------------------
26+
import ArrayFire.Algorithm (sumAll)
27+
import ArrayFire.Arith (add, sub, mul)
28+
import ArrayFire.Array (getDims, getScalar, scalar)
29+
import ArrayFire.Data (diagCreate, diagExtract, identity)
30+
import ArrayFire.Index (index, assignSeq)
31+
import ArrayFire.Internal.Types (Array, AFType)
32+
import ArrayFire.Types (Seq (..))
33+
--------------------------------------------------------------------------------
34+
-- | Jacobi eigenvalue decomposition for real symmetric n×n matrices.
35+
--
36+
-- Uses the cyclic Jacobi method: each sweep applies a Givens rotation to
37+
-- every off-diagonal pair (p,q) with p < q. Repeats until the off-diagonal
38+
-- Frobenius norm is below @tol@ or @maxSweeps@ is reached.
39+
--
40+
-- Returns @(eigenvalues, eigenvectors)@:
41+
--
42+
-- * @eigenvalues@ — length-n vector; element @i@ is the eigenvalue
43+
-- associated with column @i@ of @eigenvectors@.
44+
-- * @eigenvectors@ — n×n orthogonal matrix; columns are eigenvectors.
45+
--
46+
-- All intermediate computation stays on the GPU. The only CPU round-trips
47+
-- are the three scalar reads per pair (apq, app, aqq) needed to compute the
48+
-- rotation angle, and one scalar read per sweep for the convergence check.
49+
--
50+
-- The eigenvalues are /not/ sorted; use 'ArrayFire.Algorithm.sort' if needed.
51+
jacobi
52+
:: forall a . (AFType a, RealFloat a, Storable a)
53+
=> Array a -- ^ real symmetric n×n matrix
54+
-> Int -- ^ maximum number of sweeps
55+
-> a -- ^ convergence tolerance (off-diagonal Frobenius norm)
56+
-> (Array a, Array a)
57+
-- ^ (eigenvalues vector, eigenvectors matrix)
58+
jacobi mat maxSweeps tol = go 0 mat (identity @a [n, n])
59+
where
60+
(n, _, _, _) = getDims mat
61+
pairs = [(p, q) | p <- [0 .. n - 2], q <- [p + 1 .. n - 1]]
62+
63+
go sweepNum a v
64+
| sweepNum >= maxSweeps = finish a v
65+
| offDiagNorm a < tol = finish a v
66+
| otherwise =
67+
let (a', v') = foldl' (applyPair n tol) (a, v) pairs
68+
in go (sweepNum + 1) a' v'
69+
70+
finish a v = (diagExtract a 0, v)
71+
--------------------------------------------------------------------------------
72+
-- | Apply a single Givens rotation at position (p,q) entirely on the GPU.
73+
--
74+
-- Instead of building an n×n rotation matrix and doing two O(n³) matmuls,
75+
-- this updates only the two affected rows of A (left multiply by Gᵀ) and
76+
-- the two affected columns of A and V (right multiply by G). Each update is
77+
-- O(n) and never materialises the full rotation matrix on either CPU or GPU.
78+
--
79+
-- The only GPU→CPU transfers are the three element reads for apq, app, aqq.
80+
-- c and s are uploaded as [1]-element GPU scalars and broadcast across the
81+
-- column/row vectors.
82+
applyPair
83+
:: forall a . (AFType a, RealFloat a, Storable a)
84+
=> Int -> a -> (Array a, Array a) -> (Int, Int) -> (Array a, Array a)
85+
applyPair n tol (a, v) (p, q) =
86+
let apq = getElem a p q
87+
in if abs apq < tol
88+
then (a, v)
89+
else
90+
let app = getElem a p p
91+
aqq = getElem a q q
92+
tau = (aqq - app) / (2 * apq)
93+
t | tau >= 0 = 1 / (tau + sqrt (1 + tau * tau))
94+
| otherwise = (-1) / (negate tau + sqrt (1 + tau * tau))
95+
c = 1 / sqrt (1 + t * t)
96+
s = t * c
97+
-- GPU scalar arrays; broadcast over column/row vectors via af_mul batch=1
98+
sc = scalar c
99+
ss = scalar s
100+
-- Left-multiply A by Gᵀ: update rows p and q
101+
rowAp = getRow a p
102+
rowAq = getRow a q
103+
rowAp' = (sc `mul` rowAp) `sub` (ss `mul` rowAq)
104+
rowAq' = (ss `mul` rowAp) `add` (sc `mul` rowAq)
105+
a1 = setRow (setRow a p rowAp') q rowAq'
106+
-- Right-multiply A by G: update cols p and q of the row-updated A
107+
colA1p = getCol a1 p
108+
colA1q = getCol a1 q
109+
colAp' = (sc `mul` colA1p) `sub` (ss `mul` colA1q)
110+
colAq' = (ss `mul` colA1p) `add` (sc `mul` colA1q)
111+
a2 = setCol (setCol a1 p colAp') q colAq'
112+
-- Right-multiply V by G: update cols p and q
113+
vp = getCol v p
114+
vq = getCol v q
115+
vp' = (sc `mul` vp) `sub` (ss `mul` vq)
116+
vq' = (ss `mul` vp) `add` (sc `mul` vq)
117+
v' = setCol (setCol v p vp') q vq'
118+
in (a2, v')
119+
where
120+
n1 = fromIntegral (n - 1) :: Double
121+
getRow arr i = index arr [Seq (fromIntegral i) (fromIntegral i) 1, Seq 0 n1 1]
122+
getCol arr j = index arr [Seq 0 n1 1, Seq (fromIntegral j) (fromIntegral j) 1]
123+
setRow arr i = assignSeq arr [Seq (fromIntegral i) (fromIntegral i) 1, Seq 0 n1 1]
124+
setCol arr j = assignSeq arr [Seq 0 n1 1, Seq (fromIntegral j) (fromIntegral j) 1]
125+
--------------------------------------------------------------------------------
126+
-- Extract the scalar at row i, column j (GPU→CPU; called 3× per active pair).
127+
getElem :: (AFType a, Storable a) => Array a -> Int -> Int -> a
128+
getElem arr i j =
129+
getScalar $
130+
index arr [ Seq (fromIntegral i) (fromIntegral i) 1
131+
, Seq (fromIntegral j) (fromIntegral j) 1 ]
132+
--------------------------------------------------------------------------------
133+
-- Frobenius norm of the strict off-diagonal part of A (one GPU→CPU per sweep).
134+
offDiagNorm :: forall a . (AFType a, RealFloat a) => Array a -> a
135+
offDiagNorm a =
136+
let d = diagCreate (diagExtract a 0) 0
137+
offDiag = a `sub` d
138+
sq = offDiag `mul` offDiag
139+
in sqrt . realToFrac . fst $ sumAll sq
140+
--------------------------------------------------------------------------------

0 commit comments

Comments
 (0)