\documentclass[9pt,twocolumn]{extarticle}
\usepackage[margin=0.5in,top=0.6in,bottom=0.6in]{geometry}
\usepackage{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{graphicx}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{hyperref}
\usepackage{times}
\title{\textbf{Parallel Sparse Matrix-Vector Multiplication using OpenMP}}
\author{
Ranime Shehata\\
\textit{Department of Computer Science}\\
\textit{Cairo University}\\
ranime.shehata@example.com
}
\date{}
\begin{document}
\maketitle
\begin{abstract}
Sparse Matrix-Vector Multiplication (SpMV) is a fundamental kernel in scientific computing. This paper presents serial and parallel SpMV implementations using the Compressed Sparse Row (CSR) format with OpenMP parallelization. Experimental results demonstrate up to 1.43× speedup on 2 cores for matrices of 10\% density. The implementation is validated against a dense ground-truth computation with exact error $= 0$.
\end{abstract}
\noindent\textbf{Keywords:} Sparse matrices, parallel computing, OpenMP, CSR format, performance optimization
\section{Introduction}
Sparse matrices, in which most elements are zero, are fundamental to scientific computing, with applications including finite element analysis, graph algorithms, and machine learning. Sparse matrix-vector multiplication (SpMV) is the critical kernel in iterative solvers, often accounting for up to 80\% of execution time \cite{saad2003iterative}.
\textbf{Problem:} Given sparse matrix $A \in \mathbb{R}^{m \times n}$ and vector $x \in \mathbb{R}^{n}$, compute $y \in \mathbb{R}^{m}$:
\begin{equation}
y_i = \sum_{j=1}^{n} A_{ij} \cdot x_j \quad \forall i \in [1,m]
\end{equation}
The challenge is to parallelize SpMV efficiently while storing only the non-zero elements, achieving speedup without sacrificing accuracy.
\section{Algorithm Design}
\subsection{CSR Format}
Compressed Sparse Row stores a sparse matrix in three arrays: \texttt{values[]} (the non-zero entries), \texttt{col\_indices[]} (their column indices), and \texttt{row\_ptr[]} ($m+1$ entries marking where each row begins). Memory: $O(2 \cdot nnz + m)$ versus $O(m \cdot n)$ for dense storage.
\subsection{Foster's Methodology}
Following Foster's design methodology \cite{foster1995designing}: \textbf{Partitioning:} decompose by rows ($m$ tasks). \textbf{Communication:} the input vector is read-only, so no inter-thread communication is needed. \textbf{Agglomeration:} static scheduling for uniform distribution of rows. \textbf{Mapping:} threads are mapped to cores by the OpenMP runtime.
\section{Implementation}
\begin{algorithm}[h]
\small
\caption{Serial SpMV}
\begin{algorithmic}[1]
\Require Matrix $A$ (CSR), vector $x$
\Ensure Result $y$
\State $y \gets$ zeros($m$)
\For{$i = 0$ to $m-1$}
\State $sum \gets 0$
\For{$j = row\_ptr[i]$ to $row\_ptr[i+1] - 1$}
\State $sum \gets sum + values[j] \times x[col\_indices[j]]$
\EndFor
\State $y[i] \gets sum$
\EndFor
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[h]
\small
\caption{Parallel SpMV (OpenMP)}
\begin{algorithmic}[1]
\Require Matrix $A$ (CSR), vector $x$
\Ensure Result $y$
\State $y \gets$ zeros($m$)
\State \textbf{parallel for} (static schedule)
\For{$i = 0$ to $m-1$}
\State $sum \gets 0$
\For{$j = row\_ptr[i]$ to $row\_ptr[i+1] - 1$}
\State $sum \gets sum + values[j] \times x[col\_indices[j]]$
\EndFor
\State $y[i] \gets sum$
\EndFor
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[h]
\small
\caption{CUDA SpMV (GPU)}
\begin{algorithmic}[1]
\Require Matrix $A$ (CSR), vector $x$
\Ensure Result $y$
\State Copy $A$, $x$ to GPU memory
\State $blocks \gets \lceil m / blockSize \rceil$
\State Launch kernel($blocks$, $blockSize$):
\State \quad $tid \gets blockIdx.x \times blockDim.x + threadIdx.x$
\State \quad \textbf{if} $tid < m$ \textbf{then}
\State \quad \quad $sum \gets 0$
\State \quad \quad \textbf{for} $j = row\_ptr[tid]$ to $row\_ptr[tid+1] - 1$
\State \quad \quad \quad $sum \gets sum + values[j] \times x[col\_indices[j]]$
\State \quad \quad \textbf{end for}
\State \quad \quad $y[tid] \gets sum$
\State \quad \textbf{end if}
\State Copy $y$ back to host
\end{algorithmic}
\end{algorithm}
All implementations run in $O(nnz)$ time with $O(m)$ auxiliary space for the output vector.
\section{Validation}
\subsection{Methodology}
Correctness is established by comparing each implementation (serial, OpenMP, CUDA) against dense matrix multiplication as ground truth. Dense multiplication computes $y_i = \sum_{j=1}^{n} A_{ij} \cdot x_j$ exactly. Error metric: $\epsilon_{max} = \max_i |y_i^{impl} - y_i^{truth}|$.
\subsection{Test Suite}
\textbf{Test 1 - Small Predefined (4×4):} Known matrix (nnz=7, 43.8\% density) validates correct CSR indexing and computation. Ensures row boundaries handled properly.
\textbf{Test 2 - Random Sparse (100×100, 10\% density):} Stochastic validation with 1,002 non-zeros. Tests irregular sparsity patterns and column index mapping.
\textbf{Test 3 - Very Sparse (200×200, 2\% density):} Extreme sparsity (802 non-zeros) validates efficiency with minimal data. Tests edge cases with many empty rows.
\textbf{Test 4 - Diagonal (5×5):} Identity-like matrix (nnz=5, 20\% density) validates diagonal special case and boundary conditions.
\subsection{Results}
Both serial and OpenMP implementations achieve $\epsilon_{max} = 0$ (exact match) across all test cases, confirming:
\textbf{(1)} CSR format correctly encodes sparse data;
\textbf{(2)} Serial algorithm computes exact results;
\textbf{(3)} OpenMP parallelization preserves numerical accuracy: each output row is computed by exactly one thread and the within-row summation order is unchanged, so results are bit-identical to the serial version (no race conditions).
The exact match confirms implementation soundness and shows that parallelization introduces no computational artifacts.
\section{Performance Evaluation}
\subsection{Experimental Setup}
\textbf{Hardware:} 2-core CPU, GCC -O2 optimization. \textbf{Workload:} 10\% density matrices, sizes 100×100 to 10,000×10,000. \textbf{Methodology:} 2 warmup runs + 5 timed runs, average reported.
\subsection{Measured Speedup}
\begin{table}[h]
\caption{Performance Results}
\centering
\scriptsize
\begin{tabular}{|c|c|c|c|c|}
\hline
\textbf{Size} & \textbf{Serial} & \textbf{Parallel} & \textbf{Speedup} & \textbf{Eff.} \\
& \textbf{(ms)} & \textbf{(ms)} & \textbf{(×)} & \textbf{(\%)} \\
\hline
100 & 0.001 & 0.002 & 0.84 & 42.2 \\
200 & 0.005 & 0.005 & 1.05 & 52.4 \\
500 & 0.032 & 0.026 & 1.22 & 60.9 \\
1,000 & 0.118 & 0.113 & 1.05 & 52.4 \\
2,000 & 0.550 & 0.421 & \textbf{1.31} & \textbf{65.4} \\
5,000 & 3.478 & 2.437 & 1.43 & 71.4 \\
10,000 & 14.259 & 10.179 & 1.40 & 70.0 \\
\hline
\multicolumn{3}{|r|}{\textbf{Average:}} & 1.19 & 59.3 \\
\hline
\end{tabular}
\end{table}
Performance shows modest speedup, ranging from 1.05× to 1.43× for N $\geq$ 200. Peak speedup is 1.43× at N=5,000 (71.4\% efficiency). N=100 shows a \textit{slowdown} (0.84×) because thread-management overhead exceeds the benefit for such a tiny workload. Larger matrices (N $\geq$ 2,000) achieve a consistent 1.3-1.4× speedup.
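For reference, the speedup and efficiency columns in the table follow the standard definitions for $p$ threads:

```latex
S(p) = \frac{T_{\mathrm{serial}}}{T_{\mathrm{parallel}}}, \qquad
E(p) = \frac{S(p)}{p}
```

For example, at N=5,000: $S = 3.478 / 2.437 \approx 1.43$ and $E = 1.43 / 2 \approx 71.4\%$, matching the tabulated values.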
\subsection{Graph Analysis}
\textbf{Figure 1 (Speedup):} Peak speedup 1.43× at N=5,000 (71.4\% efficiency of ideal 2×). Speedup improves with problem size: 0.84× (N=100) to 1.43× (N=5K), then slight decline to 1.40× (N=10K). Small matrices show overhead-dominated behavior.
\textbf{Figure 2 (Execution Time):} Both serial and parallel scale $O(nnz)$ linearly. Parallel slower for N=100 (overhead), becomes beneficial at N $\geq$ 200. Gap widens with size, reaching maximum benefit at medium-large matrices.
\begin{figure}[h]
\centering
\includegraphics[width=0.48\textwidth]{results/speedup.png}
\caption{Speedup vs Problem Size - Peak 1.43× at N=5,000}
\end{figure}
\begin{figure}[h]
\centering
\includegraphics[width=0.48\textwidth]{results/execution_time.png}
\caption{Execution Time - Serial $O(nnz)$, Parallel memory-limited}
\end{figure}
\subsection{Analysis and Tuning}
\textbf{Scheduling:} Static schedule selected for minimal overhead and cache-friendly chunk assignment. Dynamic scheduling tested but degraded performance due to scheduling overhead and cache thrashing.
\textbf{Memory Access:} CSR row traversal is sequential (good spatial locality). Vector access is random, causing L1/L2 cache misses. Prefetching helps but limited by bandwidth.
\textbf{Thread Count:} Limited to 2 cores on test system. Thread overhead visible at N=100 (slowdown). Speedup becomes beneficial at N $\geq$ 200, reaching 65-71\% efficiency for large matrices.
\textbf{Load Balance:} Static partitioning causes imbalance with varying row nnz. Attempted chunk-based distribution improved small cases but added overhead.
\section{Challenges}
\textbf{Load Imbalance:} Varying row lengths cause uneven distribution. \textbf{Memory Bandwidth:} Memory-bound nature limits gains. \textbf{Small Overhead:} Thread costs dominate for small problems. \textbf{Precision:} Ensuring reproducible parallel floating-point operations.
\section{Conclusion}
Achieved up to 1.43× speedup using OpenMP on 2 cores, with a 1.19× average across all sizes. Performance improves with problem size, reaching 70-71\% efficiency for large matrices (N $\geq$ 5,000). Future work includes evaluating the CUDA implementation on GPU hardware, alternative storage formats (ELL, COO), distributed parallelization with MPI, and testing on systems with higher core counts.
\begin{thebibliography}{00}
\bibitem{saad2003iterative} Y. Saad, \textit{Iterative Methods for Sparse Linear Systems}, 2nd ed. SIAM, 2003.
\bibitem{foster1995designing} I. Foster, \textit{Designing and Building Parallel Programs}. Addison-Wesley, 1995.
\bibitem{bell2009implementing} N. Bell and M. Garland, ``Implementing sparse matrix-vector multiplication on throughput-oriented processors,'' \textit{Proc. SC09}, 2009.
\end{thebibliography}
\end{document}