%-------------------------------------------------------------------------------
% Document & Package declarations
%-------------------------------------------------------------------------------
\documentclass[a4paper, 10pt, conference]{ieeeconf}
\usepackage{graphicx}
\usepackage[colorlinks=true, allcolors=black]{hyperref}
\usepackage{tabularx}
%% Language and font encodings
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
%% Useful packages
\usepackage{amsmath}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[font=footnotesize,labelfont=bf]{caption}
\usepackage[font=footnotesize,labelfont=bf]{subcaption}
%% Packages for displaying source code
\usepackage{listings}
% \usepackage[framed,numbered,autolinebreaks,useliterate]{mcode}
\usepackage{float}
\usepackage{longtable}
%% Packages for displaying source code
\usepackage[numbered,framed]{matlab-prettifier}
\usepackage{color}
%%*************************************************************************
%% Legal Notice:
%% This code is offered as-is without any warranty either expressed or
%% implied; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE!
%% User assumes all risk.
%% In no event shall IEEE or any contributor to this code be liable for
%% any damages or losses, including, but not limited to, incidental,
%% consequential, or any other damages, resulting from the use or misuse
%% of any information contained here.
%%
%% All comments are the opinions of their respective authors and are not
%% necessarily endorsed by the IEEE.
%%
%% This work is distributed under the LaTeX Project Public License (LPPL)
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
%% distributed and modified. A copy of the LPPL, version 1.3, is included
%% in the base LaTeX documentation of all distributions of LaTeX released
%% 2003/12/01 or later.
%% Retain all contribution notices and credits.
%% ** Modified files should be clearly indicated as such, including **
%% ** renaming them and changing author support contact information. **
%%
%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
%% bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex,
%% bare_jrnl_transmag.tex
%%*************************************************************************
%-------------------------------------------------------------------------------
% Document Configuration
%-------------------------------------------------------------------------------
\begin{document}
\title{Pattern Recognition - PCA and SVM for Face Recognition}
\author{Michael~Hart and
Meng~Kiang~Seah
\\
Department of Electrical and Electronic Engineering,
Imperial College London,
SW7 2AZ
\\
E-mail: \{mh1613, mks211\}@imperial.ac.uk}
\date{\today}
%-------------------------------------------------------------------------------
% Plan on what to write
%-------------------------------------------------------------------------------
% See coursework instructions at:
% https://bb.imperial.ac.uk/bbcswebdav/pid-1001497-dt-content-rid-3367484_1/courses/DSS-EE4_68-16_17/PRCoursework1.pdf
%-------------------------------------------------------------------------------
% Information Banner
%-------------------------------------------------------------------------------
\maketitle
%-------------------------------------------------------------------------------
% Abstract
%-------------------------------------------------------------------------------
\begin{abstract}
\todo{Write abstract.} Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus feugiat metus id dictum venenatis. Etiam id luctus elit, quis tristique lacus. Ut tincidunt purus nec lacinia condimentum. Phasellus at est lacinia, porttitor massa at, vulputate nisl. Nullam turpis dolor, egestas nec enim ac, euismod congue nibh. Curabitur aliquet lacus vitae sem facilisis, quis luctus nisi tristique. Fusce odio risus, sagittis eu semper sit amet, pretium sit amet felis. Nunc a nunc ornare, tincidunt libero in, condimentum turpis.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus feugiat metus id dictum venenatis. Etiam id luctus elit, quis tristique lacus. Ut tincidunt purus nec lacinia condimentum. Phasellus at est lacinia, porttitor massa at, vulputate nisl. Nullam turpis dolor, egestas nec enim ac, euismod congue nibh.
\end{abstract}
%-------------------------------------------------------------------------------
% Introduction
%-------------------------------------------------------------------------------
\section{Introduction}
Throughout this course, two methods of classifying data using pre-trained models have been explored: Principal Component Analysis (PCA) and Support Vector Machines (SVMs). These two methods are applied to the face data provided. Each face is given as a 2576-by-1 column vector of raw intensity values, obtained by taking the pixel values of each image row by row and concatenating them. Each image is a greyscale picture 46 pixels wide and 56 pixels high.
While regurgitating the theory presented in lectures is not the aim, it is necessary to establish some context for the work done.
\subsection{Principal Component Analysis}\label{sec:pcatheory}
Each image exists as a point in a $D$-dimensional space. The goal is to project these points onto an $M$-dimensional subspace (where $M < D$) that contains the largest variance of the points. This is done by finding the eigenvalues and eigenvectors of the data's covariance matrix, $S$, and projecting the data onto the eigenvectors corresponding to the $M$ largest eigenvalues \cite{pca}.
For classification, the Nearest Neighbour (NN) method is used. Each testing picture is also projected onto the subspace; it is then assigned the class of the closest training point \cite{pca}.
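The projection and classification steps described above can be sketched as follows. This is an illustrative Python/NumPy rendering, not the MATLAB code used for this coursework, and the function names are our own:

```python
import numpy as np

def pca_fit(X, M):
    """Fit PCA on data X (D x N, one image per column); keep the top M bases."""
    mean = X.mean(axis=1, keepdims=True)
    A = X - mean                              # mean-subtracted data matrix
    S = (A @ A.T) / X.shape[1]                # covariance matrix S = (1/N) A A^T
    vals, vecs = np.linalg.eigh(S)            # eigenvalues in ascending order
    W = vecs[:, ::-1][:, :M]                  # eigenvectors of the M largest eigenvalues
    return mean, W

def pca_project(X, mean, W):
    """Project images onto the M-dimensional subspace (returns M x N coefficients)."""
    return W.T @ (X - mean)

def nn_classify(train_c, train_labels, test_c):
    """Assign each test point the label of its nearest training point."""
    preds = []
    for t in test_c.T:
        dists = np.linalg.norm(train_c.T - t, axis=1)
        preds.append(train_labels[np.argmin(dists)])
    return np.array(preds)
```

The same structure (fit on training data, project both sets, classify by nearest neighbour) underlies all the PCA experiments reported below.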
\subsection{Support Vector Machines}\label{sec:svmtheory}
This method involves creating a hyperplane in the $D$-dimensional space of the data which separates the data into two classes. This is done by finding the plane that separates the data with the maximum distance from the training points closest to the boundary \cite{svm}.
However, SVMs work for two classes only. There are two ways to extend this to multiple classes. The first, one-versus-rest, involves training each class's points against those of all the other classes. Thus, for $M$ classes, there are $M$ SVMs, each tailored to one class. A test point is assigned the class of the SVM that returns a positive result for it \cite{svm}.
The second is one-versus-one, which trains each class against every other class individually. This creates $\frac{M(M-1)}{2}$ SVMs. Each SVM votes on the class of a point, and the class with the most votes is assigned to the point \cite{svm}.
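The number of SVMs each scheme requires for this data set can be checked directly (a small illustrative Python snippet, not part of the coursework code):

```python
from itertools import combinations

M = 52                                              # classes in the face data set
one_v_rest = M                                      # one SVM per class
one_v_one = len(list(combinations(range(M), 2)))    # one SVM per pair of classes
print(one_v_rest, one_v_one)                        # 52 1326
```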
% Plan for intro
% Probably going to be written at the end
% Introduce the concepts of PCA, SVMs, and classifiers in general
%
\section{Data Partitioning}
Common practice when performing machine learning tasks is to divide the data set into a training set and a test set. A number of online sources discuss the ratios for partitioning data into training and testing sets: proposals range from a 60\%/40\% training/testing split to an 80\%/20\% one, but most seem to agree on a 70\%/30\% split. Closer inspection of the data given to us in \texttt{face.mat} shows that there are 520 pictures and 52 different classes; for each class, or each person, there are 10 images.
Weighing the need for enough training instances against that for enough testing instances, a 70\%/30\% split was indeed used: each class was split into 7 training images and 3 test images. In this way, there is the same number of training images per class, and sufficient test data to validate after training. The 3 testing instances were the 8th, 9th and 10th of each class. As there is no ordering to the pictures within a class, there is no chance of accidental bias.
This gives 364 training instances and 156 testing ones. This partitioning is used for the remainder of this assignment.
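The split can be expressed compactly. The following is an illustrative Python/NumPy sketch (the variable names are our own), under the assumption that the columns of the data matrix are grouped by class:

```python
import numpy as np

N_CLASSES, PER_CLASS, N_TRAIN = 52, 10, 7
# Column indices grouped by class: pictures 1-10 of person 1, then person 2, ...
idx = np.arange(N_CLASSES * PER_CLASS).reshape(N_CLASSES, PER_CLASS)
train_idx = idx[:, :N_TRAIN].ravel()          # the first 7 pictures of each person
test_idx = idx[:, N_TRAIN:].ravel()           # pictures 8, 9 and 10
print(train_idx.size, test_idx.size)          # 364 156
```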
\section{Principal Component Analysis}
% Training length is 364; test length is 156.
% Rank of AAT is 363
% Number of nonzero elements in AAT is 363
% Rank of ATA is 363
% Number of nonzero elements in ATA is 363
\subsection{PCA with $S = \frac{1}{N}AA^{T}$}
As mentioned in Section \ref{sec:pcatheory}, the first step in PCA is obtaining the data covariance matrix $S$, which can be computed with the equation in the title. The $A$ matrix has 364 columns, one per training picture, each with 2576 rows of pixel values. It is obtained by subtracting the average training face (shown in Figure \ref{fig:meanface}) from each face.
Thus, $S$ has 2576 rows and columns. At first, using MATLAB to obtain the eigenvalues gives 2576 values. However, closer inspection shows that the rank of $S$ is 363, meaning that there cannot be more than 363 non-zero eigenvalues. Indeed, rounding the eigenvalues to 10 decimal places leaves 363 non-zero values; the remainder are non-zero only because of floating-point precision.
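This rank behaviour is easy to reproduce on a small synthetic example (a Python/NumPy sketch with reduced $D$ and $N$ for speed, not the coursework code): mean subtraction removes one degree of freedom, so only $N - 1$ eigenvalues survive rounding.

```python
import numpy as np

rng = np.random.default_rng(1)
D, N = 50, 8                                  # small stand-in for D = 2576, N = 364
X = rng.standard_normal((D, N))
A = X - X.mean(axis=1, keepdims=True)         # mean subtraction removes one rank
S = (A @ A.T) / N
vals = np.linalg.eigvalsh(S)
nonzero = int(np.sum(np.round(vals, 10) > 0)) # round away floating-point noise
print(nonzero)                                # N - 1 = 7
```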
\begin{figure}[!ht]
\centering
\includegraphics[width=0.5\textwidth]{src/S_eig_val_rounded.png}
\caption{Eigenvalues from $S = AA^T$.}
\label{fig:S_eig_val}
\end{figure}
The values of these eigenvalues are plotted in Figure \ref{fig:S_eig_val}. As can be seen, the values drop off in an exponential manner, as was seen in the lecture slides. The eigenvectors, or eigenfaces, corresponding to the largest eigenvalues capture the most significant common features of the faces in the training set; in this way, the method extracts the most important parts of the faces to use for recognition. The three main eigenfaces are shown in Figures \ref{fig:eigface1}, \ref{fig:eigface2}, and \ref{fig:eigface3}.
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/mean_face.png}
\caption{The average testing face.}
\label{fig:meanface}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/eigface1.png}
\caption{The first eigenface.}
\label{fig:eigface1}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/eigface2.png}
\caption{The second eigenface.}
\label{fig:eigface2}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/eigface3.png}
\caption{The third eigenface.}
\label{fig:eigface3}
\end{subfigure}
\caption{Mean face and some eigenfaces of $S = AA^T$ (visualisations of the eigenvectors).}
\end{figure}
\subsection{PCA with $S = \frac{1}{N}A^{T}A$}
\begin{figure}[!ht]
\centering
\includegraphics[width=0.5\textwidth]{src/S2_eig_val_rounded.png}
\caption{Eigenvalues from $S = A^T A$.}
\label{fig:S2_eig_val}
\end{figure}
$S = \frac{1}{N}A^{T}A$ is the alternative calculation method. From Slides 42--43 of the PCA lecture \cite{pca}, the relationship between the two decompositions is known: the non-zero eigenvalues of $A^T A$ and $AA^T$ are identical, and the eigenvectors of $AA^T$ ($u_i$) and $A^T A$ ($v_i$) are related by $u_i = A v_i$. Figure \ref{fig:S2_eig_val} shows the eigenvalues graphed in the same way as in Figure \ref{fig:S_eig_val}, and the trend is similar. Closer inspection in MATLAB showed that both contained exactly the same values. For some context, the first 4 eigenfaces obtained through this method are shown. It is important to note that the eigenfaces have already been normalised.
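This relationship can be verified numerically on a small synthetic example (again a Python/NumPy sketch rather than the MATLAB used for the coursework). Note that the recovered eigenvector is only defined up to a sign flip:

```python
import numpy as np

rng = np.random.default_rng(2)
D, N = 40, 6                              # stand-in for D = 2576, N = 364
A = rng.standard_normal((D, N))

lam_big, U = np.linalg.eigh(A @ A.T)      # expensive D x D eigenproblem
lam_small, V = np.linalg.eigh(A.T @ A)    # cheap N x N eigenproblem

# The non-zero eigenvalues coincide...
print(np.allclose(lam_big[-N:], lam_small))          # True
# ...and u_i = A v_i (after normalising), up to a possible sign flip,
# which mirrors the inverted eigenvectors observed in MATLAB.
u = A @ V[:, -1]
u /= np.linalg.norm(u)
print(np.isclose(abs(u @ U[:, -1]), 1.0))            # True
```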
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/eigface4.png}
\caption{The first eigenface.}
\label{fig:eigface4}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/eigface5.png}
\caption{The second eigenface.}
\label{fig:eigface5}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/eigface6.png}
\caption{The third eigenface.}
\label{fig:eigface6}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/eigface7.png}
\caption{The fourth eigenface.}
\label{fig:eigface7}
\end{subfigure}
\caption{Some eigenfaces from $S = A^T A$.}
\end{figure}
Supplementing the confirmation in MATLAB with a visual one, it can be seen that most of the eigenvectors are the same. However, there are a number of unusual cases where the sign of the eigenvector is reversed. This is peculiar, as there is no mention of such an occurrence in the notes; in fact, the mathematics presented in the slides states that the eigenvectors should be identical.
The distribution of the inverted eigenvectors is also strange, as the inversion happens more frequently for the eigenvectors associated with the smaller eigenvalues. Although computational errors from floating-point precision in MATLAB could explain this, a complete analysis of this discrepancy is beyond the scope of this assignment.
However, inverted eigenvectors do not adversely affect the PCA method, as the faces are projected onto the eigenvectors. If a vector is inverted, this applies to all the faces equally, and the corresponding coefficients are simply inverted as well.
\subsection{Pros and cons of each method}
There are some observations to be made when comparing the methods. The first is the time taken to compute the eigendecomposition in the two ways. For $S = AA^T$ (henceforth the AAT method), the timing in MATLAB is 17.81 seconds. This is in contrast to $S = A^T A$ (henceforth the ATA method), which takes a measly 0.10 seconds: a difference of two orders of magnitude.
When running the \texttt{eig} function in MATLAB, a larger matrix will certainly take longer. Additionally, while MATLAB will work to produce all 2576 eigenvalues and associated eigenvectors, most of them are discarded at the end of the calculation as they correspond to zero eigenvalues.
The second observation concerns the amount of memory required by each method, which is driven by the size of $S$. With AAT, $S$ is 2576 by 2576, while for ATA, $S$ is 364 by 364. The dimensions are an order of magnitude apart, which means the difference in the number of elements is around two orders of magnitude.
While ATA involves the additional steps of multiplying the vectors by $A$ again and normalising the columns, testing in MATLAB shows that the time taken for these steps is negligible compared to the time for the \texttt{eig} function.
In short, the smaller size of ATA means that less memory is required, and the computations are completed more quickly. It is therefore the better choice, and will be used for the rest of this assignment.
\section{Applications of Eigenfaces}
\subsection{Reconstruction and associated Error}
The next step in using PCA is to project each of the training faces onto the eigenfaces, or bases. This turns each face into a vector of coefficients, one per base. The reconstructed face is the result of recreating the face from its coefficients and the corresponding bases. Naturally, if not all the bases are used, there is an error from the loss of information when the point in the $D$-dimensional space is projected into the $M$-dimensional space.
In Figure \ref{fig:reconerror}, the average reconstruction error for both the training data and the testing data is shown as it varies with the number of bases used. For the training data set, the error drops off very quickly and eventually approaches zero. This is expected: if all the bases are used, then $M = D$, and there is no error for the training data set.
\begin{figure}[!ht]
\centering
\includegraphics[width=0.5\textwidth]{src/error.png}
\caption{Reconstruction error vs. PCA bases used.}
\label{fig:reconerror}
\end{figure}
\begin{align}
J = \frac{1}{N} \sum_{n=1}^{N} \lvert \lvert x_n - \widetilde{x_n} \rvert \rvert ^2 = \sum_{i = M+1}^{D} \lambda_i \label{eq:erroreq}
\end{align}
However, applying the same procedure to the testing data set shows that the error drops off at the start as expected, but does not fall to zero. As the eigenvalues fall, the corresponding bases represent smaller, more subtle parts of the image. When the larger bases are used, the main features of the faces in the testing set are preserved; the smaller bases, however, are tailored to the peculiarities of the training set, not the testing one. This explains why the error does not trail off to zero.
In fact, according to the theory from Slide 39 of the PCA notes \cite{pca}, the reconstruction error should be given by Equation \ref{eq:erroreq}, which is the sum of the unused eigenvalues. However, $J$ is the error squared. Thus, $\sqrt{\sum_{i = M+1}^{D} \lambda_i}$ is also graphed in Figure \ref{fig:reconerror}, and it matches the average training error trend line.
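The equality between the average squared reconstruction error on the training set and the sum of the discarded eigenvalues can be checked on synthetic data (an illustrative Python/NumPy sketch with small dimensions):

```python
import numpy as np

rng = np.random.default_rng(3)
D, N, M = 20, 30, 5
X = rng.standard_normal((D, N))
mean = X.mean(axis=1, keepdims=True)
A = X - mean
S = (A @ A.T) / N
vals, vecs = np.linalg.eigh(S)                # eigenvalues in ascending order
W = vecs[:, -M:]                              # keep the top-M eigenfaces
X_rec = mean + W @ (W.T @ A)                  # project, then reconstruct
J = np.mean(np.sum((X - X_rec) ** 2, axis=0)) # average squared reconstruction error
print(np.isclose(J, vals[:-M].sum()))         # True: J equals the discarded eigenvalues
```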
In Figures \ref{fig:reconex1}, \ref{fig:reconex2}, and \ref{fig:reconex3}, samples of three different faces (two from the training set, one from the testing set) are shown alongside their reconstructions with different numbers of bases.
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/face1.png}
\caption{Original.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface11.png}
\caption{5 bases.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface12.png}
\caption{25 bases.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface13.png}
\caption{50 bases.}
\end{subfigure}
\caption{Face number 154 from the training set, reconstructed with different numbers of bases.}
\label{fig:reconex1}
\end{figure}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/face2.png}
\caption{Original.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface21.png}
\caption{5 bases.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface22.png}
\caption{25 bases.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface23.png}
\caption{50 bases.}
\end{subfigure}
\caption{Face number 276 from the training set, reconstructed with different numbers of bases.}
\label{fig:reconex2}
\end{figure}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/face3.png}
\caption{Original.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface31.png}
\caption{5 bases.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface32.png}
\caption{25 bases.}
\end{subfigure}
~
\begin{subfigure}{0.1\textwidth}
\includegraphics[width=\textwidth]{src/reface33.png}
\caption{50 bases.}
\end{subfigure}
\caption{Face number 54 from the testing set, reconstructed with different numbers of bases.}
\label{fig:reconex3}
\end{figure}
As can be seen, increasing the number of bases increases the resemblance between the reconstructed face and the original face. This is in line with what Figure \ref{fig:reconerror} has shown.
\subsection{Nearest Neighbour Classification}
As mentioned in Section \ref{sec:pcatheory}, the Nearest Neighbour (NN) method involves taking a test face and projecting it onto the same set of bases as used during the training phase. This places it in the same $M$-dimensional space as the training points. It is then assigned the class of the closest training point.
In this testing phase, the NN classification method was applied to all the testing pictures while varying the number of bases used. The results are shown in Figures \ref{fig:correctincorrect} and \ref{fig:correctincorrectpercent}.
\begin{figure}[!ht]
\centering
\includegraphics[width=0.5\textwidth]{src/correctincorrect.png}
\caption{Correct and Incorrect Guesses vs. PCA bases used.}
\label{fig:correctincorrect}
\end{figure}
\begin{figure}[!ht]
\centering
\includegraphics[width=0.5\textwidth]{src/correctincorrectpercent.png}
\caption{Accuracy vs. PCA bases used.}
\label{fig:correctincorrectpercent}
\end{figure}
The accuracy rises drastically with the number of bases used, up until around 100, whereupon the marginal increase in accuracy drops. Overall, there is a limit to the accuracy of PCA used with NN, which in this case appears to be around 55\%. Interestingly, the first instance of maximum accuracy occurs at 156 bases, which is 3 times the number of classes in the data set. This could be a coincidence, but it highlights the fact that the ideal number of bases depends on the number of classes that need to be recognised.
Figure \ref{fig:tics} shows how the time required for classification of the testing set varies with the number of bases used. Note that the measurement is in arbitrary ``tic'' units. As expected, there is a linear trend, as each additional base requires a linearly increasing amount of computation. The spikes seen at larger numbers of bases are most likely due to the calculation taking long enough to be impacted by operating system prioritisation.
\begin{figure}[!ht]
\centering
\includegraphics[width=0.5\textwidth]{src/tics.png}
\caption{Time taken vs. PCA bases used.}
\label{fig:tics}
\end{figure}
From this, the results of NN with 156 PCA bases can be investigated. Figure \ref{fig:confusepca} shows the confusion matrix. In an ideal situation, there would be a solid line from the top left to the bottom right, indicating correct class guesses. Unfortunately, at only 55.13\% accuracy, the line is not continuous.
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.125\textwidth}
\includegraphics[width=\textwidth]{src/class1.png}
\caption{Class 26, success.}
\end{subfigure}
~
\begin{subfigure}{0.125\textwidth}
\includegraphics[width=\textwidth]{src/class2.png}
\caption{Class 30, failure.}
\end{subfigure}
~
\begin{subfigure}{0.125\textwidth}
\includegraphics[width=\textwidth]{src/class3.png}
\caption{Class 42, most guessed.}
\end{subfigure}
~
\caption{Example of a successful class, a failed class, and the most guessed class. }
\label{fig:classpics}
\end{figure}
\begin{figure}[!ht]
\centering
\includegraphics[width=0.5\textwidth]{src/confusepca.png}
\caption{Confusion matrix with 156 PCA bases used.}
\label{fig:confusepca}
\end{figure}
An example of a success is Class 26: in addition to its test pictures being correctly classified each time, no other test images were classified as Class 26. In contrast, an example of a failure is Class 30, which was never correctly identified. An interesting point is that Class 42 was the most guessed class, despite only one of the actual Class 42 pictures being identified as such. An example of each of these three classes is shown in Figure \ref{fig:classpics}.
What can be concluded is that Class 26 is distinctive enough that it is always correctly identified, with no false negatives or positives. Class 30 must be similar enough to other classes that it is consistently misidentified. Finally, many other classes must look like Class 42; the mustache could explain why it is the most frequent guess. This highlights just how challenging the problem of facial recognition can be: while a human identifies the mustache as distinctive, this is clearly not the case with the PCA bases.
\section{Multi-class SVM for Face Recognition}
\subsection{One Versus Rest and One Versus One}
\subsubsection{1VR}
To apply the SVM method to a multi-class problem, as mentioned in Section \ref{sec:svmtheory}, one option is one-versus-rest (1VR). This involves training as many SVMs as there are classes, each tailored to one class. The code used is in Appendix A.
In 1VR, one class in the training set is assigned the positive label, and all the other training pictures are assigned the negative label. The SVM is then trained on that set of data. This is repeated for each of the 52 classes in the training data, so 52 SVMs are created. To test, each testing picture is run through all 52 SVMs. Ideally, for each picture, a single SVM will classify it as positive, and the picture will be assigned that class.
However, instead of taking the SVMs' positive or negative decisions, the raw decision values were considered. This was because it was discovered that occasionally a test picture could return a negative decision from every SVM or, more rarely, a positive decision from more than one SVM. Thus, the method used was to select the class associated with the SVM that returned the highest raw decision value.
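The decision rule can be illustrated with a toy example (Python/NumPy; the array contents are invented for illustration and do not come from the actual experiments):

```python
import numpy as np

# decision_values[i, k] is the raw (signed) output of SVM k for test image i.
# Taking the argmax over k resolves both the all-negative case and the
# multiple-positive case, which a hard sign-based rule cannot.
decision_values = np.array([
    [-0.2, -0.8, -0.5],    # every SVM says "not my class": take the least negative
    [ 0.6,  0.9, -0.3],    # two SVMs claim the picture: take the stronger claim
])
predicted = np.argmax(decision_values, axis=1)
print(predicted)           # [0 1]
```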
\subsubsection{1V1}
The second method was one-versus-one (1V1). This involves training each class against every other class individually: one class in the training set is assigned the positive label, and its data is combined with that of another class, which is assigned the negative label. An SVM is trained on this set. This is repeated for each of the possible pairs, resulting in 1326 SVMs.
To test, each picture was run through all the SVMs. Each SVM produced a decision (positive or negative), which was converted into a vote for a class. The votes were collected, and the testing picture was assigned the class with the most votes.
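The voting scheme can be sketched as follows (an illustrative Python snippet; the \texttt{one\_v\_one\_vote} helper is our own, not part of any SVM library):

```python
import numpy as np

def one_v_one_vote(decisions, n_classes):
    """decisions[(i, j)] is +1 if the (i, j) SVM decides class i, else -1.
    Returns the class that collects the most votes."""
    votes = np.zeros(n_classes, dtype=int)
    for (i, j), d in decisions.items():
        votes[i if d > 0 else j] += 1
    return int(np.argmax(votes))

# Toy example with 3 classes: class 2 beats both of the others.
decisions = {(0, 1): +1, (0, 2): -1, (1, 2): -1}
print(one_v_one_vote(decisions, 3))    # 2
```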
% \subsubsection{Parameters Explored}
% With SVMs, there are many factors to explore. In this section, data type (raw or PCA coefficients), scaling, and kernel type are investigated, in addition to using both multi-class methods. At first, the
\subsection{Scaled and Unscaled Data}
% >> one_v_rest_test
% RAW WITH TESTING UNSCALED
% Guessed 124 correctly and 32 incorrectly; Success rate is 79.4872%.
% RAW WITH TESTING SCALED
% Guessed 136 correctly and 20 incorrectly; Success rate is 87.1795%.
% PCA WITH TESTING UNSCALED
% Guessed 123 correctly and 33 incorrectly; Success rate is 78.8462%.
% PCA WITH TESTING SCALED
% Guessed 118 correctly and 38 incorrectly; Success rate is 75.641%.
% >> one_v_one_test
% RAW WITH TESTING UNSCALED
% Guessed 119 correctly and 37 incorrectly; Success rate is 76.2821%.
% RAW WITH TESTING SCALED
% Guessed 121 correctly and 35 incorrectly; Success rate is 77.5641%.
% PCA WITH TESTING UNSCALED
% Guessed 119 correctly and 37 incorrectly; Success rate is 76.2821%.
% PCA WITH TESTING SCALED
% Guessed 120 correctly and 36 incorrectly; Success rate is 76.9231%.
In this section, the \texttt{svmtrain} function is run with all its default values, with the exception of the kernel. In situations where the number of dimensions considerably exceeds the number of training instances, the linear kernel is considered to have performance comparable to that of the RBF kernel, which is the default \cite{linear}. The advantage, according to the same source, is that it computes much faster. Thus, for the time being, the linear kernel is used to facilitate investigation of the other factors.
The data was scaled by standardising all the testing and training data across each dimension, so that each dimension's inputs are zero-mean with unit standard deviation. The reason for this is that if a certain dimension's inputs span a greater range, the SVM will consider that dimension more important than the others \cite{scale} \cite{linear}. In this case, with pictures, while the pixel values are intensities within a fixed range, certain pixels may experience greater variation across pictures.
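The standardisation step can be sketched as follows (illustrative Python/NumPy; the \texttt{standardise} helper is our own, and the random data merely stands in for the pixel matrices):

```python
import numpy as np

def standardise(train, test):
    """Scale each pixel dimension to zero mean and unit standard deviation,
    using statistics computed from the training set only (rows = images)."""
    mu = train.mean(axis=0)
    sigma = train.std(axis=0)
    sigma[sigma == 0] = 1.0                   # guard against constant pixels
    return (train - mu) / sigma, (test - mu) / sigma

rng = np.random.default_rng(4)
train_s, test_s = standardise(rng.normal(100, 25, (364, 8)),
                              rng.normal(100, 25, (156, 8)))
```

Using the training statistics for both sets keeps the test data on the same scale without letting it influence the scaling parameters.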
\begin{table}[!ht]
\centering
\caption{Comparison of unscaled and scaled results} \label{tbl:scaling}
\begin{tabular}{lllll}
\textbf{Multi-class Type} & \textbf{Scaling} & \textbf{Train (s)} & \textbf{Test (s)} & \textbf{Accuracy}\\ \hline
1v1 & unscaled & 3.0948 & 11.9803 & 0.76282\\ \hline
1v1 & scaled & 3.8927 & 13.0467 & 0.77564\\ \hline
1vR & unscaled & 8.4871 & 1.8692 & 0.79487\\ \hline
1vR & scaled & 8.3609 & 2.0556 & 0.87179\\ \hline
\end{tabular}
\end{table}
As can be seen from Table \ref{tbl:scaling}, scaled data produces more accurate results with comparable training and testing times, although the time taken to scale the data itself is not included. For both types of multi-class implementation, scaling does indeed improve accuracy, as expected.
Figure \ref{fig:svmconfuse} shows the confusion matrices for the four cases. Comparing them with the confusion matrix of the PCA system in Figure \ref{fig:confusepca} shows a far more consistent diagonal, demonstrating the improvement in classification after moving from PCA to SVM. While confusion matrices are a vivid way to present the results, space constraints mean that for the remaining scenarios only the accuracies and times are reported.
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.23\textwidth}
\includegraphics[width=\textwidth]{src/1v1_raw_unscaled.png}
\caption{1V1, raw unscaled data.}
\end{subfigure}
~
\begin{subfigure}{0.23\textwidth}
\includegraphics[width=\textwidth]{src/1v1_raw_scaled.png}
\caption{1V1, raw scaled data.}
\end{subfigure}
\\
\begin{subfigure}{0.23\textwidth}
\includegraphics[width=\textwidth]{src/1vr_raw_unscaled.png}
\caption{1VR, raw unscaled data.}
\end{subfigure}
~
\begin{subfigure}{0.23\textwidth}
\includegraphics[width=\textwidth]{src/1vr_raw_scaled.png}
\caption{1VR, raw scaled data.}
\end{subfigure}
\caption{Confusion matrices for 1V1 and 1VR with scaled and unscaled data.} \label{fig:svmconfuse}
\end{figure}
\subsection{Kernel Type}
This section compares the four non-precomputed kernel types offered by \texttt{libsvm}, each with default arguments. This is assumed to provide an even comparison, although it is possible that a kernel that performs worse with default settings might outperform all the others once its parameters are tuned.
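In \texttt{libsvm}'s MATLAB interface, the kernel is selected with the \texttt{-t} option of the training string. A minimal sketch of the comparison loop, assuming placeholder names \texttt{train\_y}, \texttt{train\_scaled}, \texttt{test\_y} and \texttt{test\_scaled} for the label vectors and scaled data (the actual runs use the wrapper functions in Appendix A rather than a bare \texttt{svmtrain} call):
\begin{lstlisting}[style=Matlab-editor]
% libsvm kernel types: 0 = linear, 1 = polynomial,
%                      2 = radial basis, 3 = sigmoid
for t = 0:3
    opts = sprintf('-t %d -q', t);            % default parameters otherwise
    model = svmtrain(train_y, train_scaled, opts);
    [pred, acc, ~] = svmpredict(test_y, test_scaled, model);
end
\end{lstlisting}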
\begin{table}[!ht]
\centering
\caption{Comparison of kernels with default arguments using raw scaled data}\label{tbl:kernel_raw}
\begin{tabular}{lllll}
\textbf{Kernel Type} & \textbf{Multi-class Type} & \textbf{Train (s)} & \textbf{Test (s)} & \textbf{Accuracy}\\ \hline
linear & 1v1 & 3.8927 & 13.0467 & 0.77564\\ \hline
linear & 1vR & 8.3609 & 2.0556 & 0.87179\\ \hline
polynomial & 1v1 & 3.5697 & 16.8125 & 0.20513\\ \hline
polynomial & 1vR & 22.6973 & 7.4879 & 0.66667\\ \hline
radial & 1v1 & 3.7123 & 20.2153 & 0.61538\\ \hline
radial & 1vR & 13.7904 & 4.0895 & 0.77564\\ \hline
sigmoid & 1v1 & 3.1532 & 13.591 & 0.49359\\ \hline
sigmoid & 1vR & 5.9455 & 1.0399 & 0.42949\\ \hline
\end{tabular}
\end{table}
As can be seen in Table \ref{tbl:kernel_raw}, of the three most accurate results, two were obtained with the linear kernel; the third came from the radial kernel at comparable computation time. The linear kernel will therefore be investigated further to find the most effective parameters. This is in line with what the SVM guide \cite{linear} states, given the ratio of dimensions to instances.
\subsection{Kernel Parameters}
In the linear kernel, the only parameter that can be varied is the misclassification cost ($C$), which controls how closely the model is fitted to the training data. A larger $C$ penalises training errors more heavily, producing a tighter fit at the risk of overfitting, while a smaller $C$ permits more slack and hence a simpler model at the risk of underfitting. To investigate, a range of values spanning several orders of magnitude was used.
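The sweep over $C$ can be sketched as a loop over \texttt{libsvm} option strings, where \texttt{-t 0} selects the linear kernel and \texttt{-c} sets the cost. The variable names below are placeholders for the scaled training data and labels:
\begin{lstlisting}[style=Matlab-editor]
costs = [1e-4 1e-3 1e-2 0.1 1 10 100 1e4];
for C = costs
    opts = sprintf('-t 0 -c %g -q', C);   % linear kernel, cost C
    model = svmtrain(train_y, train_scaled, opts);
    [pred, acc, ~] = svmpredict(test_y, test_scaled, model);
end
\end{lstlisting}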
\begin{table}[!ht]
\centering
\caption{Comparison of cost value with linear kernel using raw scaled data}\label{tbl:linear_params}
\begin{tabular}{lllll}
\textbf{Cost Value} & \textbf{Multi-class Type} & \textbf{Train (s)} & \textbf{Test (s)} & \textbf{Accuracy}\\ \hline
0.0001 & 1v1 & 3.4176 & 16.7002 & 0.42308\\ \hline
% 0.0005 & 1v1 & 3.7042 & 13.5209 & 0.65385\\ \hline
% 0.001 & 1v1 & 4.0673 & 12.7367 & 0.76923\\ \hline
% 0.005 & 1v1 & 3.7783 & 12.5276 & 0.77564\\ \hline
0.01 & 1v1 & 3.7647 & 12.5884 & 0.77564\\ \hline
% 0.05 & 1v1 & 3.7463 & 12.3063 & 0.77564\\ \hline
% 0.1 & 1v1 & 3.871 & 12.4042 & 0.77564\\ \hline
% 0.5 & 1v1 & 3.659 & 12.4647 & 0.77564\\ \hline
1 & 1v1 & 3.7817 & 12.4829 & 0.77564\\ \hline
% 5 & 1v1 & 3.7701 & 12.5589 & 0.77564\\ \hline
% 10 & 1v1 & 3.6937 & 12.5284 & 0.77564\\ \hline
% 50 & 1v1 & 3.7575 & 12.8265 & 0.77564\\ \hline
100 & 1v1 & 3.9003 & 12.4283 & 0.77564\\ \hline
% 500 & 1v1 & 3.7964 & 12.3649 & 0.77564\\ \hline
% 1000 & 1v1 & 3.7596 & 12.4408 & 0.77564\\ \hline
% 5000 & 1v1 & 3.8467 & 12.5773 & 0.77564\\ \hline
10000 & 1v1 & 3.8131 & 12.4916 & 0.77564\\ \hline
% 50000 & 1v1 & 3.6352 & 12.6142 & 0.77564\\ \hline
0.0001 & 1vr & 8.1454 & 1.9247 & 0.82692\\ \hline
% 0.0005 & 1vr & 7.8149 & 1.9132 & 0.82692\\ \hline
% 0.001 & 1vr & 7.8003 & 1.9665 & 0.86538\\ \hline
% 0.005 & 1vr & 8.0325 & 2.0027 & 0.87179\\ \hline
0.01 & 1vr & 8.0002 & 2.0105 & 0.87179\\ \hline
% 0.05 & 1vr & 8.0123 & 2.0087 & 0.87179\\ \hline
% 0.1 & 1vr & 8.0213 & 2.01 & 0.87179\\ \hline
% 0.5 & 1vr & 7.9881 & 2.0267 & 0.87179\\ \hline
1 & 1vr & 8.0264 & 2.0479 & 0.87179\\ \hline
% 5 & 1vr & 7.9924 & 2.0216 & 0.87179\\ \hline
% 10 & 1vr & 7.9859 & 2.0153 & 0.87179\\ \hline
% 50 & 1vr & 8.0526 & 2.005 & 0.87179\\ \hline
100 & 1vr & 7.9783 & 2.0271 & 0.87179\\ \hline
% 500 & 1vr & 8.0035 & 2.0137 & 0.87179\\ \hline
% 1000 & 1vr & 8.0447 & 2.0237 & 0.87179\\ \hline
% 5000 & 1vr & 8.0123 & 2.0206 & 0.87179\\ \hline
10000 & 1vr & 8.014 & 2.026 & 0.87179\\ \hline
% 50000 & 1vr & 7.985 & 2.0196 & 0.87179\\ \hline
\end{tabular}
\end{table}
As seen in Table \ref{tbl:linear_params}, once $C$ rises above 0.001 in either case, the accuracy no longer changes. Since a small cost value permits a looser fit, the poor accuracy at the very small values indicates an underfitted model; the fact that the accuracy saturates above 0.001 shows that any overfitting at larger values does not noticeably harm the accuracy of the model. The value of $C$ appears to have no impact on the time taken.
\subsection{PCA Data}
Following the investigation of various parameters for SVMs trained and tested on raw data, the PCA coefficients were also used. From the previous section, it was seen that scaled data worked better, and thus the PCA coefficients were immediately scaled and different kernels were tested with their default values.
\begin{table}[!ht]
\centering
\caption{Comparison of kernels with default arguments using scaled PCA coefficients}\label{tbl:kernel_pca}
\begin{tabular}{lllll}
\textbf{Kernel Type} & \textbf{Multi-class Type} & \textbf{Train (s)} & \textbf{Test (s)} & \textbf{Accuracy}\\ \hline
linear & 1v1 & 0.69941 & 2.3322 & 0.75641\\ \hline
linear & 1vr & 5.0414 & 1.9933 & 0.75\\ \hline
polynomial & 1v1 & 0.72556 & 2.3872 & 0.083333\\ \hline
polynomial & 1vr & 5.2908 & 2.5659 & 0.64103\\ \hline
radial & 1v1 & 0.71748 & 3.0122 & 0.75641\\ \hline
radial & 1vr & 5.7246 & 2.7747 & 0.75641\\ \hline
sigmoid & 1v1 & 0.66103 & 2.5631 & 0.75641\\ \hline
sigmoid & 1vr & 4.569 & 1.8432 & 0.76923\\ \hline
\end{tabular}
\end{table}
This is a surprising contrast to the results in Table \ref{tbl:kernel_raw}: in Table \ref{tbl:kernel_pca}, the most accurate result is obtained by the sigmoid kernel.
The sigmoid kernel is computed using two parameters that the linear kernel does not need, labelled \texttt{gamma} and \texttt{coef0}. \texttt{Coef0} was investigated in the range $-1.5$ to $1.5$ in steps of 0.5, the cost over a restricted range of several orders of magnitude, and \texttt{gamma} over several orders of magnitude, each divided by the number of features.
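This search can be sketched as a grid over \texttt{libsvm} option strings, where \texttt{-t 3} selects the sigmoid kernel, \texttt{-g} sets \texttt{gamma} and \texttt{-r} sets \texttt{coef0}. The names \texttt{train\_y} and \texttt{train\_pca} are placeholders for the label vector and the scaled PCA coefficients:
\begin{lstlisting}[style=Matlab-editor]
n = size(train_pca, 2);                  % number of features
for C = [0.1 1 10 100]
    for g = [0.1 1 10] / n               % gamma: orders of magnitude over n
        for r = -1.5:0.5:1.5             % coef0
            opts = sprintf('-t 3 -c %g -g %g -r %g -q', C, g, r);
            model = svmtrain(train_y, train_pca, opts);
        end
    end
end
\end{lstlisting}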
\begin{table}[!ht]
\centering
\caption{Comparison of parameters with sigmoid kernel using scaled PCA coefficients}\label{tbl:sigmoid_params}
\begin{tabular}{lllllll}
\textbf{Cost} & \textbf{\texttt{Gamma}} & \textbf{\texttt{Coef0}} & \textbf{MC Type} & \textbf{Train (s)} & \textbf{Test (s)} & \textbf{Accuracy}\\ \hline
5 & 0.013736 & -1 & 1vr & 5.3513 & 2.134 & 0.77564\\ \hline
10 & 0.013736 & -1 & 1vr & 5.9747 & 2.2746 & 0.77564\\ \hline
50 & 0.013736 & -1 & 1vr & 5.9585 & 2.2714 & 0.77564\\ \hline
100 & 0.013736 & -1 & 1vr & 5.6929 & 2.2026 & 0.77564\\ \hline
0.1 & 0.027473 & -1 & 1v1 & 0.71012 & 2.5408 & 0.80128\\ \hline
\end{tabular}
\end{table}
Table \ref{tbl:sigmoid_params} shows the most notable results of this search. The maximum accuracy achievable was 0.80128, obtained with the 1v1 scheme at a cost of 0.1, \texttt{gamma} of 0.027473 and \texttt{coef0} of $-1$. This improves on the best default-parameter result of 0.76923 in Table \ref{tbl:kernel_pca}, but remains below the 0.87179 achieved by the linear-kernel 1vR classifier on the raw scaled data. The 1vR results, by contrast, saturate at 0.77564 regardless of the cost value.
%-------------------------------------------------------------------------------
% Conclusion
%-------------------------------------------------------------------------------
\section{Conclusion}
In this work, multi-class SVM classifiers were implemented using both one-versus-one and one-versus-rest schemes and compared against the earlier PCA-based classifier, yielding an overall increase in accuracy. The best configuration, a linear-kernel 1vR classifier on raw scaled data, reached 87.2\%. Scaling the data improved accuracy for both multi-class schemes, and the cost parameter of the linear kernel had little effect once raised above a small threshold. On the scaled PCA coefficients, the sigmoid kernel performed best, reaching 80.1\% after parameter tuning. With more space, future work could weight the training data in the 1vR scheme to counter its inherent class imbalance, use cross-validation to select parameters more rigorously, and vary the number of PCA bases, since all of them were used here.
\bibliographystyle{unsrt}
\bibliography{pr_refs}
\clearpage
\onecolumn
%-------------------------------------------------------------------------------
% Appendix A - Source Code
%-------------------------------------------------------------------------------
\section*{Appendix A - Source Code}
This section contains the functions used to generate and test the multi-class SVMs.
\subsection*{Generation Function for 1VR SVMs}
\lstinputlisting[style=Matlab-editor]{src/one_v_rest_svm.m}
\newpage
\subsection*{Test Function for 1VR SVMs}
\lstinputlisting[style=Matlab-editor]{src/one_v_rest_svm_test.m}
\newpage
\subsection*{Generation Function for 1V1 SVMs}
\lstinputlisting[style=Matlab-editor]{src/one_v_one_svm.m}
\newpage
\subsection*{Test Function for 1V1 SVMs}
\lstinputlisting[style=Matlab-editor]{src/one_v_one_svm_test.m}
%-------------------------------------------------------------------------------
% Appendix B - Results of SVM Training
%-------------------------------------------------------------------------------
% \section*{Appendix B - Results of SVM Training}
% \include{svm_res}
\end{document}