\documentclass{report}
% Comment the following line to NOT allow the usage of umlauts
\usepackage[utf8]{inputenc}
\usepackage{geometry}
\geometry{left=3cm,right=3cm,top=3cm,bottom=3cm}
\usepackage[usenames,dvipsnames]{color}
\usepackage[colorlinks,linkcolor=NavyBlue,anchorcolor=red,citecolor=green]{hyperref}
\usepackage[table,xcdraw]{xcolor}
\usepackage{graphicx}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{float}
\usepackage{bm}
\usepackage{bbm}
\usepackage{array,makecell}
\usepackage{amsmath,amsfonts}
\usepackage{mathrsfs}
%\usepackage[utopia]{mathdesign}
%\usepackage[charter]{mathdesign}
%\usepackage{millennial}
%\usepackage{unicode-math}
%\setmathfont{Latin Modern Math}
%\setmathfont{TG Termes Math}
\usepackage[thmmarks,amsmath]{ntheorem}
\usepackage{tikz}
\theorembodyfont{\upshape}
\newtheorem{definition}{Definition}[chapter]
\newtheorem{example}{Example}[chapter]
\newtheorem{proposition}{Proposition}[chapter]
\newtheorem{corollary}{Corollary}[chapter]
\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{lemma}{Lemma}[chapter]
\theoremstyle{nonumberplain}
\theoremheaderfont{\itshape}
\theorembodyfont{\normalfont}
\theoremsymbol{\\ \rightline{$\square$}}
\newtheorem{proof}{Proof.}
\newcommand{\E}{\mathrm{E}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\M}{\mathbf{M}}
\newcommand{\e}{\mathbf{e}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\vbe}{\bm{\beta}}
\newcommand{\vep}{\bm{\varepsilon}}
\newcommand{\vb}{\mathbf{b}}
% Start the document
\begin{document}
\begin{center}
~\\
\vspace{6em}
\textsc{\Huge Machine Learning}
~\\
\vspace{2.5em}
{\Large From the Perspective of Statistics}
~\\
\vspace{6em}
\textsf{H.C.}
~\\
\vspace{5in}
{\large Latest Update: \today}
\end{center}
% Create a new 1st level heading
\chapter*{Mathematical Notation}
Suppose that $X:\Omega\to\mathbb{R}^p$ is a random variable. Sometimes we identify $X$ with its image $\mathrm{im}\,X$ for simplicity.
\begin{itemize}
\item $X\sim P(x)$: The random variable $X$ follows the distribution $P(x)$.
\item $X_i\stackrel{\text{i.i.d}}{\sim}P(x)\;(i=1,\cdots,N)$: The independent and identically distributed random vectors $X_i$ follow the distribution $P(x)$. \item $\mathbf{S}_N=\{X_i\}_{i=1}^N\stackrel{\text{i.i.d}}{\sim}P(x)$: $\mathbf{S}_N$ is a simple random sample from the distribution $P(x)$.
\end{itemize}
\chapter{Introduction}
\section{Terminology and Framework}
\begin{itemize}
\item \textbf{Data generating process}: $X$ is a $p$-dimensional random variable and $Y$ is a 1-dimensional random variable. $(X,Y)$ follows the distribution $P(x,y)$.
\begin{itemize}
\item Input vector: $X\in D\subseteq\mathbb{R}^p$.
\item Output vector: $Y\in G\subseteq\mathbb{R}$.
\item Data: Given the training sample $\mathbf{S}_{N}=\{(X_i,Y_i)\}_{i=1}^{N}\stackrel{\text{i.i.d}}{\sim}P(x,y)$, the training data or training set consists of the realization value of the sample, that is
$$
T_N=\{(x_1,y_1),(x_2,y_2),\cdots,(x_{N},y_{N})\}.
$$
For another sample $\mathbf{S}_{M}'=\{(X_i',Y_i')\}_{i=1}^{M}\stackrel{\text{i.i.d}}{\sim}P(x,y)$, the test sample, we define the test data or test set as the realization value of $\mathbf{S}_{M}'$
$$
Q_M=\{(x'_{1},y'_{1}),(x'_{2},y'_{2}),\cdots,(x'_{M},y'_{M})\}.
$$
\end{itemize}
\item \textbf{Objective}: Given the training set $T_N$, find a decision function $\hat{f}\in\mathcal{F}\subset \mathbb{R}^{(\mathbb{R}^p)}$ that produces as little expected prediction error (EPE)
\[
\mathrm{E}_{X,Y}[L(Y,f(X))]
\]
as possible.
\begin{itemize}
\item Decision function: $f:\mathbb{R}^p\supset D\to\mathbb{R},x\mapsto f(x)$ serves to make a prediction of $Y$, given a specified value $x$ of $X$.
\item Loss function: $L:\mathbb{R}^2\to [0,\infty)$ is a non-negative function that satisfies
$$L(t_1,t_2)=0\iff t_1=t_2.$$
$L(Y,f(X))$ normally has the form of
\[
L_2=(Y-f(X))^2\ \text{ or }\ L_1=|Y-f(X)|\ \text{ or }\ L_I=1_{Y\ne f(X)}\ \text{ or }\ L_C=-\left[Y\log f(X)+(1-Y)\log(1-f(X))\right].
\]
\item Hypothesis space: $\mathcal{F}$ is the collection of all candidate decision functions $f$. In some cases, we suppose that each candidate $f$ can be specified by some parameters; then $\mathcal{F}=\{f_\theta\mid Y=f_\theta(X),\;\theta\in\Theta\}$ can be described by the parameter space $\Theta$.
\end{itemize}
If the probability distribution of $(X,Y)$ were known to us, we could in principle find the optimal solution $\tilde{f}$ of the following minimization problem
\[
\tilde{f}=\arg\min_{f\in\mathcal{F}}\mathrm{E}_{X,Y}[L(Y,f(X))]
\]
by virtue of optimization theory. In other words, what needs to be settled would be purely an optimization problem rather than a statistical one. In particular, in the case of $Y=f(X)$, if $f\in\mathcal{F}$, then the optimal $\tilde{f}$ coincides with the function $f$ itself and the minimum of EPE reaches 0.
Unfortunately, the specific distribution of $(X,Y)$ is typically inaccessible in reality. Thus we can only exploit the training data to estimate an acceptable decision function $\hat{f}$, accepting that $\hat{f}$ in general has a greater EPE than $\tilde{f}$.
\item \textbf{Learning Algorithm}:
A learning algorithm is a measurable mapping $\mathsf{A}:(D\times G)^\mathbb{N}\to \mathcal{F}$ sending the training set $T_N$ to the decision function $\mathsf{A}(T_N)=\hat{f}\in \mathcal{F}$, where $T_N\in(D\times G)^N$ is identified with an element of $(D\times G)^\mathbb{N}$ by the natural inclusion.
\begin{definition}[PAC identifiability]
Assume $\tilde{f}=\arg\min_{f\in\mathcal{F}}\mathrm{E}[L(Y,f(X))]
$. If there exists a learning algorithm $\mathsf{A}$ such that for all $\varepsilon>0$, $\delta>0$ and $P(x,y)$ on $\mathbb{R}^{p+1}$, there exists a positive integer $N^*$ such that as long as $N\ge N^*$,
\[
\mathrm{P}_{\mathbf{S}_N}\left(\mathrm{E}_{X,Y}[L(Y,(\mathsf{A}(\mathbf{S}_N))(X))\mid \mathbf{S}_N]-\mathrm{E}_{X,Y}[L(Y,\tilde{f}(X))] \le \varepsilon\right) \ge 1-\delta,
\]
we say that $\mathsf{A}$ can PAC (probably approximately correctly) identify $\tilde{f}$ from $\mathcal{F}$ and that $\tilde{f}$ is PAC identifiable.
\end{definition}
\item \textbf{Optimization strategies}: Since the distribution $P(x,y)$ is unknown, the EPE minimization problem cannot be solved directly. We have developed two major strategies to produce a tractable optimization problem in its place.
\begin{itemize}
\item Empirical risk minimization: according to the law of large numbers, if some regularity conditions\footnote{For example, the uniform law of large numbers can be applied here. The details are consigned to the appendix.} hold, then as $N\to\infty$, the solution of the following minimization problem converges to the theoretically optimal solution $\tilde{f}$.
\[
\min_{f\in\mathcal{F}}\frac{1}{N}\sum_{i=1}^{N}L(Y_i,f(X_i))
\]
\item Structural risk minimization: In reality, the size $N$ of the training set is limited, and accordingly empirical risk minimization may not produce a function $\hat{f}$ sufficiently close to $\tilde{f}$; we will later elaborate on this phenomenon, called ``overfitting''. However, if we add a regularizer or penalty term $\lambda J(f)$ to penalize the complexity of the decision function $f$, as follows,
\[
\min_{f\in\mathcal{F}}\frac{1}{N}\sum_{i=1}^{N}L(Y_i,f(X_i))+\lambda J(f),
\]
it may lead to a better result.
\end{itemize}
\end{itemize}
We can easily relax the assumption that $Y$ is a 1-dimensional random variable and suppose instead that $Y$ is an $n$-dimensional random vector. For example, in multi-class classification, $Y$ is often assumed to be an $n$-dimensional random vector with components in $[0,1]$ summing to 1 (e.g.\ a one-hot encoding of the class), and the loss function is taken to be the cross-entropy
\[
L_C=-\sum_{i=1}^{n}Y_i\log (f(X)_i).
\]
There are other possible objectives that involve more information about the distribution than the expectation $\mathrm{E}[L(Y,f(X))]$ alone. For instance, we may seek to solve
\[
\tilde{f}=\arg\min_{f\in\mathcal{F}}\left(\mathrm{E}[L(Y,f(X))]+c_1\mathrm{Var}[L(Y,f(X))]+c_2\,\mathrm{ExtremeValueRisk}[L(Y,f(X))]\right).
\]
More generally, suppose $\mathrm{Div}(P_1,P_2)$ is a non-negative function measuring the difference between two distributions $P_1$ and $P_2$. We may then seek a conditional distribution $\hat{P}(y|x)$ that minimizes $$\mathrm{E}[\mathrm{Div}(P(Y|X),\hat{P}(Y|X))].$$
\section{Optimal Decision Functions in Special Cases}
Supposing $\mathcal{F}$ is the collection of all functions $f:D\to G$, let us calculate the form of the optimal decision function $\tilde{f}$ for some specific loss functions.
\subsection{Loss function for quantitative output variables: squared error loss}
Let $Y\in\mathbb{R}$ be a quantitative variable, and take the most
common and convenient loss function, the squared error loss
\[
L(Y,f(X))=(Y-f(X))^2.
\]
In this case, the problem of minimizing the expected prediction error becomes
\[
\min_{f\in\mathcal{F}}\mathrm{E}[(Y-f(X))^2]=\min_{f\in\mathcal{F}}\int (y-f(x))^2\;\mathrm{dP}(x,y).
\]
Note that
\[\mathrm{E}[(Y-f(X))^2]=\mathrm{E}[\mathrm{E}[(Y-f(X))^2|\; X]]=\int\mathrm{E}[(Y-f(x))^2|\; X=x]\;\mathrm{dP}(x).\]
It suffices to minimize EPE pointwise, that is,
\begin{align*}
&\ \ \min_{f(x)\in\mathbb{R}}\mathrm{E}[(Y-f(x))^2|\; X=x]\\
&=\min_{c\in\mathbb{R}}\mathrm{E}[(Y-c)^2|\; X=x]\\
&=\min_{c\in\mathbb{R}}\mathrm{Var}[Y-c|\; X=x]+(\mathrm{E}[(Y-c)|\; X=x])^2\\
&=\min_{c\in\mathbb{R}}\mathrm{Var}[Y|\; X=x]+(\mathrm{E}[Y|\; X=x]-c)^2.\\
\end{align*}
We see the optimal solution is
\[
\tilde{f}(x) = \mathrm{E}[Y|\; X=x].
\]
Thus the best prediction of $Y$ at any point $X = x$ is the conditional expectation, when ``best'' is measured by average squared error. \\
Later in this book we are to develop effective methods to estimate the conditional expectation $\mathrm{E}[Y|\; X=x]$.
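As a quick numerical check of this result, the following Python sketch simulates a toy model of our own choosing (it is not part of the theory above): predicting with the conditional mean attains a smaller mean squared error than a perturbed predictor.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(-2, 2, size=100_000)
y = np.sin(x) + rng.normal(scale=0.3, size=x.size)  # Y = sin(X) + noise, so E[Y|X=x] = sin(x)

mse_cond_mean = np.mean((y - np.sin(x)) ** 2)          # predict with the conditional mean
mse_shifted   = np.mean((y - (np.sin(x) + 0.2)) ** 2)  # any other predictor does worse
print(mse_cond_mean, mse_shifted)                      # roughly 0.09 vs 0.13
\end{verbatim}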
\subsection{Loss function for categorical output variable: 0-1 indicator}
Assume that $Y\in G$ is a categorical variable and that the set of possible categories is $G=\{G_1,G_2,\cdots,G_K\}$. This time the 0--1 loss function
\[
L(Y,f(X))=1_{Y\ne f(X)}=
\begin{cases}
0,&Y=f(X),\\
1,&Y\ne f(X),
\end{cases}
\]
is adopted for simplicity. Likewise, it suffices to minimize the EPE pointwise:
\begin{align*}
&\ \ \min_{f(x)\in G}\mathrm{E}[1_{Y\ne f(x)}|\; X=x]\\
&=\min_{g\in G}\mathrm{E}[1-1_{Y=g}|\; X=x]\\
&=\min_{g\in G}\left(1-\mathrm{P}(Y=g|\; X=x)\right).
\end{align*}
And the optimal solution is
\[
\tilde{f}(x)=\arg\max_{g\in G} \mathrm{P}(Y=g|\; X=x).
\]
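As a tiny illustration (a Python sketch; the conditional probabilities below are made up for the example), the optimal rule simply picks the most probable category at the query point.
\begin{verbatim}
# Hypothetical conditional distribution P(Y = g | X = x) at a single point x
p_cond = {"G1": 0.2, "G2": 0.5, "G3": 0.3}

# Bayes classifier: argmax over categories of P(Y = g | X = x)
f_tilde_x = max(p_cond, key=p_cond.get)
print(f_tilde_x)  # "G2"
\end{verbatim}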
\section{Generalization Error Bound}
As mentioned before, we always hope that $\hat{f}$ has as small an EPE as possible. The EPE is also called the generalization error, indicating that it gauges the performance of the selected function $\hat{f}$ in a general sense. We also emphasize that, without knowing the probability distribution of $(X,Y)$, there is no way to calculate the expectation of $L(Y,f(X))$. In practice, it is typical to analyze an upper bound on the generalization error to describe the generalization ability of the selected function $\hat{f}$.
For the convenience of description, let's denote the expected prediction error and the empirical error respectively by
\[
R(f)=\mathrm{E}[L(Y,f(X))]\quad\text{ and }\quad\widehat{R}(f)=\frac{1}{N} \sum_{i=1}^{N} L\left(Y_{i}, f\left(X_{i}\right)\right).
\]
In the case of binary classification with a finite hypothesis space, deriving a generalization error bound is relatively simple. In fact, we have the following result.
\begin{theorem}
Suppose that the training sample $\mathbf{S}_N=\{(X_1,Y_1),(X_2,Y_2),\cdots,(X_{N},Y_{N})\}$, where $X_i\in D\subseteq\mathbb{R}^p$ and $Y_i\in\{+1,-1\}$, is independently generated from the distribution $P(x,y)$. If the hypothesis space is a finite set $\mathcal{F}=\{f_1,f_2,\cdots,f_d\}$ and the 0-1 loss function is taken, then each of the two bounds
\[
R(f) \le \widehat{R}(f)+\sqrt{\frac{1}{2 N}\left(\ln d+\ln \frac{1}{\delta}\right)}
\quad\text{and}\quad
\widehat{R}(f)-\sqrt{\frac{1}{2 N}\left(\ln d+\ln \frac{1}{\delta}\right)}\le R(f)
\]
holds for all $f\in \mathcal{F}$ with a probability of not less than $1-\delta$ (and both hold simultaneously with probability at least $1-2\delta$).
\end{theorem}
\begin{proof} Hoeffding's inequality states that if $X_1,\cdots, X_n$ are independent random variables bounded by the interval $[0, 1]$, then $\overline{X}=\sum_{i=1}^{n}X_i/n$ satisfies
\[
\mathrm{P}(\mathrm{E}[\overline{X}]-\overline{X} \geq t) \leq e^{-2 n t^{2}},\quad\mathrm{P}(\overline{X}-\mathrm{E}[\overline{X}] \geq t) \leq e^{-2 n t^{2}}
\]
for all $t>0$. Applying this inequality to the 0-1 losses $L(Y_j,f_i(X_j))\in[0,1]$, we get
\[
\mathrm{P}\left(R(f_i)-\widehat{R}(f_i) \ge t\right) \leq e^{-2 N t^{2}},\quad\mathrm{P}\left(\widehat{R}(f_i)-R(f_i) \ge t\right) \leq e^{-2 N t^{2}}\quad(i=1,2,\cdots,d).
\]
Let $A_i$ denote the event $\{R(f_i)-\widehat{R}(f_i) \ge t\}$. We have
\[
\mathrm{P}\left(\bigcup_{i=1}^d A_i\right)\le \sum_{i=1}^d\mathrm{P}\left( A_i\right)\le de^{-2 N t^{2}}
\implies \mathrm{P}\left(\bigcap_{i=1}^d A_i^c\right)\ge 1- de^{-2 N t^{2}}.
\]
In other words, for any $f\in \mathcal{F}$, with a probability of not less than $1- de^{-2 N t^{2}}$ we have
\[
R(f) - \widehat{R}(f)<t, \quad\text{ i.e., }\quad R(f) < \widehat{R}(f)+t.
\]
Likewise, letting $B_i$ denote the event $\{\widehat{R}(f_i)-R(f_i) \ge t\}$, we deduce that with probability not less than $1- de^{-2 N t^{2}}$, for any $f\in \mathcal{F}$,
\[
\widehat{R}(f) -R(f) <t, \quad\text{ i.e., }\quad \widehat{R}(f)-t<R(f).
\]
Take
\[
t=\sqrt{\frac{1}{2 N}\left(\ln d+\ln \frac{1}{\delta}\right)}
\]
so that $d e^{-2 N t^{2}}=\delta$; each of the two bounds in the theorem then follows, which completes the proof.
\end{proof}
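To get a feel for the bound (a small Python sketch; the particular numbers are arbitrary), one can evaluate the half-width $\sqrt{(\ln d+\ln(1/\delta))/(2N)}$ for a few sample sizes.
\begin{verbatim}
import numpy as np

def finite_class_halfwidth(N, d, delta):
    """Half-width of the band in Theorem 1.1: sqrt((ln d + ln(1/delta)) / (2N))."""
    return np.sqrt((np.log(d) + np.log(1.0 / delta)) / (2.0 * N))

for N in (100, 1_000, 10_000):
    print(N, finite_class_halfwidth(N, d=100, delta=0.05))
\end{verbatim}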
It would be natural to consider binary classification problems with an infinite hypothesis space, which seems a more realistic assumption. To characterize the complexity of an infinite hypothesis space, we have to introduce some new concepts.
\begin{definition}[dichotomies]
The \emph{dichotomies} generated by $\mathcal{F}\subseteq\{+1,-1\}^D$ on the points $x_{1}, \cdots, x_{n}\in D$ are defined by
\[
\mathcal{F}\left({x}_{1}, \cdots, {x}_{n}\right)=\left\{\left(f\left({x}_{1}\right), \cdots, f\left({x}_{n}\right)\right) \mid f \in \mathcal{F}\right\},
\]
where $\{+1,-1\}^D$ consists of all mappings from $D$ to $\{+1,-1\}$.
\end{definition}
$f$ sends each $x_i$ to a specific value $+1$ or $-1$ as if it endows each $x_i$ with a label. Thus we see $x_{1}, \cdots, x_{n}$ can be classified into two categories by these labels and that different $f$ may result in different ways of classification.
\begin{definition}[growth function]
The \emph{growth function} is defined for a hypothesis space $\mathcal{F}$ by
\[
\Pi_{\mathcal{F}}(n)=\max _{\left\{x_{1}, \cdots, x_{n}\right\} \subseteq D}\left|\mathcal{F}\left({x}_{1}, \cdots, {x}_{n}\right)\right|,
\]
where $|\cdot|$ denotes the cardinality (number of elements) of a set.
\end{definition}
$\Pi_{\mathcal{F}}(n)$ is the maximum number of dichotomies that can be generated by $\mathcal{F}$ on any $n$ points. Since $\mathcal{F}\left({x}_{1}, \cdots, {x}_{n}\right)\subseteq\{+1,-1\}^n$, it clearly holds that
\[
\Pi_{\mathcal{F}}(n)\le 2^n.
\]
If $\mathcal{F}$ is capable of generating all possible dichotomies on $\{x_{1}, \cdots, x_{n}\}$, then $\mathcal{F}\left({x}_{1}, \cdots, {x}_{n}\right)=\{+1,-1\}^n$ and we say that $\mathcal{F}$ can \emph{shatter} $x_{1}, \cdots, x_{n}$. This signifies that $\mathcal{F}$ is as diverse as can be on this particular sample.
\begin{definition}[VC dimension]
The Vapnik-Chervonenkis dimension of a hypothesis space $\mathcal{F}$, denoted by $d_{VC}(\mathcal{F})$ or simply $d_{VC}$, is the largest value of $n$ for which $\Pi_{\mathcal{F}}(n)=2^n$. That is,
\[
d_{VC}(\mathcal{F})=\max \left\{m \mid \Pi_{\mathcal{F}}(m)=2^{m}\right\}
\]
If $\Pi_{\mathcal{F}}(n)=2^n$ for all $n$, then $d_{VC}(\mathcal{F})=\infty$.
\end{definition}
It is straightforward to show that if $\Pi_{\mathcal{F}}(m)=2^{m}$ then $\Pi_{\mathcal{F}}(n)=2^{n}$ for all $n<m$, since $\mathcal{F}\left({x}_{1}, \cdots, {x}_{m}\right)=\{+1,-1\}^m\implies\mathcal{F}\left({x}_{1}, \cdots, {x}_{m-1}\right)=\{+1,-1\}^{m-1}$. Thus we have
\[
\begin{cases}
\Pi_{\mathcal{F}}(n)=2^n,&\text{if }n\le d_{VC}(\mathcal{F}),\\
\Pi_{\mathcal{F}}(n)<2^n,&\text{if }n> d_{VC}(\mathcal{F}).
\end{cases}
\]
If $\Pi_{\mathcal{F}}(k)<2^k$, we say $k$ is a \emph{break point} for $\mathcal{F}$. Next we are to find a more accurate bound for $\Pi_{\mathcal{F}}(n)$ at the break point.
\begin{definition} Let $\mathfrak{G}=\{\mathcal{F}\subseteq\{+1,-1\}^D\mid\Pi_{\mathcal{F}}(k)< 2^k\}$. Define
\[
B(n, k)=\max _{\mathcal{F}\in\mathfrak{G},\,\left\{x_{1}, \cdots, x_{n}\right\} \subseteq D}\left|\mathcal{F}\left({x}_{1}, \cdots, {x}_{n}\right)\right|.
\]
\end{definition}
$B(n, k)$ is the maximum number of dichotomies which can be generated by some hypothesis space on $n$ points in $D$ such that these dichotomies cannot shatter any subsets of size $k$ of the $n$ points. If $k$ is a break point for $\mathcal{F}$ and $n\ge k$, then we can see $\Pi_{\mathcal{F}}(n)\le B(n, k)$ just from their definitions.
To evaluate $B(n, k)$, we start with the two boundary conditions $k = 1$ and $n = 1$:
\[
\begin{aligned} B(n, 1) &=1, \\
B(1,k) &=2 \text { for } k>1. \end{aligned}
\]
$B(n,1) = 1$ for all $n$, since if no subset of size 1 can be shattered, then only one dichotomy is allowed: a second, different dichotomy would have to differ on at least one point, and that subset of size 1 would then be shattered. $B(1,k) = 2$ for $k > 1$, since in this case there do not even exist subsets of size $k$; the constraint is vacuously true and we have 2 possible dichotomies ($+1$ and $-1$) on the one point.
\begin{theorem}
If $d_{VC}<\infty$, then for all $n\ge1$,
\[
\Pi_{\mathcal{F}}(n) \le \sum_{i=0}^{d_{VC}}\binom{n}{i}\le n^{d_{VC}}+1.
\]
\end{theorem}
\begin{corollary}
If $d_{VC}<\infty$, then for all $n\ge d_{VC}$,
\[
\Pi_{\mathcal{F}}(n) \le\left(\frac{\mathrm{e}\cdot n }{d_{VC}}\right)^{d_{VC}}.
\]
\end{corollary}
\begin{theorem}
Suppose that the training sample $\mathbf{S}_N=\{(X_1,Y_1),(X_2,Y_2),\cdots,(X_{N},Y_{N})\}$ where $X_i\in D\subseteq\mathbb{R}^p$ and $Y_i\in\{+1,-1\}$ is independently generated from the distribution $P(x,y)$. If the 0-1 loss function is taken, then for any $\mathcal{F}\subseteq\{+1,-1\}^D$, $f\in\mathcal{F}$, tolerance $\delta > 0$,
\[
R(f) \le \widehat{R}(f)+\sqrt{\frac{8}{N} \ln \frac{4 \Pi_{\mathcal{F}}(2N)}{\delta}}\le\widehat{R}(f) +\sqrt{\frac{8}{N} \ln \frac{4 (2N)^{d_{VC}}+4}{\delta}}
\]
with probability $\ge 1-\delta$.
\end{theorem}
\begin{corollary}
Suppose that the training sample $\mathbf{S}_N=\{(X_1,Y_1),(X_2,Y_2),\cdots,(X_{N},Y_{N})\}$ where $X_i\in D\subseteq\mathbb{R}^p$ and $Y_i\in\{+1,-1\}$ is independently generated from the distribution $P(x,y)$. If the 0-1 loss function is taken, then for any $\mathcal{F}\subseteq\{+1,-1\}^D$, $f\in\mathcal{F}$, $N\ge d_{VC}$, tolerance $\delta > 0$,
\[
R(f) \le \widehat{R}(f)+\sqrt{\frac{8 d_{VC} \ln \frac{2 \mathrm{e} N}{d_{VC}}+8 \ln \frac{4}{\delta}}{N}}
\]
with probability $\ge 1-\delta$.
\end{corollary}
\begin{theorem}
If $d_{VC}<\infty$, the empirical risk minimization algorithm
$$\mathsf{A}_E:T_N\longmapsto \hat{f}=\arg\min_{f\in\mathcal{F}}\widehat{R}(f) $$
can PAC (probably approximately correctly) identify $\tilde{f}$ from $\mathcal{F}$.
\end{theorem}
\begin{proof}
Given any $\varepsilon>0$, $\delta>0$ and $P(x,y)$ on $\mathbb{R}^{p+1}$, we apply Theorem 1.1 to the singleton hypothesis space $\{\tilde{f}\}$ (so $d=1$), with $\delta/2$ in place of $\delta$, and take
\[
\frac{\varepsilon}{2}=\sqrt{\frac{\ln\left( 2 / \delta\right)}{2 N_1}},
\]
which leads to
\[
P\left(\widehat{R}(\tilde{f})-R(\tilde{f}) > \frac{\varepsilon}{2}\right) \le \frac{\delta}{2}.
\]
According to Theorem 1.3, again with $\delta/2$ in place of $\delta$, let
\[
\frac{\varepsilon}{2}=\sqrt{\frac{8}{N_2} \ln \frac{8 (2N_2)^{d_{VC}}+8}{\delta}}
\]
and then we have
\[
P\left(R(\hat{f})-\widehat{R}(\hat{f}) > \frac{\varepsilon}{2}\right) \le \frac{\delta}{2}.
\]
Hence there must be that
\[
\begin{aligned}
\mathrm{P}\left(R(\hat{f})-R(\tilde{f}) > \varepsilon\right) &\le \mathrm{P}\left(R(\hat{f})-R(\tilde{f})+\widehat{R}(\tilde{f})-\widehat{R}(\hat{f}) > \varepsilon\right)\qquad\qquad\qquad\left(\text{ since }\widehat{R}(\tilde{f})-\widehat{R}(\hat{f})\ge 0\ \right)\\
&\le\mathrm{P}\left(\widehat{R}(\tilde{f})-R(\tilde{f}) > \frac{\varepsilon}{2}\;\text{ or }\;R(\hat{f})-\widehat{R}(\hat{f}) > \frac{\varepsilon}{2}\right)\\
&\le\mathrm{P}\left(\widehat{R}(\tilde{f})-R(\tilde{f}) > \frac{\varepsilon}{2}\right)+\mathrm{P}\left(R(\hat{f})-\widehat{R}(\hat{f}) > \frac{\varepsilon}{2}\right)\\
&\le\frac{\delta}{2}+\frac{\delta}{2}\\
&\le\delta.
\end{aligned}
\]
Fixing $\varepsilon$ and solving for $\delta$ gives
\[
\delta=\frac{2}{\mathrm{e}^{N_1\varepsilon^2/2}}=\frac{8 (2N_2)^{d_{VC}}+8}{\mathrm{e}^{N_2\varepsilon^2/32}}.
\]
Both expressions tend to $0$ as the sample size grows; hence, for any prescribed $\delta>0$, both tail bounds hold with $N_1=N_2=N$ once $N$ is sufficiently large. We conclude that there exists a positive integer $N^*$ such that for all $N\ge N^*$,
\[
\mathrm{P}\left(\mathrm{E}[L(Y,\hat{f}(X))]-\mathrm{E}[L(Y,\tilde{f}(X))] \le \varepsilon\right) \ge 1-\delta.
\]
\end{proof}
\section{Tradeoff}
The generalization error bound $\widehat{R}(f)+\varepsilon(d_{VC}, N, \delta)$ helps us understand what ``overfitting'' is and why structural risk minimization can help overcome it.
Assume $N$ is fixed and $\hat{f}=\arg\min_{f\in\mathcal{F}}\widehat{R}(f)$. By increasing $d_{VC}$ we can extend the hypothesis space $\mathcal{F}$ and consequently reduce, or at least maintain, the empirical error $\widehat{R}(\hat{f})$; in other words, $\widehat{R}(\hat{f})$ is non-increasing in $d_{VC}$. However, the cost is a greater $\varepsilon(d_{VC}, N, \delta)$, since $\varepsilon(d_{VC}, N, \delta)$ is strictly increasing in $d_{VC}$. Noticing that $\widehat{R}(\hat{f})$ cannot be less than 0 and that $\varepsilon(d_{VC}, N, \delta)\to\infty$ as $d_{VC}\to\infty$, we can assert that when $d_{VC}$ is sufficiently large, the generalization error bound will also be increasing in $d_{VC}$. As a result, although we can find a function $\hat{f}$ with a perfect performance on the training set, the generalization ability of $\hat{f}$ can be very poor; that is, $\hat{f}$ is likely to perform badly on samples outside the training set. The term ``overfitting'' refers exactly to such a case, in which the selected function $\hat{f}$ has a quite small empirical error along with a tremendous generalization error.
Therefore, we have a tradeoff: more complex models help reduce $\widehat{R}(f)$
and hurt $\varepsilon(d_{VC}, N, \delta)$. The optimal model is a compromise that minimizes a combination of the two terms. Now it is clear why a penalty term is introduced in structural risk minimization: it takes the size of the hypothesis space $\mathcal{F}$, or equivalently the complexity of the candidate functions, into consideration. Thus when $N$ is limited, this method promisingly leads to a smaller generalization error.
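The tradeoff can be seen empirically. The following Python sketch is a toy simulation of our own (polynomial degree stands in for model complexity here, not $d_{VC}$ itself): fitting polynomials of increasing degree by empirical risk minimization, the training error keeps shrinking while the test error eventually grows.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
f = lambda x: np.sin(2 * np.pi * x)
x_tr = rng.uniform(-1, 1, 30);   y_tr = f(x_tr) + rng.normal(scale=0.3, size=30)
x_te = rng.uniform(-1, 1, 1000); y_te = f(x_te) + rng.normal(scale=0.3, size=1000)

for deg in (1, 3, 9, 15):        # increasing model complexity
    coef = np.polyfit(x_tr, y_tr, deg)                       # least-squares polynomial fit
    err_tr = np.mean((np.polyval(coef, x_tr) - y_tr) ** 2)   # empirical error
    err_te = np.mean((np.polyval(coef, x_te) - y_te) ** 2)   # proxy for generalization error
    print(deg, round(err_tr, 3), round(err_te, 3))
\end{verbatim}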
\chapter{Linear Model}
\section{Finite Sample Linear Model}
\subsection{Statistical model setup}
The linear model supposes that the data generating process is
\[
Y= X^T\beta+\varepsilon,
\]
where $X=(1,X^{(1)},\cdots,X^{(p)})\in\mathbb{R}^{p+1}$, $Y\in\mathbb{R}$, $\beta\in\mathbb{R}^{p+1}$ is an unknown parameter and $\varepsilon$ is an error term which cannot be directly observed. Without loss of generality, we can always assume that $\E[\varepsilon|\;X]=0$.
Given a finite sample $(\X,\Y)$ of size $N$, the linear model implies
\[
\Y = {\mathbf{X}} \beta +\vep,
\]
where
\[
\Y=
\begin{pmatrix}
Y_1\\
Y_2\\
\vdots\\
Y_N
\end{pmatrix}
,\;{\mathbf{X}}=
\begin{pmatrix}
1& x_{11}&\cdots& x_{1p}\\
1& x_{21}&\cdots &x_{2p}\\
\vdots&\vdots& &\vdots\\
1& x_{N1}&\cdots &x_{Np}
\end{pmatrix}
\renewcommand*{\arraystretch}{1.5}
=\begin{pmatrix}
X_1^T\\
X_2^T\\
\vdots\\
X_N^T
\end{pmatrix}
\renewcommand*{\arraystretch}{1}
,\;
\beta=
\begin{pmatrix}
\beta_0\\
\beta_1\\
\vdots\\
\beta_p
\end{pmatrix}
,\;
\vep=
\begin{pmatrix}
\varepsilon_1\\
\varepsilon_2\\
\vdots\\
\varepsilon_N
\end{pmatrix}.
\]
\subsection{Ordinary least squares estimate}
If we take squared error loss, then the best prediction of $Y$ is
\[
\tilde{f}(X) = \mathrm{E}[Y|\; X]=X^T\beta.
\]
We can use the least squares method to estimate the parameter $\beta$ in the linear model by minimizing the residual sum of squares
\[
\hat{\beta}=\arg\min_{\theta\in\mathbb{R}^{p+1}}\, RSS(\theta)=\arg\min_{\theta\in\mathbb{R}^{p+1}}\sum_{i=1}^N(Y_i-X_i^T\theta)^2.
\]
If $\X$ is of full column rank, the optimization problem has a unique solution
\[
\hat{\beta}=(\X^T\X)^{-1}\X^T\Y.
\]
The statistical properties of the OLS estimator $\hat{\beta}$ are remarkable. According to the Gauss--Markov theorem, under Assumptions 1.1--1.4,
\begin{enumerate}
\item[1.1]linearity:\quad $Y_i= X_i^T\beta+\varepsilon_i$, $(i=1,2,\cdots,N)$,
\item[1.2] strict exogeneity:\quad $\E[\vep|\;\X]=0$,
\item[1.3] no multicollinearity:\quad $\mathrm{P}(\mathrm{rank}(\X)=p+1)=1$,
\item[1.4] spherical error variance:\quad $\Var[\vep|\;\X]=\sigma^2\mathbf{I}_N$ with $\sigma^2>0$,
\end{enumerate}
$\hat{\beta}$ is the best linear unbiased estimator (BLUE). That is, for any unbiased estimator $\hat{\theta}$ of $\beta$ that is linear in $\Y$,
\[
\Var[\hat{\theta}|\;\X]\ge\Var[\hat{\beta}|\;\X]
\]
in the matrix sense.
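As a minimal numerical illustration (a NumPy sketch on simulated data; the particular numbers are our own, not part of the model above), $\hat{\beta}=(\X^T\X)^{-1}\X^T\Y$ can be computed directly from the design matrix.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, p = 200, 3
X = np.hstack([np.ones((N, 1)), rng.normal(size=(N, p))])   # design matrix with intercept
beta_true = np.array([1.0, 2.0, -1.0, 0.5])
Y = X @ beta_true + rng.normal(scale=0.5, size=N)            # linear model with noise

# OLS estimate (X^T X)^{-1} X^T Y, computed via a linear solve for numerical stability
beta_hat = np.linalg.solve(X.T @ X, X.T @ Y)
print(beta_hat)   # close to beta_true
\end{verbatim}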
\chapter{SVM}
\section{Linear SVM}
Assume that $X\in\mathbb{R}^p$ and that $Y\in G$ is a categorical variable with the set of possible categories $G=\{+1,-1\}$. Linear support vector machine uses the following kind of decision functions to model the relationship between $X$ and $Y$:
\[
Y=\mathrm{sign}\left(X^{T} \beta+\beta_{0}\right).
\]
Suppose we have some training data $T_N=\{(x_1,y_1),(x_2,y_2),\cdots,(x_{N},y_{N})\}$. If we can find a hyperplane $x^{T} \beta+\beta_{0}=0$ such that
\begin{align*}
x_i^{T} \beta+\beta_{0}&>0 \text{ if } y_i=+1,\\
x_i^{T} \beta+\beta_{0}&<0 \text{ if } y_i=-1,
\end{align*}
then the empirical loss of the decision function $\hat{f}(X)=\mathrm{sign}\left(X^{T} \beta+\beta_{0}\right)$ on the training data $T_N$ can reach 0. In this case, we say the two classes are linearly separable, and our aim is to maximize the ``margin'' shown as follows.
\begin{center}
\begin{tikzpicture}[node distance=2cm]
\draw[->](-1,0)--(5,0) node[right,scale=1]{$x^1$}; %x axis
\draw[->](0,-1)--(0,4) node[above,scale=1]{$x^2$}; % y axis
\draw[-](0.5,0.5)--(4,3);
\draw[-,dashed](0.5-7*0.017,0.75-5*0.017)--(4-7*0.017,3.25-5*0.017);
\draw[-,dashed](0.5+7*0.017,0.25+5*0.017)--(4+7*0.017,2.75+5*0.017);
\draw (2.5,0.8)node[right,scale=0.8]{$x^{T} \beta+\beta_{0}=-M$};
\draw (0.5,3.5)node[right,scale=0.8]{$x^{T} \beta+\beta_{0}=M$};
\draw (4.56,3)node[right,scale=0.8]{$x^{T} \beta+\beta_{0}=0$};
\draw[->](1.3,3.2)--(2.08,2.02);
\draw[->](3.8,1)--(3.1,2);
\draw[->](4.5,3)--(4.1,3);
\filldraw [blue] (1,2.5) circle [radius=1.5pt]
(0.5,1.5) circle [radius=1.5pt]
(3,2.7) circle [radius=1.5pt]
(3.1,3.2) circle [radius=1.5pt]
(2.5,2.6) circle [radius=1.5pt]
(1.9,3) circle [radius=1.5pt]
(1.2,2.2) circle [radius=1.5pt];
\filldraw [orange] (4,2.3) circle [radius=1.5pt]
(3.3,1.4) circle [radius=1.5pt]
(4,2) circle [radius=1.5pt]
(2,1) circle [radius=1.5pt]
(1,0.5) circle [radius=1.5pt]
(1.6,0.3) circle [radius=1.5pt]
(4.5,2.2) circle [radius=1.5pt];
\end{tikzpicture}
\end{center}
The distance between the hyperplanes $P_0:x^{T} \beta+\beta_{0}=0$ and $P_1:x^{T} \beta+\beta_{0}=M$ is $\tfrac{M}{\Vert \beta\Vert}$, which can be seen from
\[
M=(x-y)^T\beta\le\Vert x-y\Vert\cdot\Vert \beta\Vert,\quad \forall x\in P_1,\ \forall y\in P_0,
\]
with equality attained when $x-y$ is parallel to $\beta$. Thus the maximization problem can be stated as
\begin{align*}
\max_{\beta, \beta_{0},M}\ &\frac{M}{\Vert \beta\Vert}\\
\text {s.t. }\quad y_{i}\left(x_{i}^{T} \beta+\beta_{0}\right)& \ge M, \ i=1, \ldots, N.
\end{align*}
In general, classes usually overlap in feature space. One way to deal
with the overlap is to still maximize $\tfrac{M}{\Vert \beta\Vert}$, but allow for some points to be on the wrong side of the margin. By defining the slack variables $\xi=\left(\xi_{1}, \xi_{2}, \ldots, \xi_{N}\right)$, we can modify the problem.
\begin{align*}
\max_{\beta, \beta_{0},M}\ &\frac{M}{\Vert \beta\Vert}\\
\text {s.t. }\quad y_{i}\left(x_{i}^{T} \beta+\beta_{0}\right)& \ge M(1-\xi_i), \ i=1, \ldots, N,\\
\xi_i&\ge 0,\\
\sum_{i=1}^{N}\xi_i&\le U.
\end{align*}
Returning to the separable case, note that rescaling $(\beta,\beta_0)$ by $1/M$ leaves the classifier unchanged, so we may set $M=1$ without loss of generality, which yields the following maximization problem
\begin{align*}
\max_{\beta, \beta_{0}}\ &\frac{1}{\Vert \beta\Vert}\\
\text {s.t. }\quad y_{i}\left(x_{i}^{T} \beta+\beta_{0}\right)& \ge 1, \ i=1, \ldots, N,
\end{align*}
which is equivalent to the following quadratic programming problem
\begin{align*}
\min_{\beta, \beta_{0}}\ &\frac{1}{2}\Vert \beta\Vert^2\\
\text {s.t. }\quad y_{i}\left(x_{i}^{T} \beta+\beta_{0}\right)& \ge 1, \ i=1, \ldots, N.
\end{align*}
The optimal solutions of the problem must satisfy some necessary conditions (Karush-Kuhn-Tucker conditions). Consider the Lagrangian function
\[
L(\beta, \beta_{0},\lambda)=\frac{1}{2}\Vert \beta\Vert^2-\sum_{i=1}^{N}\lambda_i\left(y_{i}\left(x_{i}^{T} \beta+\beta_{0}\right)-1\right),
\]
we have
\begin{align*}
&\frac{\partial L}{\partial \beta}=\beta^T-\sum_{i=1}^{N}\lambda_iy_{i}x_i^T=0,\\
&\frac{\partial L}{\partial \beta_0}=\sum_{i=1}^{N}\lambda_iy_{i}=0,\\
&\lambda_i\left(y_{i} \left(x_{i}^{T}\beta+\beta_{0}\right)-1\right)=0,\\
&\lambda_i\ge 0,\\
&y_{i}\left(x_{i}^{T} \beta+\beta_{0}\right)-1\ge 0.
\end{align*}
Eliminating $\beta$ and $\beta_0$, we obtain the dual problem
\begin{align*}
\max_{\lambda}\ &-\frac{1}{2}\sum_{i=1}^{N}\sum_{j=1}^{N}\lambda_i\lambda_jy_{i}y_{j}x_{i}^{T}x_{j}+\sum_{i=1}^{N}\lambda_i\\
\text {s.t. }\quad& \sum_{i=1}^{N}\lambda_iy_{i}=0,\\
&\lambda_i\ge 0.
\end{align*}
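For a concrete sense of the dual, here is a small Python sketch using NumPy and SciPy's general-purpose optimizer, assuming the training data are linearly separable (a dedicated QP solver would be used in practice). It maximizes the dual objective by minimizing its negative, recovers $\beta=\sum_i\lambda_iy_ix_i$, and obtains $\beta_0$ from a support vector.
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

def hard_margin_svm(X, y):
    """Solve the dual above for lambda, then recover (beta, beta_0)."""
    N = len(y)
    G = (y[:, None] * X) @ (y[:, None] * X).T            # G_ij = y_i y_j x_i^T x_j
    obj = lambda lam: 0.5 * lam @ G @ lam - lam.sum()    # negated dual objective
    cons = ({"type": "eq", "fun": lambda lam: lam @ y},)
    res = minimize(obj, np.ones(N) / N, bounds=[(0, None)] * N, constraints=cons)
    lam = res.x
    beta = ((lam * y)[:, None] * X).sum(axis=0)
    sv = lam > 1e-6                                      # support vectors: lambda_i > 0
    beta_0 = np.mean(y[sv] - X[sv] @ beta)               # from y_i (x_i^T beta + beta_0) = 1
    return beta, beta_0
\end{verbatim}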
\chapter{KNN}
\section{Nearest-neighbor methods}
Nearest-neighbor methods use those observations in the training set $T_N$ closest in input space to $x$ to estimate the aforementioned conditional expectation $\mathrm{E}[Y|\; X=x]$. Specifically, the $k$-nearest neighbor fit for $\hat{Y}$ is defined as follows:
\begin{itemize}
\item Quantitative output
\[
\hat{Y}=\hat{f}(x)=\dfrac{1}{k}\sum_{i:x_i\in N_k(x)}y_i,
\]
\item Categorical output
\[
\hat{Y}=\hat{f}(x)=\arg\max_{g\in G}\sum_{i:x_i\in N_k(x)}1_{y_i=g},
\]
\end{itemize}
where $N_k(x)$ is the neighborhood of $x$ defined by the $k$ closest points $x_i$ in the training sample. Closeness implies a metric, which for the moment we assume is the Euclidean distance
\[
\Vert x_1-x_2\Vert_2=\sqrt{(x_1-x_2)^T(x_1-x_2)}.
\]
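A direct Python implementation of this fit (a minimal sketch; no data structures such as k-d trees for fast neighbor search) might look as follows.
\begin{verbatim}
import numpy as np

def knn_predict(x, X_train, y_train, k=5, classify=False):
    """k-nearest-neighbor fit at a single query point x, Euclidean metric."""
    dists = np.linalg.norm(X_train - x, axis=1)   # distances to all training points
    idx = np.argsort(dists)[:k]                   # indices of the k closest points
    if classify:
        labels, counts = np.unique(y_train[idx], return_counts=True)
        return labels[np.argmax(counts)]          # majority vote over the neighborhood
    return y_train[idx].mean()                    # average of the neighbors' outputs
\end{verbatim}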
\chapter{Decision Tree}
Tree-based methods partition the feature space into a set of rectangles, and
then fit a simple model (like a constant) in each one. We first describe a popular tree-based method called CART (classification and regression trees), and later contrast it with C4.5, a major competitor. CART uses a specific hypothesis space $\mathcal{F}$ consisting of functions of the form
\[
f(x)=\sum_{m=1}^{M} c_{m} 1\left[x \in R_{m}\right],
\]
where the $p$-dimensional rectangular regions $R_1,R_2,\cdots,R_M$ constitute a recursive partition of the feature space $\mathbb{R}^p$. That is, if a partition $P_J=(R_1,R_2,\cdots,R_J)$ has been obtained, we cut some rectangular region $R_m$ $(1\le m\le J)$ into two parts by a $(p-1)$-dimensional hyperplane $x^{(i)}=\ell_J$ $(1\le i\le p)$ to produce a new partition $P_{J+1}=(R'_1,R'_2,\cdots,R'_{J+1})$.
\section{Regression Trees}
Assume the output $Y$ is a continuous variable. This leads to the concept of regression trees. Conventionally we choose the squared error loss and accordingly obtain the minimization problem
\begin{align*}
\min_{f\in\mathcal{F}}\mathrm{E}[(Y-f(X))^2]&=\min_{f\in\mathcal{F}}\mathrm{E}\left[\left(\sum_{m=1}^{M} (Y-c_{m}) 1_{X \in R_{m}}\right)^2\right]\\
&=\min_{f\in\mathcal{F}}\mathrm{E}\left[\sum_{m=1}^{M} (Y-c_{m})^2 1_{X \in R_{m}}\right]\\
&=\min_{f\in\mathcal{F}}\int\mathrm{E}\left[\left.\sum_{m=1}^{M} (Y-c_{m})^2 1_{x \in R_{m}}\right|X=x\right]\mathrm{dP}(x)\\
&=\min_{f\in\mathcal{F}}\sum_{m=1}^{M}\int_{ R_{m}}\mathrm{E}\left[\left. (Y-c_{m})^2\right|X=x\right]\mathrm{dP}(x).
\end{align*}
It is easy to see that, whichever partition is specified, the optimal $c_m$ is
\[
\tilde{c}_m = \mathrm{E}[Y|\; X\in R_{m}].
\]
Now let's consider the empirical risk minimization problem
\[
\min_{f\in\mathcal{F}}\frac{1}{N}\sum_{i=1}^{N}L(Y_i,f(X_i))=\min_{f\in\mathcal{F}}\frac{1}{N}\sum_{i=1}^{N}\sum_{m=1}^{M} (Y_i-c_{m})^2 1_{X_i \in R_{m}}.
\]
Thus the proper estimate $\hat{c}_m$ is just the average of the $Y_i$ in region $R_{m}$
\[
\hat{c}_{m}=\operatorname{ave}\left(Y_{i} | X_{i} \in R_{m}\right).
\]
However, finding the best binary partition in terms of minimum sum of squares
is generally computationally infeasible. Hence we proceed with a greedy
algorithm. Starting with all of the data, consider a splitting variable indexed by $r\in\{1,2,\cdots,p\}$ and
split point $s\in\mathbb{R}$, and define the pair of half-planes
\[
R_{1}(r, s)=\left\{X\in\mathbb{R}^p | X^{(r)} \leq s\right\} \text { and } R_{2}(r, s)=\left\{X\in\mathbb{R}^p | X^{(r)}>s\right\}.
\]
Then we seek the splitting variable $r$ and split point $s$ that solve
\begin{align*}
&\min _{r, s}\left[\min _{c_{1}} \sum_{X_{i} \in R_{1}(r, s)}\left(Y_{i}-c_{1}\right)^{2}+\min _{c_{2}} \sum_{X_{i} \in R_{2}(r, s)}\left(Y_{i}-c_{2}\right)^{2}\right]\\
=\ &\min _{r, s} \sum_{X_{i} \in R_{1}(r, s)}\left(Y_{i}-\hat{c}_{1}\right)^{2}+\sum_{X_{i} \in R_{2}(r, s)}\left(Y_{i}-\hat{c}_{2}\right)^{2}.
\end{align*}
Having found the best split, we partition the data into the two resulting
regions and repeat the splitting process on each of the two regions. Then
this process is repeated on all of the resulting regions, stopping the splitting only when some minimum node size (say 5) is reached. Clearly a very large tree grown this way might overfit the data.
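One greedy split step can be sketched in a few lines of Python (our own minimal version; real implementations handle ties, categorical inputs, and stopping rules more carefully).
\begin{verbatim}
import numpy as np

def best_split(X, y):
    """Exhaustive search over splitting variable r and split point s minimizing RSS."""
    best_r, best_s, best_rss = None, None, np.inf
    for r in range(X.shape[1]):
        for s in np.unique(X[:, r]):
            left, right = y[X[:, r] <= s], y[X[:, r] > s]
            if left.size == 0 or right.size == 0:
                continue
            rss = ((left - left.mean()) ** 2).sum() + ((right - right.mean()) ** 2).sum()
            if rss < best_rss:
                best_r, best_s, best_rss = r, s, rss
    return best_r, best_s, best_rss
\end{verbatim}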
\chapter{Neural Networks}
\section{Single hidden layer back-propagation network}
\[
\begin{tikzpicture}[scale=1.5]
%\tikzstyle{every node}=[draw,shape=circle];
\draw (0,0) node [draw,shape=circle] (x1) {$x_1$};
\draw (2,0) node [draw,shape=circle] (x2) {$x_2$};
\draw (4,0) node [draw,shape=circle] (x3) {$x_3$};
\draw (5,0) node (xdots) {$\cdots\cdots$};
\draw (6,0) node [draw,shape=circle] (xp) {$x_p$};
\draw (1,2) node [draw,shape=circle,minimum height=0.92cm] (z1) {$h_1$};
\draw (3,2) node [draw,shape=circle,minimum height=0.92cm] (z2) {$h_2$};
\draw (4,2) node (zdots) {$\cdots\cdots$};
\draw (5,2) node [draw,shape=circle,minimum height=0.92cm] (zm) {$h_M$};
\draw (1.5,4) node [draw,shape=circle,minimum height=0.92cm] (y1) {$y_1$};
\draw (3,4) node [draw,shape=circle,minimum height=0.92cm] (y2) {$y_2$};
\draw (3.75,4) node (ydots) {$\cdots\cdots$};
\draw (4.5,4) node [draw,shape=circle,minimum height=0.92cm] (yk) {$y_K$};
\draw
(x1) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (z1)
(x1) -- node[pos=0.1,below]{} node[pos=0.85,below]{}(z2)
(x1) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (zm)
(x2) -- node[pos=0.05,anchor=90]{} node[pos=0.85,below]{} (z1)
(x2) -- node[pos=0.1,below]{} node[pos=0.85,below]{} (z2)
(x2) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (zm)
(x3) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (z1)
(x3) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (z2)
(x3) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (zm)
(xp) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (z1)
(xp) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (z2)
(xp) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (zm)
(z1) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (y1)
(z1) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (y2)
(z1) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (yk)
(z2) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (y1)
(z2) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (y2)
(z2) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (yk)
(zm) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (y1)
(zm) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (y2)
(zm) -- node[pos=0.1,above]{} node[pos=0.85,above]{} (yk)
;
\end{tikzpicture}
\]
\subsection{Binary classification}
In matrix form,
\[
y=\sigma(W^{(2)}\mathbf{h}+\mathbf{b}^{(2)}),\mathbf{h}=\sigma(W^{(1)}\mathbf{x}+\mathbf{b}^{(1)}),
\]
where $\sigma(z)=\frac{1}{1+e^{-z}}$ is the sigmoid function, applied componentwise.
\subsection{Multi-class classification}
In matrix form,
\[
\mathbf{y}=\mathtt{softmax}(W^{(2)}\mathbf{h}+\mathbf{b}^{(2)}),\mathbf{h}=\sigma(W^{(1)}\mathbf{x}+\mathbf{b}^{(1)}),
\]
where
\[
\mathtt{softmax}_k(\mathbf{x})=\frac{e^{x_k}}{\sum_{i=1}^{K}e^{x_i}}.
\]
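A forward pass through this single-hidden-layer network can be written directly. The following NumPy sketch uses our own shape convention, with $W^{(1)}\in\mathbb{R}^{M\times p}$ and $W^{(2)}\in\mathbb{R}^{K\times M}$; it is an illustration, not a training procedure.
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    z = z - z.max()                      # shift for numerical stability
    e = np.exp(z)
    return e / e.sum()

def forward(x, W1, b1, W2, b2):
    """Single-hidden-layer forward pass: h = sigma(W1 x + b1), y = softmax(W2 h + b2)."""
    h = sigmoid(W1 @ x + b1)             # hidden layer, M units
    return softmax(W2 @ h + b2)          # K class probabilities
\end{verbatim}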
\section{Convolutional neural network}
\[
\mathtt{Conv2d}(X_{C_{\mathrm{in}},H,W})=b_{C_{\mathrm{out}}}\otimes 1_{H,W}+\mathrm{Weight}_{C_{\mathrm{out}},S*S}\star X_{C_{\mathrm{in}},H,W}
\]
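The formula can be unpacked as an explicit (and deliberately naive) loop. The Python sketch below reflects our reading of the notation, with input of shape $(C_{\mathrm{in}},H,W)$ and kernels of shape $(C_{\mathrm{out}},C_{\mathrm{in}},S,S)$; it uses stride 1 and no padding, so the output is slightly smaller than $H\times W$.
\begin{verbatim}
import numpy as np

def conv2d(x, weight, bias):
    """Naive 2-D cross-correlation over all input channels, stride 1, no padding."""
    c_out, c_in, S, _ = weight.shape
    H_out, W_out = x.shape[1] - S + 1, x.shape[2] - S + 1
    out = np.zeros((c_out, H_out, W_out))
    for o in range(c_out):
        for i in range(H_out):
            for j in range(W_out):
                out[o, i, j] = np.sum(weight[o] * x[:, i:i + S, j:j + S]) + bias[o]
    return out
\end{verbatim}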
\chapter*{Appendix}
\section*{1. Hoeffding's inequality}
\begin{lemma}
Let $X$ be a random variable. Then
\[
\mathrm{P}(X>\epsilon) \leq \inf _{t \geq 0} \frac{\mathrm{E}\left[e^{t X}\right]}{e^{t\epsilon}}.
\]
\end{lemma}
\begin{proof}
For any $t>0$, we have
\[
\mathrm{P}(X>\epsilon)
=\mathrm{P}\left(e^{tX}>e^{t\epsilon}\right)
=\mathrm{P}\left(\frac{e^{t X}}{e^{t\epsilon}}>1\right) =\mathrm{E}\left(\mathbbm{1}\left\{\frac{e^{t X}}{e^{t\epsilon}}>1\right\}\right)
\leq\frac{\mathrm{E}\left[e^{t X}\right]}{e^{t\epsilon}}.
\]
Notice that if $t=0$,
\[
\frac{\mathrm{E}\left[e^{t X}\right]}{e^{t\epsilon}}=1\ge\mathrm{P}(X>\epsilon).
\]
Thus we complete the proof.
\end{proof}
\begin{lemma}
Suppose $X$ is a random variable such that $a \leq X \leq b$. Then for $t\in\mathbb{R}$,
\[
\mathrm{E}\left[e^{t X}\right]\leq e^{t \mu} e^{\frac{t^{2}(b-a)^{2}}{8}}
\]
where $\mu=\mathbb{E}[X]$.
\end{lemma}
\begin{proof}
Since $f(x)=e^{tx}$ is convex, it lies below the chord joining $(a,e^{ta})$ and $(b,e^{tb})$, so for $x\in[a,b]$
\[
e^{t x} \leq \frac{b-x}{b-a} e^{t a}+\frac{x-a}{b-a} e^{t b}.
\]
Now let $Y$ be a random variable with $a\le Y\le b$ and $\mathrm{E}[Y]=0$ (so $a\le 0\le b$), and set $\lambda=-a/(b-a)$. Then
\[
\mathrm{E}[e^{t Y}] \leq \mathrm{E}\left[\frac{b-Y}{b-a} e^{ta}+\frac{Y-a}{b-a} e^{t b}\right]=\frac{be^{t a}-ae^{t b}}{b-a} =(1-\lambda) e^{-t\lambda(b-a)}+\lambda e^{t(1-\lambda)(b-a)}.
\]
Suppose $u=t(b-a)$ and
\[
g(u)=\ln\left((1-\lambda) e^{-\lambda u}+\lambda e^{(1-\lambda)u}\right)=-\lambda u+\ln\left(1-\lambda+\lambda e^u \right),\qquad u>0.
\]
Taylor's theorem implies that there exists $\xi\in [0,u]$ such that (noting $g(0)=g^{\prime}(0)=0$)
\[
g(u)=g(0)+g^{\prime}(0) u+\frac{g^{\prime \prime}(\xi)}{2 !} u^{2}=\frac{(1-\lambda)\lambda e^\xi}{2(1-\lambda+\lambda e^\xi )^2}u^2.
\]
Since
\[
(1-\lambda+\lambda e^\xi )^2\ge 4(1-\lambda)\lambda e^\xi\implies \frac{4(1-\lambda)\lambda e^\xi}{(1-\lambda+\lambda e^\xi )^2}\le1,
\]
it follows that $g(u)\le\frac{1}{8}u^2$. Thus
\[
\mathrm{E}[e^{t Y}]\le e^{g(u)}\le e^{\frac{1}{8}u^2}=e^{\frac{t^2(b-a)^2}{8}}.
\]
Taking $Y=X-\mu$, we conclude that
\[
\mathrm{E}\left[e^{t X}\right]\leq e^{t \mu} e^{\frac{t^{2}(b-a)^{2}}{8}}.
\]
\end{proof}
\begin{theorem}[Hoeffding's Inequality]
Let $X_{1}, \dots, X_{n}$ be i.i.d.\ random variables such that $\mathrm{E}\left[X_{i}\right]=\mu$ and $a \leq X_{i} \leq b$. Then, for any $\epsilon>0$,
\[
\mathrm{P}\left(\left|\bar{X}_{n}-\mu\right| \geq \epsilon\right) \leq 2 e^{-2 n \epsilon^{2} /(b-a)^{2}}.
\]
\end{theorem}
\begin{proof}
Since $a \leq X_{i} \leq b$, we have $a -\mu\leq X_i-\mu \leq b-\mu$, and Lemma 2 gives $\mathrm{E}\left[e^{s(X_i-\mu)}\right]\le e^{s^{2}(b-a)^{2}/8}$ for all $s\in\mathbb{R}$. Applying Lemma 1 to $\bar{X}_{n}-\mu=\frac{1}{n}\sum_{i=1}^{n}(X_i-\mu)$ and using independence,
\[
\mathrm{P}\left(\bar{X}_{n}-\mu\geq \epsilon\right) \leq \inf _{t \geq 0} \frac{\prod_{i=1}^{n}\mathrm{E}\left[e^{\frac{t}{n}(X_i-\mu)}\right]}{e^{t\epsilon}}\le\inf _{t \geq 0} e^{\frac{t^{2}(b-a)^{2}}{8n}-t\epsilon}=e^{-\frac{2 n \epsilon^{2}}{(b-a)^{2}}},
\]
where the infimum is attained at $t=4n\epsilon/(b-a)^{2}$. The same bound holds for $\mathrm{P}\left(\mu-\bar{X}_{n}\geq \epsilon\right)$ by applying the argument to $-X_1,\cdots,-X_n$, and the union bound yields the stated inequality.
\end{proof}
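A quick Monte Carlo sanity check of the inequality (a Python sketch with uniform variables on $[0,1]$, so $a=0$, $b=1$ and $\mu=1/2$):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, eps, trials = 100, 0.1, 20_000
X = rng.uniform(0, 1, size=(trials, n))                # a = 0, b = 1, mu = 0.5
tail = np.mean(np.abs(X.mean(axis=1) - 0.5) >= eps)    # empirical tail probability
bound = 2 * np.exp(-2 * n * eps ** 2)                  # Hoeffding bound, (b - a)^2 = 1
print(tail, bound)                                     # tail should not exceed bound
\end{verbatim}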
\end{document}