%\section{Part II -- Equality constraints: KKT, Newton vs. Gauss–Newton}
\section{Constrained Optimization}
% ==== Equality constraints: KKT, Newton vs. Gauss–Newton ====
\begin{frame}{Equality-constrained minimization: geometry and conditions}
\textbf{Problem.} $\min_{x\in\mathbb{R}^n} f(x)\quad \text{s.t.}\quad C(x)=0,\qquad C:\mathbb{R}^n\to\mathbb{R}^m$.
\medskip
\textbf{Geometric picture.} At an optimum on the manifold $C(x)=0$, the negative gradient must lie in the tangent space:
$$
\grad f(x^\star)\ \perp\ \mathcal{T}_{x^\star}=\{p:\; J_C(x^\star)p=0\}.
$$
Equivalently, the gradient is a linear combination of constraint normals:
$$
\grad f(x^\star)+J_C(x^\star)^{\!T}\lambda^\star=0,\qquad C(x^\star)=0\quad(\lambda^\star\in\mathbb{R}^m).
$$
\medskip
\textbf{Lagrangian.} $L(x,\lambda)=f(x)+\lambda^{\!T}C(x)$.
\end{frame}
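% ==== Added worked example: equality-constrained quadratic ====
\begin{frame}{Worked example: equality-constrained quadratic}
\textit{A concrete instance of the conditions above (illustrative example, not from the course materials).}
For $f(x)=\tfrac12 x^{\!T}Qx+q^{\!T}x$ and linear constraints $C(x)=Ax-b$, stationarity and feasibility are \emph{linear} in $(x,\lambda)$:
$$
\grad f(x)+J_C(x)^{\!T}\lambda=Qx+q+A^{\!T}\lambda=0,\qquad Ax=b
\quad\Longleftrightarrow\quad
\begin{bmatrix}Q & A^{\!T}\\ A & 0\end{bmatrix}
\begin{bmatrix}x\\ \lambda\end{bmatrix}
=\begin{bmatrix}-q\\ b\end{bmatrix}.
$$
One solve of this saddle-point system yields $(x^\star,\lambda^\star)$ exactly; Newton on the KKT system (later slides) reproduces this in a single step.
\end{frame}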
\begin{frame}{A nicer visual explanation/derivation of KKT conditions}
\begin{center}
Quick little whiteboard derivation
\end{center}
\end{frame}
% ==== Slide 1: Picture-first intuition ====
\begin{frame}[t]{Equality constraints: picture first}
\setbeamercovered{invisible}
\textbf{Goal.} Minimize $f(x)$ while staying on the surface $C(x)=0$.
\uncover<2->{\textbf{Feasible set as a surface.} Think of $C(x)=0$ as a smooth surface embedded in $\mathbb{R}^n$ (a manifold).}
\uncover<3->{\textbf{Move without breaking the constraint.} Tangent directions are the ``along-the-surface'' moves that keep $C(x)$ unchanged to first order. Intuitively: tiny steps that slide on the surface.}
\uncover<4->{\textbf{What must be true at the best point.} At $x^\star$, there is no downhill direction that stays on the surface. Equivalently, the usual gradient of $f$ has \emph{no component along the surface}.}
\uncover<5->{\textbf{Normals enter the story.} If the gradient can't point along the surface, it must point \emph{through} it---i.e., it aligns with a combination of the surface's normal directions (one normal per constraint).}
\end{frame}
% ==== Slide 2: From picture to KKT ====
\begin{frame}[t]{From the picture to KKT (equality case)}
\setbeamercovered{invisible}
\textbf{KKT conditions at a regular local minimum (equality only):}
\uncover<1->{\textbf{1) Feasibility:} $C(x^\star)=0$. \emph{(We're on the surface.)}}
\uncover<2->{\textbf{2) Stationarity:} $\nabla f(x^\star) + J_C(x^\star)^{\!T}\lambda^\star = 0$. \emph{(The gradient is a linear combination of the constraint normals.)}}
\uncover<3->{\textbf{Lagrangian viewpoint.} Define $L(x,\lambda)=f(x)+\lambda^{\!T}C(x)$. At a solution, $x^\star$ is a stationary point of $L$ w.r.t.\ $x$ (that’s the stationarity equation), while $C(x^\star)=0$ enforces feasibility.}
\uncover<4->{\textbf{What the multipliers mean.} The vector $\lambda^\star$ tells us how strongly each constraint ``pushes back'' at the optimum; it also measures the sensitivity of the optimal value to small changes in the constraints (see the sketch on the next slide).}
\end{frame}
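% ==== Added sketch: why multipliers measure sensitivity ====
\begin{frame}{Why multipliers measure sensitivity (sketch)}
\textit{A short justification of the sensitivity claim above (standard result; sketched under smoothness and regularity assumptions).}
Perturb the constraints to $C(x)=\epsilon$ and let $p(\epsilon)=f\big(x^\star(\epsilon)\big)$ be the optimal value. Using stationarity $\grad f(x^\star)=-J_C(x^\star)^{\!T}\lambda^\star$ and $C\big(x^\star(\epsilon)\big)=\epsilon$:
$$
\frac{dp}{d\epsilon}
=\grad f(x^\star)^{\!T}\frac{dx^\star}{d\epsilon}
=-\lambda^{\star T} J_C(x^\star)\,\frac{dx^\star}{d\epsilon}
=-\lambda^{\star T}\frac{d}{d\epsilon}\,C\big(x^\star(\epsilon)\big)
=-\lambda^{\star T}.
$$
Each $\lambda_i^\star$ is thus (minus) the rate of change of the optimal value per unit relaxation of constraint $i$.
\end{frame}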
\begin{frame}{KKT system for equalities (first-order necessary conditions)}
\textbf{KKT (FOC).}
$$
\grad_x L(x,\lambda)=\grad f(x)+J_C(x)^{\!T}\lambda=0,\qquad \grad_\lambda L(x,\lambda)=C(x)=0.
$$
\textbf{Solve by Newton on KKT:} linearize both optimality and feasibility:
$$
\begin{bmatrix}
\hess f(x) + \sum_{i=1}^m \lambda_i\,\hess C_i(x) & J_C(x)^{\!T}\\[2pt]
J_C(x) & 0
\end{bmatrix}
\begin{bmatrix}\Delta x\\ \Delta\lambda\end{bmatrix}
=-
\begin{bmatrix}
\grad f(x)+J_C(x)^{\!T}\lambda\\ C(x)
\end{bmatrix}.
$$
\textit{Notes.} This is a symmetric \emph{saddle-point} system; typical solves use block elimination (Schur complement) or sparse factorizations.
\end{frame}
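% ==== Added illustration: one Newton-on-KKT step in Julia ====
\begin{frame}[fragile]{Newton on KKT: a minimal Julia sketch}
\footnotesize
\textit{A self-contained sketch of the linearized KKT step above (illustrative; uses \texttt{ForwardDiff} for derivatives; the toy problem and names are ours, not the notebook's).}
\begin{verbatim}
using LinearAlgebra, ForwardDiff

f(x) = x[1]^2 + 2x[2]^2          # toy objective
C(x) = [x[1] + x[2] - 1.0]       # equality constraint C(x) = 0

function newton_kkt_step(x, lam)
    g  = ForwardDiff.gradient(f, x)              # grad f(x)
    Jc = ForwardDiff.jacobian(C, x)              # J_C(x)
    # Hessian of the Lagrangian at fixed lam:
    H  = ForwardDiff.hessian(z -> f(z) + dot(lam, C(z)), x)
    n, m = length(x), length(lam)
    K  = [H Jc'; Jc zeros(m, m)]                 # saddle-point KKT matrix
    d  = K \ (-[g + Jc' * lam; C(x)])            # Newton step
    return x + d[1:n], lam + d[n+1:end]
end

x, lam = [2.0, 0.0], [0.0]
x, lam = newton_kkt_step(x, lam)  # quadratic + linear: exact in one step
\end{verbatim}
\end{frame}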
\begin{frame}{Move to Julia Code}
\begin{center}
\textbf{Quick Demo of Julia Notebook: part2\_eq\_constraints.ipynb}
\end{center}
\end{frame}
\begin{frame}{Numerical practice: Newton on KKT}
\setbeamercovered{invisible}
\textbf{When it works best.}
\begin{itemize}
\item Near a regular solution with $J_{C}(x^\star)$ full row rank and positive-definite reduced Hessian.
\item With a globalization (line search on a merit function) and mild regularization for robustness.
\end{itemize}
% --- Part 2: appears on the 2nd click only ---
\uncover<2->{%
\textbf{Common safeguards.} \emph{(A code sketch follows on the next slide.)}
\begin{itemize}
\item \emph{Regularize} the $(1,1)$ block to ensure a good search direction (e.g., add $\beta I$).
\item \emph{Merit/penalty} line search to balance feasibility vs.\ optimality during updates.
\item \emph{Scaling} constraints to improve conditioning of the KKT system.
\end{itemize}
}
\end{frame}
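% ==== Added illustration: safeguards in code ====
\begin{frame}[fragile]{Safeguards in code: regularization + merit line search}
\footnotesize
\textit{A sketch of two safeguards from the previous slide, reusing \texttt{f}, \texttt{C}, and the derivative calls from the earlier sketch (the values of \texttt{beta} and \texttt{rho} are illustrative choices, not the notebook's).}
\begin{verbatim}
merit(x; rho = 10.0) = f(x) + rho * norm(C(x), 1)  # l1 merit fn

function safeguarded_step(x, lam; beta = 1e-6)
    g  = ForwardDiff.gradient(f, x)
    Jc = ForwardDiff.jacobian(C, x)
    H  = ForwardDiff.hessian(z -> f(z) + dot(lam, C(z)), x)
    n, m = length(x), length(lam)
    K  = [(H + beta * I) Jc'; Jc zeros(m, m)]  # regularized block
    d  = K \ (-[g + Jc' * lam; C(x)])
    dx, dlam = d[1:n], d[n+1:end]
    t = 1.0                # crude backtracking on the merit function
    while merit(x + t * dx) > merit(x) && t > 1e-8
        t /= 2
    end
    return x + t * dx, lam + t * dlam
end
\end{verbatim}
\end{frame}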
\begin{frame}{Gauss--Newton vs. full Newton on KKT}
\uncover<1->{
\textbf{Full Newton Hessian of the Lagrangian:}\quad
$\nabla_{xx}^2 L(x,\lambda) = \nabla^2 f(x) + \sum_{i=1}^m \lambda_i\, \nabla^2 C_i(x)$
}
\vspace{0.6em}
\uncover<2->{
\textbf{Gauss--Newton approximation:} drop the \emph{constraint-curvature} term
$\sum_{i=1}^m \lambda_i\, \nabla^2 C_i(x)$:
\begin{align*}
H_{\text{GN}}(x) &\approx \nabla^2 f(x).
\end{align*}
}
\uncover<3->{
\textbf{Trade-offs (high level).}
\begin{itemize}
\item \emph{Full Newton:} fewer iterations near the solution, but each step is costlier and can be less robust far from it.
\item \emph{Gauss--Newton:} cheaper per step and often more stable; may need more iterations, but often wins in wall-clock time.
\end{itemize}
}
\end{frame}
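% ==== Added illustration: the Gauss--Newton (1,1) block ====
\begin{frame}[fragile]{Gauss--Newton step: a one-line change}
\footnotesize
\textit{Relative to the Newton sketch earlier, Gauss--Newton changes only the $(1,1)$ block of the KKT matrix (sketch; same assumed names as before).}
\begin{verbatim}
# Full Newton: Hessian of the Lagrangian (constraint curvature included)
H_newton = ForwardDiff.hessian(z -> f(z) + dot(lam, C(z)), x)

# Gauss-Newton: drop sum_i lam_i * hess(C_i); keep only hess(f)
H_gn = ForwardDiff.hessian(f, x)

K = [H_gn Jc'; Jc zeros(m, m)]   # rest of the KKT step is unchanged
\end{verbatim}
\end{frame}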
% ==== Inequalities & KKT: complementarity ====
\begin{frame}{Inequality-constrained minimization and KKT}
\textbf{Problem.} $\min_{x\in\mathbb{R}^n} f(x)\quad\text{s.t.}\quad c(x)\ge 0,\qquad c:\mathbb{R}^n\to\mathbb{R}^p$.
\textbf{KKT conditions (first-order).}
$$
\begin{aligned}
&\text{Stationarity:} && \grad f(x)-J_c(x)^{\!T}\lambda=0,\\
&\text{Primal feasibility:} && c(x)\ge 0,\\
&\text{Dual feasibility:} && \lambda\ge 0,\\
&\text{Complementarity:} && \lambda^{\!T}c(x)=0\quad(\text{i.e., }\lambda_i c_i(x)=0\ \forall i).
\end{aligned}
$$
\textbf{Interpretation.}
\begin{itemize}
\item \emph{Active} constraints: $c_i(x)=0 \Rightarrow \lambda_i$ may be nonzero (the constraint acts like an equality; example on the next slide).
\item \emph{Inactive} constraints: $c_i(x)>0 \Rightarrow \lambda_i=0$ (no influence on optimality).
\end{itemize}
\end{frame}
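% ==== Added tiny example: active vs. inactive constraint ====
\begin{frame}{Tiny example: active vs.\ inactive constraint}
\textit{A one-dimensional check of the KKT conditions above (illustrative example).}
Take $f(x)=x^2$ with the single constraint $c(x)=x-1\ge 0$. The KKT conditions read
$$
2x-\lambda=0,\qquad \lambda\,(x-1)=0,\qquad \lambda\ge 0,\qquad x-1\ge 0.
$$
The unconstrained minimizer $x=0$ is infeasible, so the constraint is \emph{active}: $x^\star=1$, $\lambda^\star=2>0$ (it ``presses back''). With $c(x)=x+1\ge 0$ instead, $x^\star=0$ is feasible with $c(x^\star)=1>0$, so the constraint is \emph{inactive} and $\lambda^\star=0$.
\end{frame}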
\begin{frame}{Complementarity in plain English (and why Newton is tricky)}
\footnotesize
\textbf{What $\lambda_i c_i(x)=0$ means.}
\begin{itemize}
\item Tight constraint ($c_i=0$) $\Rightarrow$ can press back ($\lambda_i\ge0$).
\item Loose constraint ($c_i>0$) $\Rightarrow$ no force ($\lambda_i=0$).
\end{itemize}
\textbf{Why naive Newton fails.}
\begin{itemize}
\item Complementarity = nonsmooth + inequalities ($\lambda\ge0$, $c(x)\ge0$).
\item Equality-style Newton can violate nonnegativity or bounce across the boundary.
\end{itemize}
\textbf{Two main strategies (preview).}
\begin{itemize}
\item \emph{Active-set:} guess the active set $\Rightarrow$ solve an equality-constrained subproblem, update the set.
\item \emph{Barrier / primal-dual interior-point / augmented Lagrangian:} smooth or relax complementarity, take damped Newton steps, drive the relaxation $\to 0$ (sketch on the next slide).
\end{itemize}
\end{frame}
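% ==== Added preview: smoothing complementarity ====
\begin{frame}{Preview: relaxing complementarity}
\textit{A sketch of the smoothing idea named above (the standard interior-point relaxation).}
For fixed $\mu>0$, replace the nonsmooth conditions $\lambda_i c_i(x)=0$, $\lambda\ge 0$, $c(x)\ge 0$ by the smooth \emph{relaxed} system
$$
\grad f(x)-J_c(x)^{\!T}\lambda=0,\qquad \lambda_i\,c_i(x)=\mu\ \ \forall i,\qquad \lambda>0,\quad c(x)>0,
$$
apply damped Newton while keeping $\lambda$ and $c(x)$ strictly positive, and drive $\mu\to 0$ to recover the KKT conditions.
\end{frame}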