forked from optsuite/optlib
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNesterovAccelerationSecond.lean
More file actions
453 lines (430 loc) · 20.1 KB
/
NesterovAccelerationSecond.lean
File metadata and controls
453 lines (430 loc) · 20.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/-
Copyright (c) 2024 Yuxuan Wu, Chenyi Li. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Yuxuan Wu, Chenyi Li
-/
import Convex.Function.Proximal
import Mathlib.Algebra.Order.Field.Basic
/-!
# NesterovAccelerationSecond
## Main results
This file mainly concentrates on the second version of Nesterov algorithm for composite optimization problems.
We prove the O(1 / k ^ 2) rate for this algorithm.
-/
local notation "⟪" x ", " y "⟫" => @inner ℝ _ _ x y
section Nesterov_acceleration
variable {E : Type*} [NormedAddCommGroup E] [InnerProductSpace ℝ E] [CompleteSpace E]
variable [ProperSpace E]
variable {x0 : E}
variable {f h : E → ℝ} {f' : E → E}
open Set Real
class Nesterov_second (f h : E → ℝ) (f' : E → E) (x0 : E) :=
(l : NNReal) (hl : l > (0 : ℝ)) (x y : ℕ → E) (z : ℕ+ → E) (t γ : ℕ → ℝ)
(h₁ : ∀ x : E, HasGradientAt f (f' x) x) (convf: ConvexOn ℝ Set.univ f)
(h₂ : LipschitzWith l f') (convh : ConvexOn ℝ univ h)
(oriy : y 0 = x 0) (oriγ : γ 1 = 1) (initial : x0 = x 0)
(cond : ∀ n : ℕ+, (1 - γ (n + 1)) * (t (n + 1)) / (γ (n + 1)) ^ 2 ≤ (t n) / (γ n) ^ 2)
(tbound : ∀ k, (0 < t k) ∧ t k ≤ 1 / l) (γbound : ∀ n, (0 < γ n) ∧ γ n ≤ 1)
(update1 : ∀ (k : ℕ+), z k = (1 - γ k) • (x (k - 1)) + γ k • (y (k - 1)))
(update2 : ∀ (k : ℕ+), prox_prop ((t k / γ k) • h) (y (k - 1) - (t k / γ k) • (f' (z k))) (y k))
(update3 : ∀ (k : ℕ+), x k = (1 - γ k) • (x (k - 1)) + γ k • y k)
variable {alg : Nesterov_second f h f' x0}
variable {xm : E} (minφ : IsMinOn (f + h) Set.univ xm)
theorem Nesterov_second_convergence :
∀ (k : ℕ), f (alg.x (k + 1)) + h (alg.x (k + 1)) - f xm - h xm ≤
(alg.γ (k + 1)) ^ 2 / (2 * alg.t (k + 1)) * ‖x0 - xm‖ ^ 2 := by
let φ := fun z : E ↦ f z + h z
have φdef : ∀ z : E, φ z = f z + h z := by simp
have h1 : ∀ k : ℕ+, alg.γ k • (alg.y (k - 1) - alg.y k) - alg.t k • (f' (alg.z k))
∈ (SubderivAt (alg.t k • h) (alg.y k)) := by
intro k; obtain h1 := alg.update2 k
rw [prox_iff_subderiv] at h1
have upd2 := @SubderivAt.pos_smul _ _ _ ((alg.t k / alg.γ k) • h) (alg.y k) (alg.γ k) (alg.γbound k).1
rw [← smul_assoc, smul_eq_mul, mul_div, mul_comm, mul_div_cancel] at upd2
rw [upd2]
use (alg.y (↑k - 1) - (alg.t ↑k / alg.γ ↑k) • f' (alg.z k) - alg.y ↑k)
constructor
. exact h1
. simp
rw [sub_right_comm, smul_sub, ← smul_assoc, smul_eq_mul, mul_div, mul_comm, mul_div_cancel]
linarith [(alg.γbound k).1]
linarith [(alg.γbound k).1]
apply ConvexOn.smul
apply div_nonneg
linarith [(alg.tbound k).1]
linarith [(alg.γbound k).1]
exact alg.convh
have hieq1 : ∀ w : E, ∀ k : ℕ+, (alg.t k) * h w ≥ (alg.t k) * h (alg.y k) + ⟪alg.γ k •
(alg.y (k - 1) - alg.y k) - alg.t k • (f' (alg.z k)), w - (alg.y k)⟫ := by
intro w k
specialize h1 k
rw [← mem_SubderivAt, HasSubgradientAt] at h1
specialize h1 w; simp at h1; linarith [h1]
have cvxineq : ∀ k : ℕ+, h (alg.x k) ≤ (1 - alg.γ k) * h (alg.x (k - 1)) + alg.γ k * h (alg.y k) := by
intro k; obtain convh := alg.convh
rw [convexOn_iff_forall_pos] at convh;
rcases convh with ⟨_, fall⟩
have mem1 : (alg.x (k - 1)) ∈ univ := by simp
have mem2 : alg.y k ∈ univ := by simp
by_cases eq1 : alg.γ k = 1
. simp [eq1]
obtain update3 := alg.update3 k
simp [eq1] at update3
rw [update3]
. push_neg at eq1
have pos : 1 - alg.γ k > 0 := by
apply lt_iff_le_and_ne.mpr
constructor
. linarith [(alg.γbound k).2]
. contrapose eq1
push_neg at *
linarith [eq1]
specialize fall mem1 mem2 pos ((alg.γbound k).1) (by linarith)
rw [← (alg.update3 k)] at fall
apply fall
have bsc1 : ∀ k : ℕ, (alg.t k / alg.γ k) > 0 := by
intro k
apply div_pos
linarith [(alg.tbound k).1]
linarith [(alg.γbound k).1]
have hieq2 : ∀ w : E, ∀ k : ℕ+, alg.γ k * h (alg.y k) ≤ alg.γ k * h w - (alg.γ k / alg.t k) *
⟪alg.γ k • (alg.y (k - 1) - alg.y k) - alg.t k • (f' (alg.z k)), w - (alg.y k)⟫ := by
intro w k
rw [← mul_div_right_comm, ← mul_div, ← mul_sub]
apply (mul_le_mul_right (bsc1 k)).mp
rw [mul_comm, ← mul_assoc, div_mul_cancel]
rw [mul_assoc]
nth_rw 3 [mul_comm]
rw [← mul_assoc, mul_div_left_comm, div_self, mul_one]
specialize hieq1 w k
rw [mul_sub, mul_div_left_comm, div_self, mul_one]
linarith [hieq1]
linarith [(alg.tbound k).1]
linarith [(alg.γbound k).1]
linarith [(alg.γbound k).1]
have hieq3 : ∀ w : E, ∀ k : ℕ+, h (alg.x k) ≤
(1 - alg.γ k) * h (alg.x (k - 1)) + alg.γ k * h w - (alg.γ k / alg.t k)
* ⟪alg.γ k • (alg.y (k - 1) - alg.y k) - alg.t k • (f' (alg.z k)), w - (alg.y k)⟫ := by
intro w k
have ax := add_le_add (hieq2 w k) (cvxineq k)
linarith [ax]
have hieq4 : ∀ k : ℕ+, f (alg.x k) ≤ f (alg.z k) + ⟪f' (alg.z k), alg.x k - alg.z k⟫ +
alg.l / 2 * ‖alg.x k - alg.z k‖ ^ 2 := by
intro k
apply lipschitz_continuos_upper_bound' alg.h₁ alg.h₂
have hieq5 : ∀ k : ℕ+, f (alg.x k) ≤ f (alg.z k) + ⟪f' (alg.z k), alg.x k - alg.z k⟫
+ 1 / (2 * alg.t k) * ‖alg.x k - alg.z k‖ ^ 2 := by
intro k
apply le_trans (hieq4 k)
simp only [add_le_add_iff_left]
by_cases nm0 : ‖alg.x ↑k - alg.z k‖ ^ 2 = 0
. rw [nm0]
simp
. push_neg at nm0
have ax : ‖alg.x ↑k - alg.z k‖ ^ 2 > 0 := by
apply lt_of_le_of_ne
simp
symm
apply nm0
apply (mul_le_mul_right ax).mpr
have lc2 : alg.l / 2 > (0 : ℝ) := by linarith [alg.hl]
have tc2 : (2 * alg.t ↑k) > 0 := by linarith [(alg.tbound k).1]
rw [one_div]
apply (le_inv lc2 tc2).mpr
rw [← one_div, ← div_mul, mul_comm]
linarith [(alg.tbound k).2]
have hieq6 : ∀ k : ℕ+, f (alg.x k) ≤ f (alg.z k) + ⟪f' (alg.z k), (1 - alg.γ k) • alg.x (k - 1)
+ alg.γ k • alg.y k - alg.z k⟫ + ((alg.γ k) ^ 2 / (2 * alg.t k)) * ‖alg.y k - alg.y (k - 1)‖ ^ 2 := by
intro k
have ax1 : alg.x ↑k - alg.z k = alg.γ k • (alg.y k - alg.y (k - 1)) := by
rw [(alg.update1 k), (alg.update3 k)]
rw [add_sub_add_left_eq_sub, smul_sub]
specialize hieq5 k
nth_rw 2 [ax1] at hieq5
nth_rw 2 [(alg.update3 k)] at hieq5
rw [norm_smul, mul_pow, ← mul_assoc, Real.norm_eq_abs] at hieq5
have ax : ∀ xl : ℝ, |xl| ^ 2 = xl ^ 2 := by
intro xl
simp only [sq_abs]
rw [ax (alg.γ k), div_mul_eq_mul_div, one_mul] at hieq5
apply hieq5
have hieq7 : ∀ k : ℕ+, f (alg.x k) ≤ (1 - alg.γ k) * f (alg.x (k - 1))
+ alg.γ k * (f (alg.z k) + ⟪f' (alg.z k), alg.y k - alg.z k⟫)
+ ((alg.γ k) ^ 2 / (2 * alg.t k)) * ‖alg.y k - alg.y (k - 1)‖ ^ 2 := by
intro k
calc
f (alg.x k) ≤ f (alg.z k) + ⟪f' (alg.z k), (1 - alg.γ k) • alg.x (k - 1) + alg.γ k • alg.y k - alg.z k⟫
+ ((alg.γ k) ^ 2 / (2 * alg.t k)) * ‖alg.y k - alg.y (k - 1)‖ ^ 2 := by
apply hieq6
_ = (1 - alg.γ k) * (f (alg.z k) + ⟪f' (alg.z k), alg.x (k - 1) - alg.z k⟫) + alg.γ k
* (f (alg.z k) + ⟪f' (alg.z k), alg.y k - alg.z k⟫)
+ ((alg.γ k) ^ 2 / (2 * alg.t k)) * ‖alg.y k - alg.y (k - 1)‖ ^ 2 := by
simp only [add_left_inj]
have ax : (1 - alg.γ ↑k) • alg.x (↑k - 1) + alg.γ ↑k • alg.y ↑k - alg.z k =
(1 - alg.γ k) • (alg.x (k - 1) - alg.z k) + alg.γ k • (alg.y k - alg.z k) := by
rw [smul_sub, smul_sub, sub_add_eq_add_sub, ← add_sub_assoc, sub_sub, ← add_smul]
simp only [add_sub_cancel'_right, one_smul]
rw [ax]
nth_rw 1 [inner_add_right]
rw [inner_smul_right, inner_smul_right, mul_add, mul_add]
ring
_ ≤ (1 - alg.γ k) * (f (alg.x (k - 1))) + alg.γ k * (f (alg.z k) + ⟪f' (alg.z k), alg.y k - alg.z k⟫)
+ ((alg.γ k) ^ 2 / (2 * alg.t k)) * ‖alg.y k - alg.y (k - 1)‖ ^ 2 := by
simp only [add_le_add_iff_right]
by_cases eq1 : alg.γ k = 1
. simp [eq1]
. push_neg at eq1
have pos : 1 - alg.γ k > 0 := by
apply lt_iff_le_and_ne.mpr
constructor
. linarith [(alg.γbound k).2]
. contrapose eq1
push_neg at *
linarith [eq1]
apply (mul_le_mul_left pos).mpr
apply Convex_first_order_condition' (alg.h₁ (alg.z k)) alg.convf
simp
simp
specialize hieq3 xm
have hieqmajor : ∀ k : ℕ+, φ (alg.x k) - φ xm - (1 - alg.γ k) * (φ (alg.x (k - 1)) - φ xm) ≤
(alg.γ k) ^ 2 / (2 * alg.t k) * (‖alg.y (k - 1) - xm‖ ^ 2 - ‖alg.y k - xm‖ ^ 2) := by
intro k
specialize hieq3 k
specialize hieq7 k
calc
_ = φ (alg.x k) - (1 - alg.γ k) * φ (alg.x (k - 1)) - alg.γ k * φ xm := by
ring
_ ≤ alg.γ ↑k * h xm - alg.γ ↑k / alg.t ↑k * ⟪alg.γ ↑k • (alg.y (↑k - 1) - alg.y ↑k)
- alg.t ↑k • f' (alg.z k), xm - alg.y ↑k⟫
+ alg.γ ↑k * (f (alg.z k) + ⟪f' (alg.z k), alg.y ↑k - alg.z k⟫_ℝ)
+ alg.γ ↑k ^ 2 / (2 * alg.t ↑k) * ‖alg.y ↑k - alg.y (↑k - 1)‖ ^ 2 - alg.γ k * φ xm := by
have ax := add_le_add hieq3 hieq7
rw [φdef, φdef, mul_add]
apply sub_le_sub_right
apply sub_le_iff_le_add'.mpr
linarith [ax]
_ = - alg.γ ↑k * f xm - alg.γ ↑k / alg.t ↑k * ⟪alg.γ ↑k • (alg.y (↑k - 1) - alg.y ↑k)
- alg.t ↑k • f' (alg.z k), xm - alg.y ↑k⟫
+ alg.γ ↑k * (f (alg.z k) + ⟪f' (alg.z k), alg.y ↑k - alg.z k⟫_ℝ)
+ alg.γ ↑k ^ 2 / (2 * alg.t ↑k) * ‖alg.y ↑k - alg.y (↑k - 1)‖ ^ 2 := by
rw [φdef]
ring
_ ≤ - alg.γ ↑k * (f (alg.z k) + ⟪f' (alg.z k), xm - alg.z k⟫) - alg.γ ↑k / alg.t ↑k
* ⟪alg.γ ↑k • (alg.y (↑k - 1) - alg.y ↑k) - alg.t ↑k • f' (alg.z k), xm - alg.y ↑k⟫
+ alg.γ ↑k * (f (alg.z k) + ⟪f' (alg.z k), alg.y ↑k - alg.z k⟫_ℝ)
+ alg.γ ↑k ^ 2 / (2 * alg.t ↑k) * ‖alg.y ↑k - alg.y (↑k - 1)‖ ^ 2 := by
simp
have gpos : alg.γ k > 0 := by exact (alg.γbound k).1
apply (mul_le_mul_left gpos).mpr
apply Convex_first_order_condition' (alg.h₁ (alg.z k)) alg.convf
simp
simp
_ = (alg.γ ↑k) ^ 2 / (2 * alg.t k) * (‖alg.y (k - 1) - xm‖ ^ 2 - ‖alg.y k - xm‖ ^ 2) := by
rw [sub_add_eq_add_sub, neg_mul, ← sub_eq_neg_add, ← mul_sub, ← sub_sub,
← sub_add_eq_add_sub, sub_self, zero_add]
rw [← inner_sub_right, ← sub_add, (sub_add_eq_add_sub _ xm _), sub_add_cancel]
rw [inner_sub_left, mul_sub, ← sub_add, sub_add_eq_add_sub, inner_smul_left, conj_trivial]
rw [← mul_assoc, div_mul_cancel, ← mul_add, ← inner_add_right]
rw [sub_add_sub_cancel, sub_self, inner_zero_right, mul_zero, zero_sub]
rw [inner_smul_left, conj_trivial, ← mul_assoc, div_mul_eq_mul_div, ← pow_two]
have ax : alg.γ ↑k ^ 2 / alg.t ↑k = alg.γ ↑k ^ 2 / (2 * alg.t ↑k) * 2 := by
rw [div_mul, mul_comm, mul_div_cancel]
norm_num
rw [ax, ← sub_eq_neg_add, mul_assoc, ← mul_sub]
apply mul_eq_mul_left_iff.mpr
left
rw [← real_inner_self_eq_norm_sq, ← inner_smul_right]
nth_rw 4 [← neg_sub]
rw [inner_neg_left, sub_neg_eq_add, ← inner_add_right, smul_sub,
add_comm, add_sub, sub_add]
nth_rw 2 [two_smul]
rw [add_sub_cancel]
have sqsub : ∀ a b : E, ‖a‖ ^ 2 - ‖b‖ ^ 2 = ⟪- a - b, b - a⟫ := by
intro a b
rw [neg_sub_left, ← neg_sub a b, add_comm, inner_neg_left, inner_neg_right, neg_neg]
rw [inner_add_left, inner_sub_right, inner_sub_right, real_inner_comm a b,
add_sub, sub_add_cancel]
rw [real_inner_self_eq_norm_sq, real_inner_self_eq_norm_sq]
rw [sqsub, real_inner_comm, neg_sub, ← sub_add _ _ xm, add_comm _ xm, add_sub, add_sub]
rw [sub_sub, add_comm, ← sub_sub, two_smul]
rw [← sub_add, sub_add_eq_add_sub, sub_add_cancel]
linarith [(alg.tbound k).1]
have decrease : ∀ k : ℕ+, alg.t (k + 1) / (alg.γ (k + 1) ^ 2) * (φ (alg.x (k + 1)) - φ xm)
+ 1 / 2 * ‖alg.y (k + 1) - xm‖ ^ 2 ≤
alg.t k / (alg.γ k ^ 2) * ((φ (alg.x k)) - φ xm) + 1 / 2 * ‖alg.y k - xm‖ ^ 2 := by
intro k
apply le_add_neg_iff_add_le.mp
rw [neg_eq_zero_sub, add_zero_sub]
nth_rw 2 [add_sub_assoc]
rw [add_comm (alg.t ↑k / alg.γ ↑k ^ 2 * (φ (alg.x ↑k) - φ xm)) _]
apply add_neg_le_iff_le_add.mp
rw [neg_eq_zero_sub, add_zero_sub]
calc
_ ≤ alg.t (↑k + 1) / alg.γ (↑k + 1) ^ 2 * (φ (alg.x (↑k + 1)) - φ xm) - (1 - alg.γ (k + 1))
* (alg.t (k + 1)) / (alg.γ (k + 1)) ^ 2 * (φ (alg.x k) - φ xm) := by
apply sub_le_sub_left
apply mul_le_mul
exact alg.cond k
apply le_of_eq
simp
apply sub_nonneg.mpr
apply minφ
simp
apply div_nonneg
linarith [(alg.tbound k).1]
apply pow_two_nonneg
_ = (alg.t (↑k + 1)) / (alg.γ (↑k + 1)) ^ 2 * (φ (alg.x (↑k + 1)) - φ xm
- (1 - alg.γ (↑k + 1)) * (φ (alg.x k) - φ xm)) := by
rw [mul_comm _ (alg.t (↑k + 1)), mul_div_right_comm, mul_assoc, ← mul_sub]
_ ≤ 1 / 2 * ‖alg.y k - xm‖ ^ 2 - 1 / 2 * ‖alg.y (k + 1) - xm‖ ^ 2 := by
specialize hieqmajor (k + 1)
simp only [PNat.add_coe, PNat.one_coe, add_tsub_cancel_right] at hieqmajor
have ax : alg.t (k + 1) / alg.γ (k + 1) ^ 2 > 0 := by
apply div_pos
linarith [(alg.tbound (k + 1)).1]
rw [pow_two]
apply mul_pos
linarith [(alg.γbound (k + 1)).1]
linarith [(alg.γbound (k + 1)).1]
have ine := (mul_le_mul_iff_of_pos_right ax).mpr hieqmajor
rw [mul_comm, mul_right_comm, ← div_div, mul_div, div_mul_cancel,
div_right_comm, div_self, mul_sub (1 / 2) _] at ine
apply ine
symm
apply ne_of_lt
rw [pow_two]
apply mul_pos
linarith [(alg.γbound (k + 1)).1]
linarith [(alg.γbound (k + 1)).1]
linarith [(alg.tbound (k + 1)).1]
have bound (k : ℕ) : alg.t (k + 1) / (alg.γ (k + 1) ^ 2) * (φ (alg.x (k + 1)) - φ xm)
+ 1 / 2 * ‖alg.y (k + 1) - xm‖ ^ 2 ≤
alg.t 1 / (alg.γ 1 ^ 2) * (φ (alg.x 1) - φ xm) + 1 / 2 * ‖alg.y 1 - xm‖ ^ 2 := by
induction' k with k ik
. simp only [Nat.zero_eq]
apply le_of_eq
simp
have ine := decrease (Nat.toPNat' (k + 1))
simp only [Nat.toPNat'_coe, add_pos_iff, zero_lt_one, or_true, ↓reduceIte] at ine
apply le_trans ine
apply ik
specialize hieqmajor 1
intro k
have ax : ∀ n : ℕ, alg.t (n + 1) / alg.γ (n + 1) ^ 2 > 0 := by
intro n
apply div_pos
linarith [(alg.tbound (n + 1)).1]
rw [pow_two]
apply mul_pos
linarith [(alg.γbound (n + 1)).1]
linarith [(alg.γbound (n + 1)).1]
apply (mul_le_mul_iff_of_pos_left (ax (k))).mp
rw [← mul_assoc, mul_div, div_mul_cancel, ← div_div, div_right_comm, div_self]
simp only [PNat.one_coe, le_refl, tsub_eq_zero_of_le] at hieqmajor
rw [alg.oriγ, sub_self, zero_mul, sub_zero] at hieqmajor
calc
_ = alg.t (k + 1) / alg.γ (k + 1) ^ 2 * (φ (alg.x (k + 1)) - φ xm) := by
rw [sub_sub, ← φdef, ← φdef]
_ ≤ alg.t (k + 1) / alg.γ (k + 1) ^ 2 * (φ (alg.x (k + 1)) - φ xm) + 1 / 2 * ‖alg.y (k + 1) - xm‖ ^ 2 := by
simp
_ ≤ alg.t 1 / alg.γ 1 ^ 2 * (φ (alg.x 1) - φ xm) + 1 / 2 * ‖alg.y 1 - xm‖ ^ 2 := by
exact bound k
_ ≤ 1 / 2 * ‖x0 - xm‖ ^ 2:= by
specialize ax 0
rw [zero_add] at ax
have ieq := (mul_le_mul_iff_of_pos_left ax).mpr hieqmajor
apply (add_le_add_iff_right (-(1 / 2 * ‖alg.y 1 - xm‖ ^ 2))).mp
rw [← zero_sub, add_sub, add_sub, add_zero, add_zero, add_sub_cancel]
apply le_trans ieq
rw [alg.oriγ]
rw [one_pow, div_one, ← mul_assoc, mul_div, mul_one, ← div_div, div_right_comm, div_self]
rw [alg.oriy, ← alg.initial, mul_sub]
linarith [(alg.tbound 1).1]
linarith [(alg.tbound (k + 1)).1]
rw [pow_two]
symm
apply ne_of_lt
apply mul_pos
linarith [(alg.γbound (k + 1)).1]
linarith [(alg.γbound (k + 1)).1]
end Nesterov_acceleration
section Nesterov_second_fix_stepsize
variable {E : Type*} [NormedAddCommGroup E] [InnerProductSpace ℝ E] [CompleteSpace E]
variable [ProperSpace E]
variable {f h : E → ℝ} {f' : E → E} {x0 : E}
open Set Real PNat
class Nesterov_second_fix_stepsize (f h: E → ℝ) (f' : E → E) (x0 : E) :=
(l : NNReal) (hl : l > (0 : ℝ)) (x y : ℕ → E) (z : ℕ+ → E) (t γ : ℕ → ℝ)
(h₁ : ∀ x : E, HasGradientAt f (f' x) x) (convf: ConvexOn ℝ Set.univ f)
(h₂ : LipschitzWith l f') (convh : ConvexOn ℝ univ h)
(oriy : y 0 = x 0) (oriγ : γ 1 = 1) (initial : x0 = x 0)
(teq : ∀ n : ℕ, t n = 1 / l)
(γeq : ∀ n : ℕ, γ n = if n = 0 then (1 / (2 : ℝ)) else (2 : ℝ) / (1 + n))
(update1 : ∀ (k : ℕ+), z k = (1 - γ k) • (x (k - 1)) + γ k • (y (k - 1)))
(update2 : ∀ (k : ℕ+), prox_prop ((t k / γ k) • h) (y (k - 1) - (t k / γ k) • (f' (z k))) (y k))
(update3 : ∀ (k : ℕ+), x k = (1 - γ k) • (x (k - 1)) + γ k • y k)
instance {f h : E → ℝ} {f' : E → E} {x0 : E} [p : Nesterov_second_fix_stepsize f h f' x0] :
Nesterov_second f h f' x0 where
l := p.l
h₁ := p.h₁
convf := p.convf
h₂ := p.h₂
convh := p.convh
x := p.x; y := p.y; t := p.t; γ := p.γ;
oriy := p.oriy
oriγ := by simp [p.γeq 1]; norm_num
initial := p.initial
cond := by
intro n; rw [p.teq n, p.teq (n + 1), p.γeq n, p.γeq (n + 1)]; field_simp [p.hl]
rw [← div_div, ← div_div, ← div_div]
repeat apply div_le_div_of_nonneg_right _ (by positivity)
rw [pow_two, ← mul_assoc, mul_div_assoc, div_self, mul_one]
· calc
_ = n ^ 2 + (2 : ℝ) * n := by ring_nf
_ ≤ 1 + (2 : ℝ) * n + n ^ 2 := by linarith
_ = (1 + n) ^ 2 := by rw [add_pow_two]; simp
· linarith
tbound := by
intro k; rw [p.teq k]; simp; exact p.hl
hl := p.hl
γbound := by
intro k;rw [p.γeq]; constructor
· by_cases hk : k = 0
· rw [hk]; simp
simp [hk]; positivity
· by_cases hk : k = 0
rw [hk]; simp; norm_num; push_neg at hk
simp [hk]; rw [div_le_iff (by positivity)]; simp [hk]
have : (k : ℝ) ≥ 1 := by
rw [← Nat.pos_iff_ne_zero, Nat.lt_iff_add_one_le, zero_add] at hk; simp [hk]
linarith
update1 := p.update1
update2 := p.update2
update3 := p.update3
variable {alg : Nesterov_second_fix_stepsize f h f' x0}
variable {xm : E} (minφ : IsMinOn (f + h) univ xm)
theorem Nesterov_second_fix_stepsize_converge:
∀ (k : ℕ), f (alg.x (k + 1)) + h (alg.x (k + 1)) - f xm - h xm ≤
2 * alg.l / (k + 2) ^ 2 * ‖x0 - xm‖ ^ 2 := by
intro k
calc
f (alg.x (k + 1)) + h (alg.x (k + 1)) - f xm - h xm ≤
(alg.γ (k + 1)) ^ 2 / (2 * alg.t (k + 1)) * ‖x0 - xm‖ ^ 2 := by
have h1 :
f (Nesterov_second_fix_stepsize.x f h f' x0 (k + 1)) +
h (Nesterov_second_fix_stepsize.x f h f' x0 (k + 1)) -
f xm - h xm = f (Nesterov_second.x f h f' x0 (k + 1))
+ h (Nesterov_second.x f h f' x0 (k + 1)) - f xm - h xm := rfl
have h2 :
Nesterov_second_fix_stepsize.γ f h f' x0 (k + 1) ^ 2 /
(2 * Nesterov_second_fix_stepsize.t f h f' x0 (k + 1)) *
‖x0 - xm‖ ^ 2 =
Nesterov_second.γ f h f' x0 (k + 1) ^ 2 / (2 * Nesterov_second.t f h f' x0 (k + 1)) *
‖x0 - xm‖ ^ 2 := rfl
rw [h1, h2]; apply Nesterov_second_convergence minφ
_ ≤ 2 * alg.l / (k + 2) ^ 2 * ‖x0 - xm‖ ^ 2 := by
apply mul_le_mul_of_nonneg_right _ (sq_nonneg _)
rw [alg.γeq (k + 1), alg.teq (k + 1)]; field_simp
rw [pow_two, add_comm]; rw [add_assoc, one_add_one_eq_two, ← div_div]
apply le_of_eq; ring_nf
end Nesterov_second_fix_stepsize