Skip to content

Commit 031bdb2

Browse files
authored
chore: add comments to Compil.lean (#15)
This PR adds comments to the Compil.lean file taken from the Leroy's original source code written in Rocq.
1 parent f768f2c commit 031bdb2

1 file changed

Lines changed: 225 additions & 13 deletions

File tree

LeroyCompilerVerificationCourse/Compil.lean

Lines changed: 225 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,69 @@ Authors: Wojciech Różowski
66

77
import LeroyCompilerVerificationCourse.Imp
88

9+
/-
10+
2. The target language: a stack machine
11+
12+
Our compiler will translate IMP to the language of a simple stack
13+
machine. This machine is similar to the "Reverse Polish Notation"
14+
used by old HP pocket calculators: a stack holds intermediate
15+
results, and individual instructions pop arguments off the stack and
16+
push results back on the stack.
17+
-/
18+
19+
/-
20+
2.1 Instruction set
21+
22+
Here is the instruction set of the machine:
23+
-/
924
@[grind] inductive instr : Type where
10-
| Iconst (n : Int)
11-
| Ivar (x : ident)
12-
| Isetvar (x : ident)
13-
| Iadd
14-
| Iopp
15-
| Ibranch (d : Int)
16-
| Ibeq (d1 : Int) (d0 : Int)
17-
| Ible (d1 : Int) (d0 : Int)
18-
| Ihalt
25+
| Iconst (n : Int) /-r push the integer `n` -/
26+
| Ivar (x : ident) /-r push the current value of variable `x` -/
27+
| Isetvar (x : ident) /-r pop an integer and assign it to variable `x` -/
28+
| Iadd /-r pop two integers, push their sum -/
29+
| Iopp /-r pop one integer, push its opposite -/
30+
| Ibranch (d : Int) /-r skip forward or backward `d` instructions -/
31+
| Ibeq (d1 : Int) (d0 : Int) /-r pop two integers, skip `d1` instructions if equal, `d0` if not equal -/
32+
| Ible (d1 : Int) (d0 : Int) /-r pop two integer, skip `d1` instructions if less or equal, `d0` if greater -/
33+
| Ihalt /-r stop execution -/
1934
deriving Repr
2035

36+
/-
37+
A piece of machine code is a list of instructions.
38+
The length (number of instructions) of a piece of code.
39+
-/
2140
@[grind] def codelen (c : List instr) : Int := c.length
2241

42+
/-
43+
2.2 Operational semantics
44+
45+
The machine operates on a code `C` (a fixed list of instructions)
46+
and three variable components:
47+
- a program counter `pc`, denoting a position in `C`
48+
- an evaluation stack, containing integers
49+
- a store assigning integer values to variables.
50+
-/
51+
2352
def stack : Type := List Int
2453

2554
def config : Type := Int × stack × store
2655

56+
/-
57+
`instr_at C pc = .some i` if `i` is the instruction at position `pc` in `C`.
58+
-/
2759
@[grind] def instr_at (C : List instr) (pc : Int) : Option instr :=
2860
match C with
2961
| [] => .none
3062
| i :: C' => if pc = 0 then .some i else instr_at C' (pc - 1)
3163

64+
/-
65+
The semantics of the machine is given in small-step style as a transition system:
66+
a relation between machine configurations "before" and "after" execution
67+
of the instruction at the current program point.
68+
The transition relation is parameterized by the code `C` for the whole program.
69+
There is one transition for each kind of instruction,
70+
except `.Ihalt`, which has no transition.
71+
-/
3272
@[grind] inductive transition (C : List instr) : config → config → Prop where
3373
| trans_const : ∀ pc stk s n,
3474
instr_at C pc = .some (.Iconst n) →
@@ -66,29 +106,69 @@ def config : Type := Int × stack × store
66106
transition C (pc , n2 :: n1 :: stk, s)
67107
(pc', stk , s)
68108

109+
/-
110+
As usual with small-step semantics, we form sequences of machine transitions
111+
to define the behavior of a code. Zero, one or several transitions
112+
correspond to the reflexive transitive closure of relation `transition C`.
113+
-/
69114
@[grind] def transitions (C : List instr) : config → config → Prop :=
70115
star (transition C)
71116

117+
/-
118+
We always start with `pc = 0` and an empty evaluation stack.
119+
We stop successfully if `pc` points to an `.Ihalt` instruction
120+
and the evaluation stack is empty.
121+
-/
72122
def machine_terminates (C : List instr) (s_init : store) (s_final : store) : Prop :=
73123
∃ pc, transitions C (0, [], s_init) (pc, [], s_final)
74124
∧ instr_at C pc = .some .Ihalt
75125

126+
/-
127+
The machine can also run forever, making infinitely many transitions.
128+
-/
76129
def machine_diverges (C : List instr) (s_init : store) : Prop :=
77130
infseq (transition C) (0, [], s_init)
78131

132+
/-
133+
Yet another possibility is that the machine gets stuck after some transitions.
134+
-/
79135
def machine_goes_wrong (C : List instr) (s_init : store) : Prop :=
80136
∃ pc stk s,
81137
transitions C (0, [], s_init) (pc, stk, s)
82138
∧ irred (transition C) (pc, stk, s)
83139
∧ (instr_at C pc ≠ .some .Ihalt ∨ stk ≠ [])
84140

141+
/-
142+
3. The compilation scheme
143+
144+
We now define the compilation of IMP expressions and commands to
145+
sequence of machine instructions.
146+
The code for an arithmetic expression `a` executes in sequence (no
147+
branches), and deposits the value of `a` at the top of the stack.
148+
This is the familiar translation to "reverse Polish notation".
149+
150+
The only twist here is that the machine has no "subtract" instruction.
151+
However, it can compute the difference `a - b` by adding `a` to the
152+
opposite of `b`.
153+
-/
85154
@[grind] def compile_aexp (a : aexp) : List instr :=
86155
match a with
87156
| .CONST n => .Iconst n :: []
88157
| .VAR x => .Ivar x :: []
89158
| .PLUS a1 a2 => (compile_aexp a1) ++ (compile_aexp a2) ++ (.Iadd :: [])
90159
| .MINUS a1 a2 => compile_aexp a1 ++ compile_aexp a2 ++ (.Iopp :: .Iadd :: [])
91160

161+
/-
162+
For Boolean expressions `b`, we could produce code that deposits `1` or `0`
163+
at the top of the stack, depending on whether `b` is true or false.
164+
The compiled code for `.IFTHENELSE` and `.WHILE` commands would, then,
165+
compare this 0/1 value against 0 and branch to the appropriate piece of code.
166+
167+
However, it is simpler and more efficient to compile a Boolean expression `b`
168+
to a piece of code that directly jumps `d1` or `d0` instructions forward,
169+
depending on whether `b` is true or false. The actual value of `b` is
170+
never computed as an integer, and the stack is unchanged.
171+
-/
92172
@[grind] def compile_bexp (b : bexp) (d1 : Int) (d0 : Int) : List instr :=
93173
match b with
94174
| .TRUE => if d1 = 0 then [] else .Ibranch d1 :: []
@@ -101,6 +181,13 @@ def machine_goes_wrong (C : List instr) (s_init : store) : Prop :=
101181
let code1 := compile_bexp b1 0 (codelen code2 + d0)
102182
code1 ++ code2
103183

184+
/-
185+
The code for a command `c`:
186+
- updates the store (the values of variables) as prescribed by `c`
187+
- preserves the stack
188+
- finishes "at the end" of the generated code: the next instruction
189+
executed is the instruction that will follow the generated code.
190+
-/
104191
@[grind] def compile_com (c : com) : List instr :=
105192
match c with
106193
| .SKIP =>
@@ -123,6 +210,10 @@ def machine_goes_wrong (C : List instr) (s_init : store) : Prop :=
123210
++ code_body
124211
++ .Ibranch (- (codelen code_test + codelen code_body + 1)) :: []
125212

213+
/-
214+
The code for a program `p` (a command) is similar, but terminates
215+
cleanly on an `.Ihalt` instruction.
216+
-/
126217
def compile_program (p : com) : List instr :=
127218
compile_com p ++ .Ihalt :: []
128219

@@ -133,11 +224,23 @@ def compile_program (p : com) : List instr :=
133224
def smart_Ibranch (d : Int) : List instr :=
134225
if d = 0 then [] else .Ibranch d :: []
135226

227+
/-
228+
4. First compiler correctness proofs
229+
230+
To reason about the execution of compiled code, we need to consider
231+
code sequences `C2` that are at position `pc` in a bigger code
232+
sequence `C = C1 ++ C2 ++ C3`. The following predicate
233+
`code_at C pc C2` does just this.
234+
-/
136235
@[grind] inductive code_at : List instr → Int → List instr → Prop where
137236
| code_at_intro : ∀ C1 C2 C3 pc,
138237
pc = codelen C1 ->
139238
code_at (C1 ++ C2 ++ C3) pc C2
140239

240+
/-
241+
We show a number of simple lemmas about `instr_at` and `code_at`,
242+
and annotate them with `@[grind]`, so that `grind` can use them automatically.
243+
-/
141244
@[grind =] theorem codelen_cons :
142245
∀ i c, codelen (i :: c) = codelen c + 1 := by grind
143246

@@ -232,6 +335,19 @@ theorem code_at_head :
232335
@[grind] theorem code_at_to_instr_at : code_at C pc (c1 ++ i :: c2) → instr_at C (pc + codelen c1) = .some i := by
233336
grind
234337

338+
/-
339+
4.1 Correctness of generated code for expressions.
340+
341+
Remember the informal specification we gave for the code generated
342+
for an arithmetic expression `a`. It should
343+
- execute in sequence (no branches)
344+
- deposit the value of `a` at the top of the stack
345+
- preserve the variable state.
346+
347+
We now prove that the code `compile_aexp a` fulfills this contract.
348+
The proof is a nice induction on the structure of `a`.
349+
-/
350+
235351
theorem compile_aexp_correct (C : List instr) (s : store) (a : aexp) (pc : Int) (stk : stack) :
236352
code_at C pc (compile_aexp a) →
237353
transitions C (pc, stk, s) (pc + codelen (compile_aexp a), aeval s a :: stk, s) := by
@@ -275,7 +391,16 @@ theorem compile_aexp_correct (C : List instr) (s : store) (a : aexp) (pc : Int)
275391
· have := @code_at_to_instr_at C pc (compile_aexp a1 ++ compile_aexp a2 ++ [instr.Iopp])
276392
have := @transition.trans_add
277393
grind
278-
-- Miss 5
394+
395+
/-
396+
The proof for the compilation of Boolean expressions is similar.
397+
We recall the informal specification for the code generated by
398+
[compile_bexp b d1 d0]: it should
399+
- skip `d1` instructions if `b` evaluates to true,
400+
`d0` if `b` evaluates to false
401+
- leave the stack unchanged
402+
- leave the store unchanged.
403+
-/
279404
theorem compile_bexp_correct (C : List instr) (s : store) (b : bexp) (d1 d0 : Int) (pc : Int) (stk : stack) (h : code_at C pc (compile_bexp b d1 d0)) :
280405
transitions C
281406
(pc, stk, s)
@@ -344,7 +469,9 @@ theorem compile_bexp_correct (C : List instr) (s : store) (b : bexp) (d1 d0 : In
344469
exact b2_ih
345470
case neg isFalse =>
346471
grind
347-
472+
/-
473+
4.2 Correctness of generated code for commands, terminating case.
474+
-/
348475
theorem compile_com_correct_terminating (s s' : store) (c : com) (h₁ : cexec s c s') :
349476
∀ C pc stk, code_at C pc (compile_com c) →
350477
transitions C
@@ -453,6 +580,36 @@ theorem compile_com_correct_terminating (s s' : store) (c : com) (h₁ : cexec s
453580
apply ih2
454581
grind
455582

583+
/-
584+
7. Full proof of compiler correctness
585+
586+
We would like to strengthen the correctness result above so that it
587+
is not restricted to terminating source programs, but also applies to
588+
source program that diverge. To this end, we abandon the big-step
589+
semantics for commands and switch to the small-step semantics with
590+
continuations. We then show a simulation theorem, establishing that
591+
every transition of the small-step semantics in the source program
592+
is simulated (in a sense to be made precise below) by zero, one or
593+
several transitions of the machine executing the compiled code for
594+
the source program.
595+
596+
Our first task is to relate configurations `(c, k, s)` of the small-step
597+
semantics with configurations `(C, pc, stk, s)` of the machine.
598+
We already know how to relate a command `c` with the machine code,
599+
using the `codeseq_at` predicate. What needs to be defined is a relation
600+
between the continuation `k` and the machine code.
601+
602+
Intuitively, when the machine finishes executing the generated code for
603+
command `c`, that is, when it reaches the program point
604+
`pc + codelen(compile_com c)`, the machine should continue by executing
605+
instructions that perform the pending computations described by
606+
continuation `k`, then reach an `.Ihalt` instruction to stop cleanly.
607+
608+
We formalize this intution by the following inductive predicate
609+
`compile_cont C k pc`, which states that, starting at program point `pc`,
610+
there are instructions that perform the computations described in `k`
611+
and reach an `.Ihalt` instruction.
612+
-/
456613
inductive compile_cont (C : List instr) : cont -> Int -> Prop where
457614
| ccont_stop : ∀ pc,
458615
instr_at C pc = .some .Ihalt ->
@@ -474,13 +631,51 @@ inductive compile_cont (C : List instr) : cont -> Int -> Prop where
474631
pc' = pc + 1 + d ->
475632
compile_cont C k pc' ->
476633
compile_cont C k pc
477-
634+
/-
635+
Then, a configuration `(c,k,s)` of the small-step semantics matches
636+
a configuration `(C, pc, stk, s')` of the machine if the following conditions hold:
637+
- The stores are identical: `s' = s`.
638+
- The machine stack is empty: `stk = []]`.
639+
- The machine code at point `pc` is the compiled code for `c`:
640+
`code_at C pc (compile_com c)`.
641+
- The machine code at point `pc + codelen (compile_com c)` matches continuation
642+
`k`, in the sense of `compile_cont` above.
643+
-/
478644
inductive match_config (C : List instr) : com × cont × store -> config -> Prop where
479645
| match_config_intro : ∀ c k st pc,
480646
code_at C pc (compile_com c) ->
481647
compile_cont C k (pc + codelen (compile_com c)) ->
482648
match_config C (c, k, st) (pc, [], st)
483649

650+
/-
651+
We are now ready to prove the expected simulation property.
652+
Since some transitions in the source command correspond to zero transitions
653+
in the compiled code, we need a simulation diagram of the "star" kind.
654+
match_config
655+
c / k / st ----------------------- machconfig
656+
| |
657+
| | + or ( * and |c',k'| < |c,k| )
658+
| |
659+
v v
660+
c' / k' / st' ----------------------- machconfig'
661+
match_config
662+
663+
Note the stronger conclusion on the right:
664+
- either the virtual machine does one or several transitions
665+
- or it does zero, one or several transitions, but the size of the `c,k`
666+
pair decreases strictly.
667+
668+
It would be equivalent to state:
669+
- either the virtual machine does one or several transitions
670+
- or it does zero transitions, but the size of the `c,k` pair decreases strictly.
671+
672+
However, the formulation above, with the "star" case, is often more convenient.
673+
674+
Finding an appropriate "anti-stuttering" measure is a bit of a black art.
675+
After trial and error, we find that the following measure works. It is
676+
the sum of the sizes of the command `c` under focus and all the commands
677+
appearing in the continuation `k`.
678+
-/
484679
def com_size (c : com) : Nat :=
485680
match c with
486681
| .SKIP => 1
@@ -489,7 +684,6 @@ def com_size (c : com) : Nat :=
489684
| .IFTHENELSE _ c1 c2 => (com_size c1 + com_size c2 + 1)
490685
| .WHILE _ c1 => (com_size c1 + 1)
491686

492-
493687
theorem com_size_nonzero : ∀ c, (com_size c > 0) := by
494688
intro c
495689
fun_induction com_size with grind
@@ -504,6 +698,9 @@ def measure' (impconf : com × cont × store) : Nat :=
504698
match impconf with
505699
| (c, k, _) => (com_size c + cont_size k)
506700

701+
/-
702+
A few technical lemmas to help with the simulation proof.
703+
-/
507704
theorem compile_cont_Kstop_inv (C : List instr) (pc : Int) (s : store) :
508705
compile_cont C .Kstop pc →
509706
∃ pc',
@@ -560,6 +757,9 @@ theorem match_config_skip (C : List instr) (k : cont) (s : store) (pc : Int) (H
560757
· cases H <;> grind
561758
· grind
562759

760+
/-
761+
At last, we can state and prove the simulation diagram.
762+
-/
563763
theorem simulation_step :
564764
∀ C impconf1 impconf2 machconf1,
565765
step impconf1 impconf2 ->
@@ -731,6 +931,13 @@ theorem simulation_step :
731931
· exact w₂.1
732932
· exact w₂.2
733933

934+
/-
935+
The hard work is done! Nice consequences will follow.
936+
937+
First, we get an alternate proof of `compile_program_correct_terminating`,
938+
using the continuation semantics instead of the big-step semantics
939+
to characterize termination of the source program.
940+
-/
734941
theorem simulation_steps :
735942
∀ C impconf1 impconf2, star step impconf1 impconf2 ->
736943
∀ machconf1, match_config C impconf1 machconf1 ->
@@ -796,6 +1003,11 @@ theorem compile_program_correct_terminating_2 :
7961003
exact D
7971004
· exact E
7981005

1006+
/-
1007+
Second, and more importantly, we get a proof of semantic preservation
1008+
for diverging source programs: if the program makes infinitely many steps,
1009+
the generated code makes infinitely many machine transitions.
1010+
-/
7991011
theorem simulation_infseq_inv :
8001012
∀ C n impconf1 machconf1,
8011013
infseq step impconf1 -> match_config C impconf1 machconf1 ->

0 commit comments

Comments
 (0)