diff --git a/CHANGELOG.md b/CHANGELOG.md index fd25ed6..7aa2bc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,12 @@ Prose references a version as `v0.X.Y`; headings stay bare `[0.X.Y]`. ### Added +- Codegen for `if` / `else` / `else if` chains (`compiler/src/codegen.c`). The v0.3 codegen rejected `if` statements; this lifts the restriction. Cleave-compiled WASM modules can now do branching. New `emit_if` lowers to WASM's native `if 0x40 ... else ... end` instruction sequence with an empty block type (if-as-expression remains gated on the parser supporting it). `else if` chains nest naturally as `if ... else (if ...)`. Closes #49. +- Cond coercion to i32. WASM's `if` instruction consumes an i32, but Cleave-compiled expressions usually leave i64 on the stack (literals, identifier reads, calls, arithmetic). Codegen now emits `i32.wrap_i64` (0xA7) before the `if` byte when needed. Comparisons (`==`, `!=`, `<`, `>`, `<=`, `>=`) already produce i32 per the WASM spec, so no wrap is emitted in that case; a new `leaves_i32_on_stack` helper distinguishes the two. +- New `leaves_value_on_stack` helper that returns false for assignment expressions. Used in both the if-branch and fn-body code paths so a block whose trailing expression is `x = expr` does not get a spurious `drop` or default-zero. Fixes a latent bug that existed before this PR: a fn whose body ended in an assignment produced invalid WASM. The pre-existing tests did not catch it because every example always had a non-assignment trailing expression. +- 6 codegen tests covering: if without else, if with else, else-if chain, comparison-cond (no wrap), bool-literal cond (wrap), and the assignment-branch-no-drop regression that surfaced during this PR's development. +- `codegen_if_heavy` bench case: 5-way else-if chain. +- New opcodes registered in `wasm.h`: `WASM_OP_BLOCK 0x02`, `WASM_OP_LOOP 0x03`, `WASM_OP_IF 0x04`, `WASM_OP_ELSE 0x05`, `WASM_OP_I32_WRAP_I64 0xA7`. New constant `WASM_BLOCKTYPE_EMPTY 0x40`. 
- CI gains four lower-cost gates: - **`wasm-validate` on emitted modules.** The compiler job installs wabt, compiles every example marked "yes" in `examples/README.md`, and runs `wasm-validate` on the output. Catches structurally invalid modules that pass our byte-level codegen unit tests. - **Determinism check.** The compiler job compiles `examples/counter-mvp.cv` twice and `cmp -s` the outputs; non-zero exit on any difference. Guards against nondeterminism from HashMap iteration order, pointer addresses, build timestamps. diff --git a/compiler/bench/codegen_bench.c b/compiler/bench/codegen_bench.c index cc64f22..913e5bb 100644 --- a/compiler/bench/codegen_bench.c +++ b/compiler/bench/codegen_bench.c @@ -31,6 +31,22 @@ static const char *ARITHMETIC_HEAVY = " }\n" "}"; +/* Exercises the if/else codegen path (issue #49): an else-if chain + * that drives the cond-coercion check (comparison conds skip the wrap) and nested-if logic. */ +static const char *IF_HEAVY = + "module M {\n" + " state x: u64\n" + " fn classify(n: u64) -> u64 {\n" + " if n == 0 { x = 1 }\n" + " else if n == 1 { x = 2 }\n" + " else if n == 2 { x = 3 }\n" + " else if n == 3 { x = 4 }\n" + " else if n == 4 { x = 5 }\n" + " else { x = 6 }\n" + " x\n" + " }\n" + "}"; + /* Exercises the let-binding codegen path (issue #44): eight local * variables, the trailing expression reads several of them. 
The * pre-scan walks 8 stmts; the locals declaration is one i64 group @@ -92,6 +108,18 @@ int main(void) { wasm_free(&bin); }); + BENCH("codegen_if_heavy", { + Lexer lex; lexer_init(&lex, IF_HEAVY); + Parser p; parser_init(&p, &lex); + AstNode *prog = parser_parse_program(&p); + TypeChecker tc; typecheck_init(&tc, DEV_NULL); + (void)typecheck_program(&tc, prog); + Codegen cg; cg_init(&cg, DEV_NULL); + WasmBuf bin; + (void)cg_compile_program(&cg, prog, &bin); + wasm_free(&bin); + }); + fclose(DEV_NULL); return 0; } diff --git a/compiler/include/wasm.h b/compiler/include/wasm.h index eae351e..6d5db31 100644 --- a/compiler/include/wasm.h +++ b/compiler/include/wasm.h @@ -96,6 +96,10 @@ enum { /* Opcodes used by Cleave codegen. Names mirror the spec's text format. */ enum { + WASM_OP_BLOCK = 0x02, + WASM_OP_LOOP = 0x03, + WASM_OP_IF = 0x04, + WASM_OP_ELSE = 0x05, WASM_OP_END = 0x0B, WASM_OP_RETURN = 0x0F, WASM_OP_CALL = 0x10, @@ -124,7 +128,17 @@ enum { WASM_OP_I64_REM_S = 0x81, WASM_OP_I64_REM_U = 0x82, WASM_OP_I64_AND = 0x83, - WASM_OP_I64_OR = 0x84 + WASM_OP_I64_OR = 0x84, + WASM_OP_I32_WRAP_I64 = 0xA7 /* drops upper 32 bits of an i64; used to + coerce a bool-shaped i64 (0/1) into an + i32 for the if instruction. NOTE(review): a cond with only high bits set would wrap to 0 -- confirm conds are always 0/1 */ +}; + +/* Block-type byte that immediately follows a block / loop / if instruction. + * `empty` means no result on the stack at the end of the block (the only + * case the v0 codegen needs since if-as-expression is not yet supported). */ +enum { + WASM_BLOCKTYPE_EMPTY = 0x40 }; /* Export descriptor kinds. 
*/ diff --git a/compiler/src/codegen.c b/compiler/src/codegen.c index 7a65a92..7e5bccf 100644 --- a/compiler/src/codegen.c +++ b/compiler/src/codegen.c @@ -26,11 +26,15 @@ * - state = expr -> i32.const <slot>; <expr>; call state_set * - local = expr -> <expr>; local.set <idx> * - module-fn call -> args...; call <fn_idx> + * - if / else / else-if -> <cond>; if; <then>; else; <else>; end (issue #49) * - block trailing expr -> becomes the function's return value * * What we deliberately reject * --------------------------- - * - control flow (if/match/loops) -- see issues #43, #49 + * - match (issue #43); loops (no issue yet, deliberate) + * - if-as-expression (`let x = if c { 1 } else { 2 }`): grammar does + * not parse if at expression position today; lift when the parser + * does * - sum types (`Result`, `Option`) -- see issue #48 * - identifiers we cannot resolve * @@ -333,6 +337,93 @@ static void emit_call(CodegenCtx *ctx, WasmBuf *body, AstNode *node) { wasm_write_leb_u32(body, f->fn_index); } +/* Returns 1 if `expr`, when emitted, pushes a value onto the operand + * stack. The only v0 expression that does NOT push is an assignment + * (`x = y`), because both state_set (a void hostcall) and local.set + * consume the value silently. Callers use this to decide whether + * trailing-expression slots and if-branch results need a `drop` or + * fn-body default-zero to balance the stack. */ +static int leaves_value_on_stack(const AstNode *expr) { + if (!expr) return 0; + if (expr->kind == AST_EXPR_BINARY + && expr->as.binary.op.length == 1 + && expr->as.binary.op.start[0] == '=') { + return 0; + } + return 1; +} + +/* Returns 1 if `node`, when emitted, leaves an i32 on the WASM operand + * stack rather than the usual i64. The v0 codegen produces i32 only for + * comparison binary operators (i64.eq, i64.lt_u, etc. all return i32 + * per the WASM spec). Everything else (literals, identifiers, calls, + * arithmetic) leaves i64. 
Used by emit_if to decide whether the cond + * value needs an i32.wrap_i64 coercion before the `if` instruction + * consumes it. */ +static int leaves_i32_on_stack(const AstNode *node) { + if (!node) return 0; + if (node->kind != AST_EXPR_BINARY) return 0; + StrRef op = node->as.binary.op; + if (op.length == 1) { + return op.start[0] == '<' || op.start[0] == '>'; + } + if (op.length == 2) { + return memcmp(op.start, "==", 2) == 0 + || memcmp(op.start, "!=", 2) == 0 + || memcmp(op.start, "<=", 2) == 0 + || memcmp(op.start, ">=", 2) == 0; + } + return 0; +} + +static void emit_if(CodegenCtx *ctx, WasmBuf *body, AstNode *node); +static void emit_block(CodegenCtx *ctx, WasmBuf *body, AstNode *node); + +/* Emit one branch of an if statement. v0 if statements have an empty + * block type, so any value the branch leaves on the stack must be + * dropped to keep the WASM type checker happy. When sum types and + * if-as-expression land, this layer learns to pass the value through + * instead of dropping it. Branch kinds other than a block or a nested if emit nothing. */ +static void emit_if_branch(CodegenCtx *ctx, WasmBuf *body, AstNode *branch) { + if (!branch) return; + if (branch->kind == AST_EXPR_BLOCK) { + emit_block(ctx, body, branch); + /* Drop the trailing result so the block produces no value, but + * only when the result expression actually pushed something. + * Assignments (state_set / local.set) leave the stack empty; + * a `drop` there would underflow. */ + if (leaves_value_on_stack(branch->as.block.result)) { + wasm_write_byte(body, WASM_OP_DROP); + } + } else if (branch->kind == AST_STMT_IF) { + /* `else if` chain: nest another if/else/end. */ + emit_if(ctx, body, branch); + } +} + +static void emit_if(CodegenCtx *ctx, WasmBuf *body, AstNode *node) { + StmtIf *i = &node->as.if_stmt; + + emit_expr(ctx, body, i->cond); + if (!leaves_i32_on_stack(i->cond)) { + /* The condition came out as i64 (bool literal, identifier read, + * call result, etc.). WASM `if` expects an i32, so narrow. 
*/ + wasm_write_byte(body, WASM_OP_I32_WRAP_I64); + } + + wasm_write_byte(body, WASM_OP_IF); + wasm_write_byte(body, WASM_BLOCKTYPE_EMPTY); + + emit_if_branch(ctx, body, i->then_branch); + + if (i->else_branch) { + wasm_write_byte(body, WASM_OP_ELSE); + emit_if_branch(ctx, body, i->else_branch); + } + + wasm_write_byte(body, WASM_OP_END); +} + static void emit_block(CodegenCtx *ctx, WasmBuf *body, AstNode *node) { ExprBlock *b = &node->as.block; for (size_t i = 0; i < b->n_stmts; ++i) { @@ -374,6 +465,8 @@ static void emit_block(CodegenCtx *ctx, WasmBuf *body, AstNode *node) { emit_expr(ctx, body, l->value); wasm_write_byte(body, WASM_OP_LOCAL_SET); wasm_write_leb_u32(body, bind->local_index); + } else if (s->kind == AST_STMT_IF) { + emit_if(ctx, body, s); } else { cg_report(ctx, &s->span, "statement kind %s not yet supported in v0 codegen", @@ -512,10 +605,14 @@ static void emit_fn_body(CodegenCtx *ctx, AstNode *fn_decl, } else { emit_block(ctx, body_out, f->body); /* If the block has no trailing expression and no return, push a - * default 0 so the function's i64 return type is satisfied. */ + * default 0 so the function's i64 return type is satisfied. + * `leaves_value_on_stack` treats an assignment-as-result as + * not-a-value, which is the right call here: a fn that ends + * with `x = expr` needs the synthetic 0 just like one with no + * trailing expression at all. 
*/ ExprBlock *bb = &f->body->as.block; int needs_default = 1; - if (bb->result) needs_default = 0; + if (leaves_value_on_stack(bb->result)) needs_default = 0; for (size_t i = 0; i < bb->n_stmts; ++i) { if (bb->stmts[i]->kind == AST_STMT_RETURN) { needs_default = 0; diff --git a/compiler/tests/codegen_test.c b/compiler/tests/codegen_test.c index a359291..cad4605 100644 --- a/compiler/tests/codegen_test.c +++ b/compiler/tests/codegen_test.c @@ -368,6 +368,131 @@ TEST(test_let_assignment_writes_back_to_local) { wasm_free(&bin); } +/* ============== if / else / else-if (issue #49) ============== */ + +TEST(test_if_without_else_emits_if_blocktype_end) { + /* Simple if with no else. We expect to find: + * if 0x40 (0x04 0x40, empty block type) + * ... then body ... + * end (0x0B) + * No else byte (0x05); only the `if 0x40` pair is asserted below. Use a comparison condition so no i32.wrap_i64 + * coercion is emitted between cond and the if instruction. */ + const char *src = + "module M {\n" + " state x: u64\n" + " fn f() -> u64 { if x == 0 { x = 1 }; x }\n" + "}"; + WasmBuf bin; + ASSERT(compile_source(src, &bin)); + const uint8_t needle[] = { 0x04, 0x40 }; + ASSERT(contains_bytes(&bin, needle, sizeof(needle))); + wasm_free(&bin); +} + +TEST(test_if_with_else_emits_if_else_end) { + /* Look for `if 0x40 ... else (0x05) ... end (0x0B)` substring. */ + const char *src = + "module M {\n" + " state x: u64\n" + " fn f() -> u64 { if x == 0 { x = 1 } else { x = 2 }; x }\n" + "}"; + WasmBuf bin; + ASSERT(compile_source(src, &bin)); + /* The exact then-body bytes for `x = 1` are: + * i32.const 0 (slot) 0x41 0x00 + * i64.const 1 0x42 0x01 + * call 1 (state_set) 0x10 0x01 + * Then else (0x05). */ + const uint8_t needle[] = { + 0x04, 0x40, /* if empty */ + 0x41, 0x00, 0x42, 0x01, 0x10, 0x01, /* then: x = 1 */ + 0x05, /* else */ + }; + ASSERT(contains_bytes(&bin, needle, sizeof(needle))); + wasm_free(&bin); +} + +TEST(test_else_if_chain_nests_inner_if) { + /* `if a { ... } else if b { ... 
}` should emit two `if 0x40` and + * two `end` bytes; the test checks only for at least two `if 0x40`. */ + const char *src = + "module M {\n" + " state x: u64\n" + " fn f() -> u64 { if x == 0 { x = 1 } else if x == 1 { x = 2 } else { x = 3 }; x }\n" + "}"; + WasmBuf bin; + ASSERT(compile_source(src, &bin)); + /* Two `if 0x40` substrings should appear (outer + inner else-if). */ + size_t first = 0, second = 0; + const uint8_t needle[] = { 0x04, 0x40 }; + /* Count occurrences. */ + int hits = 0; + for (size_t i = 0; i + sizeof(needle) <= bin.len; ++i) { + if (memcmp(bin.data + i, needle, sizeof(needle)) == 0) { + if (hits == 0) first = i; + else second = i; + hits++; + } + } + (void)first; (void)second; + ASSERT(hits >= 2); + wasm_free(&bin); +} + +TEST(test_comparison_cond_does_not_need_i32_wrap) { + /* Comparison opcodes (i64.eq etc.) return i32 in WASM, so the cond + * value is already the right shape for `if`. The codegen should NOT + * emit i32.wrap_i64 (0xA7) immediately before the `if 0x40` bytes. */ + const char *src = + "module M {\n" + " fn f(a: u64) -> u64 { if a == 1 { } else { }; 0 }\n" + "}"; + WasmBuf bin; + ASSERT(compile_source(src, &bin)); + /* The sequence `0xA7 0x04 0x40` would indicate a redundant wrap. + * Confirm it does NOT appear. */ + const uint8_t bad[] = { 0xA7, 0x04, 0x40 }; + ASSERT(!contains_bytes(&bin, bad, sizeof(bad))); + wasm_free(&bin); +} + +TEST(test_bool_literal_cond_emits_i32_wrap_i64) { + /* A bool literal (or any non-comparison expression) leaves i64 on + * the stack; the codegen must emit i32.wrap_i64 (0xA7) before the + * if instruction. */ + const char *src = + "module M {\n" + " fn f() -> u64 { if true { } else { }; 0 }\n" + "}"; + WasmBuf bin; + ASSERT(compile_source(src, &bin)); + /* Expect: i64.const 1 (0x42 0x01), then i32.wrap_i64 (0xA7), + * then if (0x04 0x40). 
*/ + const uint8_t needle[] = { 0x42, 0x01, 0xA7, 0x04, 0x40 }; + ASSERT(contains_bytes(&bin, needle, sizeof(needle))); + wasm_free(&bin); +} + +TEST(test_if_with_assignment_branch_does_not_emit_drop) { + /* Regression test: when an if branch is `{ x = 5 }`, the parser + * puts the assignment in block.result (no semicolon, sits right + * before `}`). The branch leaves nothing on the stack because + * state_set is void. The codegen must NOT emit a `drop` (0x1A), + * since that would underflow. */ + const char *src = + "module M {\n" + " state x: u64\n" + " fn f() -> u64 { if 1 == 1 { x = 5 }; x }\n" + "}"; + WasmBuf bin; + ASSERT(compile_source(src, &bin)); + /* If the bug returns, a drop byte (0x1A) appears right after + * `call 1` (state_set, 0x10 0x01) and before `end` (0x0B). */ + const uint8_t bad[] = { 0x10, 0x01, 0x1A }; + ASSERT(!contains_bytes(&bin, bad, sizeof(bad))); + wasm_free(&bin); +} + TEST(test_no_let_bindings_emits_zero_local_groups) { /* Functions with no let bindings should still emit a leading 0 * (zero local groups) in their code section. Regression guard for @@ -416,5 +541,13 @@ int main(void) { RUN(test_let_assignment_writes_back_to_local); RUN(test_no_let_bindings_emits_zero_local_groups); + /* if / else / else-if (issue #49) */ + RUN(test_if_without_else_emits_if_blocktype_end); + RUN(test_if_with_else_emits_if_else_end); + RUN(test_else_if_chain_nests_inner_if); + RUN(test_comparison_cond_does_not_need_i32_wrap); + RUN(test_bool_literal_cond_emits_i32_wrap_i64); + RUN(test_if_with_assignment_branch_does_not_emit_drop); + REPORT(); }