Skip to content

Commit 813d04d

Browse files
levkroppclaude
andcommitted
Fix: peephole phi_coalesce removes param pre-stores across calls
The phi_coalesce peephole pass coalesces register copies like `movq %rdi, %r12; ... movslq %r12d, %r13; movq %r13, %rdi` by eliminating the initial copy and rewriting to use %rdi directly. This is incorrect when a function call between the copy and the copy-back clobbers the caller-saved source register (%rdi). Fix: block coalescing when the source is a caller-saved register and the destination is callee-saved. These copies save values that must survive across function calls. Also adds fallback param pre-store logic for promoted allocas (where mem2reg eliminated the alloca but the register allocator still assigned the promoted value to a callee-saved register). SQLite status: works with CCC_NO_PEEPHOLE=1. Multiple peephole passes still have unsound transformations affecting callee-saved register stores. The phi_coalesce fix addresses one class but other passes (dead store elimination, copy propagation) have similar issues requiring a broader peephole audit. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 04ad1f5 commit 813d04d

3 files changed

Lines changed: 140 additions & 4 deletions

File tree

src/backend/x86/codegen/peephole/passes/dead_code.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,36 @@ pub(super) fn eliminate_dead_reg_moves(store: &LineStore, infos: &mut [LineInfo]
5757
i += 1;
5858
continue;
5959
}
60+
// Don't eliminate moves to callee-saved registers in the prologue area.
61+
// The register allocator assigns param values to callee-saved regs (rbx,
62+
// r12-r15), and the pre-store in emit_store_params copies ABI arg regs
63+
// to them. The dead-move analysis can't see past function calls (which
64+
// are barriers), so it incorrectly considers these writes dead when the
65+
// next visible use is after a call. Callee-saved regs survive calls,
66+
// so their pre-call writes are never truly dead.
67+
// Register encoding: rbx=1 (encoded differently in peephole)
68+
// We check: if the instruction is in the first few lines (prologue area)
69+
// and writes to a callee-saved register, skip elimination.
70+
// A simpler and more robust check: if the destination is a callee-saved
71+
// register (rbx/r12/r13/r14/r15) AND source is an arg register (rdi/rsi/
72+
// rdx/rcx/r8/r9), this is likely a param pre-store — don't eliminate.
73+
// Protect param pre-stores: movq from ABI arg regs to callee-saved regs.
74+
// These save function parameters before calls clobber the arg registers.
75+
// The dead-move scanner can't see past calls (barriers), so it incorrectly
76+
// considers these writes dead. Callee-saved regs survive calls.
77+
// Arg regs: rdi=7, rsi=6, rdx=2, rcx=1, r8=8, r9=9
78+
// Callee-saved: rbx=3, r12=12, r13=13, r14=14, r15=15
79+
{
80+
let trimmed = infos[i].trimmed(store.get(i));
81+
if let Some((src_id, _dst_id)) = parse_reg_to_reg_movq(&infos[i], trimmed) {
82+
let src_is_arg = matches!(src_id, 1 | 2 | 6 | 7 | 8 | 9);
83+
let dst_is_callee = matches!(dst_reg, 3 | 12 | 13 | 14 | 15);
84+
if src_is_arg && dst_is_callee {
85+
i += 1;
86+
continue;
87+
}
88+
}
89+
}
6090

6191
let dst_mask = 1u16 << dst_reg;
6292
let mut dead = false;

src/backend/x86/codegen/peephole/passes/local_patterns.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2111,6 +2111,14 @@ pub(super) fn coalesce_phi_register_copies(
21112111
// Don't coalesce rax(0) or rcx(1) as SRC — they're accumulator regs
21122112
// with heavy implicit use
21132113
if src_family <= 1 || tmp_family <= 1 { i += 1; continue; }
2114+
// Don't coalesce when SRC is a caller-saved register and TMP is callee-saved.
2115+
// Caller-saved registers (rax=0, rcx=1, rdx=2, rsi=6, rdi=7, r8-r11=8-11)
2116+
// get clobbered by function calls. Coalescing away the copy to a callee-saved
2117+
// register loses the value across calls. This is critical for parameter
2118+
// pre-stores (movq %rdi, %r12) that save params before calls.
2119+
let src_is_caller_saved = matches!(src_family, 0 | 1 | 2 | 6 | 7 | 8 | 9 | 10 | 11);
2120+
let tmp_is_callee_saved = matches!(tmp_family, 3 | 5 | 12 | 13 | 14 | 15);
2121+
if src_is_caller_saved && tmp_is_callee_saved { i += 1; continue; }
21142122

21152123
let src_bit = 1u16 << src_family;
21162124
let tmp_bit = 1u16 << tmp_family;

src/backend/x86/codegen/prologue.rs

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -434,16 +434,22 @@ impl X86Codegen {
434434
let has_slot = find_param_alloca(func, i)
435435
.and_then(|(dest, _)| self.state.get_slot(dest.0))
436436
.is_some();
437+
if std::env::var("CCC_DEBUG_PARAM_STORE").is_ok() {
438+
let reg = self.reg_assignments.get(&paramref_dest.0);
439+
eprintln!("[PRE-STORE] param {} paramref_dest={} has_slot={} reg={:?}", i, paramref_dest.0, has_slot, reg);
440+
}
437441
if !has_slot {
438442
if let Some(&phys_reg) = self.reg_assignments.get(&paramref_dest.0) {
439443
// Only pre-store to callee-saved registers (PhysReg 1-5).
440444
// Caller-saved registers (rdi, rsi, r8-r11) cannot be used
441445
// because they overlap with ABI argument registers that
442446
// haven't been saved yet.
443447
let is_callee_saved = phys_reg.0 >= 1 && phys_reg.0 <= 5;
448+
if std::env::var("CCC_DEBUG_PARAM_STORE").is_ok() {
449+
let shared = reg_to_params.get(&phys_reg.0).is_some_and(|u| u.len() > 1);
450+
eprintln!("[PRE-STORE] callee={} shared={} class={:?}", is_callee_saved, shared, class);
451+
}
444452
if is_callee_saved {
445-
// Safety check: if another param's dest is also assigned
446-
// to this register, skip pre-store to avoid conflicts.
447453
let shared = reg_to_params.get(&phys_reg.0)
448454
.is_some_and(|users| users.len() > 1);
449455
if !shared {
@@ -452,15 +458,107 @@ impl X86Codegen {
452458
self.state.out.emit_instr_reg_reg(
453459
" movq", X86_ARG_REGS[reg_idx], dest_reg);
454460
self.state.param_pre_stored.insert(i);
455-
// Track the source arg register for this callee-saved reg
456-
// so register-direct call arg loading can avoid round-trips
457461
self.param_source_regs.insert(phys_reg.0, X86_ARG_REGS[reg_idx]);
458462
} // TODO: handle StackSlot/SSE params
459463
}
460464
}
461465
}
462466
continue;
463467
}
468+
} else {
469+
// DEBUG: dump entry block instructions for this param
470+
if std::env::var("CCC_DEBUG_PARAM_STORE").is_ok() {
471+
if let Some((alloca_dest, _)) = find_param_alloca(func, i) {
472+
eprintln!("[PARAM-STORE] param {} has alloca dest={}, no ParamRef", i, alloca_dest.0);
473+
eprintln!("[PARAM-STORE] has_slot={}", self.state.get_slot(alloca_dest.0).is_some());
474+
for (bi, block) in func.blocks.iter().enumerate() {
475+
for inst in &block.instructions {
476+
match inst {
477+
Instruction::Store { ptr, val, .. } if ptr.0 == alloca_dest.0 =>
478+
eprintln!("[PARAM-STORE] block[{}] Store to alloca: val={:?}", bi, val),
479+
Instruction::Load { ptr, dest, .. } if ptr.0 == alloca_dest.0 =>
480+
eprintln!("[PARAM-STORE] block[{}] Load from alloca: dest={}", bi, dest.0),
481+
Instruction::Copy { dest, src } =>
482+
eprintln!("[PARAM-STORE] block[{}] Copy dest={} src={:?}", bi, dest.0, src),
483+
Instruction::ParamRef { dest, param_idx, .. } =>
484+
eprintln!("[PARAM-STORE] block[{}] ParamRef dest={} idx={}", bi, dest.0, param_idx),
485+
_ => {}
486+
}
487+
}
488+
}
489+
for (&vid, &reg) in self.reg_assignments.iter() {
490+
eprintln!("[PARAM-STORE] reg_assign: val={} -> PhysReg({})", vid, reg.0);
491+
}
492+
}
493+
}
494+
// No ParamRef for this param. The alloca may have been promoted
495+
// by mem2reg, converting Store/Load chains to direct SSA references.
496+
// After promotion, the param value flows through Copy/Cast chains.
497+
//
498+
// The register allocator may have assigned a promoted value to a
499+
// callee-saved register. We must copy the ABI arg register to it
500+
// in the prologue, because ABI registers get clobbered by calls.
501+
//
502+
// Strategy: find the alloca for this param, then search for Store
503+
// instructions that write TO that alloca. The Store's source value
504+
// (which may be a Copy from the original param) tells us which SSA
505+
// value represents the parameter after store-to-load forwarding.
506+
let has_slot = find_param_alloca(func, i)
507+
.and_then(|(dest, _)| self.state.get_slot(dest.0))
508+
.is_some();
509+
if !has_slot {
510+
if let Some((alloca_dest, _)) = find_param_alloca(func, i) {
511+
let alloca_id = alloca_dest.0;
512+
// Collect all SSA values stored to this alloca, then
513+
// propagate through Copy chains to find all derived values.
514+
// Any of these that are register-assigned need the ABI arg
515+
// saved in the prologue.
516+
let mut param_vals: Vec<u32> = Vec::new();
517+
for block in &func.blocks {
518+
for inst in &block.instructions {
519+
if let Instruction::Store { ptr, val, .. } = inst {
520+
if ptr.0 == alloca_id {
521+
if let crate::ir::instruction::Operand::Value(v) = val {
522+
param_vals.push(v.0);
523+
}
524+
}
525+
}
526+
}
527+
}
528+
// Propagate through Copy chains
529+
let mut all_vals: crate::common::fx_hash::FxHashSet<u32> = param_vals.iter().copied().collect();
530+
let mut changed_prop = true;
531+
while changed_prop {
532+
changed_prop = false;
533+
for block in &func.blocks {
534+
for inst in &block.instructions {
535+
if let Instruction::Copy { dest, src: crate::ir::instruction::Operand::Value(v) } = inst {
536+
if all_vals.contains(&v.0) && all_vals.insert(dest.0) {
537+
changed_prop = true;
538+
}
539+
}
540+
}
541+
}
542+
}
543+
// Check if any derived value is callee-saved register-assigned
544+
for &vid in &all_vals {
545+
if let Some(&phys_reg) = self.reg_assignments.get(&vid) {
546+
let is_callee_saved = phys_reg.0 >= 1 && phys_reg.0 <= 5;
547+
if is_callee_saved {
548+
let dest_reg = phys_reg_name(phys_reg);
549+
if let ParamClass::IntReg { reg_idx } = class {
550+
self.state.out.emit_instr_reg_reg(
551+
" movq", X86_ARG_REGS[reg_idx], dest_reg);
552+
self.state.param_pre_stored.insert(i);
553+
self.param_source_regs.insert(phys_reg.0, X86_ARG_REGS[reg_idx]);
554+
break;
555+
}
556+
}
557+
}
558+
}
559+
}
560+
continue;
561+
}
464562
}
465563

466564
let (slot, ty) = if let Some((dest, ty)) = find_param_alloca(func, i) {

0 commit comments

Comments
 (0)