From 319b3cef10f4f0c6dc8adc313523ec5260770bd2 Mon Sep 17 00:00:00 2001 From: Zhijin Zeng Date: Thu, 21 May 2026 14:45:13 +0800 Subject: [PATCH] [RV64_DYNAREC] Modify nat flag register save logic to now save all scratch registers 1. Rename macro to SPILL_NF_REGISTERS, add implementation for LA64 and PPC64LE. 2. Modify nat flag register spill logic to now save all scratch registers. Fix scratch register corruption caused by non-consecutive flags producer and consumer when BOX64_DYNAREC_TRACE is enabled, which introduced by #3876. For example, the `test rax, rax` flags producer stores its flag calculation operands in scratch register `t3`. The next `mov r14, rax` instruction does not use scratch registers, but its associated trace code can still overwrite t3's value. This means we need to reference the flags consumer that is `jz 0x0000003F0000ABC0` to identify which registers require saving. But this is too complicated. So we went with the simpler approach of saving all scratch registers, this won't add noticeable performance overhead in trace mode. ``` [BOX64] 0x3f00009f68: 48 85 C0 test rax, rax [BOX64] 0x3ff7afaef0: 53 emitted opcodes, inst=14, barrier=0 state=3/1(0), set=3F/80, use=0, need=0/80, fuse=1/0, sm=0(0/0), sew@entry=7, sew@exit=7, pred=13 03f00e13 ADDI t3, zero, 0x3f(63) 45ccaa23 SW t3, emu_s9, 0x454(1108) 01087e33 AND t3, rax_a6, rax_a6 47ccb423 SD t3, emu_s9, 0x468(1128) [BOX64] New Instruction x64:0x3f00009f6b, native:0x3ff7afafc4 [BOX64] TRACE ---- [BOX64] n1:0 n2:0 ---- 01f80b37 LUI rip_s6, 0x1f80000(33030144) 005b0b1b ADDIW rip_s6, rip_s6, 0x5(5) 00db1b13 SLLI rip_s6, rip_s6, 0xd(13) f6bb0b13 ADDI rip_s6, rip_s6, 0xffffff6b(-149) ............................................................... ............................................................... ............................................................... 048cb783 LD r9_a5, emu_s9, 0x48(72) 000cb803 LD rax_a6, emu_s9, 0x0(0) 088cbb03 LD rip_s6, emu_s9, 0x88(136) 080cbb83 LD flags_s7, emu_s9, 0x80(128) fdfbfb93 ANDI flags_s7, flags_s7, 0xffffffdf(-33) 006bdf93 SRLI t6, flags_s7, 0x6(6) 020fff93 ANDI t6, t6, 0x20(32) 01fbebb3 OR flags_s7, flags_s7, t6 [BOX64] ---------- [BOX64] 0x3f00009f6b: 49 89 C6 mov r14, rax [BOX64] 0x3ff7afafc4: 54 emitted opcodes, inst=15, barrier=0 state=0/3(0), set=0/0, use=0, need=80/80, fuse=0/1, sm=0(0/0), sew@entry=7, sew@exit=7, pred=14 00080a13 MV r14_s4, rax_a6 [BOX64] New Instruction x64:0x3f00009f6e, native:0x3ff7afb09c [BOX64] TRACE ---- [BOX64] n1:28 n2:0 ---- ff010113 ADDI sp, sp, 0xfffffff0(-16) 01c13023 SD t3, sp, 0x0(0) 01f80b37 LUI rip_s6, 0x1f80000(33030144) 01fbebb3 OR flags_s7, flags_s7, t6 ............................................................... ............................................................... ............................................................... 00013e03 LD t3, sp, 0x0(0) 01010113 ADDI sp, sp, 0x10(16) [BOX64] ---------- [BOX64] 0x3f00009f6e: 0F 84 4C 0C 00 00 jz 0x0000003F0000ABC0 [BOX64] 0x3ff7afb09c: 67 emitted opcodes, inst=16, barrier=2 state=0/3(0), set=0/0, use=0, need=80/80, fuse=1/0, sm=0(0/0), sew@entry=7, sew@exit=7, pred=15, jmp=out 020e1463 BNE t3, zero, 0x28(40) # +10i(0x3ff7afb1a8) 00000013 NOP ``` --- src/dynarec/arm64/dynarec_arm64_private.h | 4 -- src/dynarec/dynarec_native_pass.c | 8 +++- src/dynarec/la64/dynarec_la64_private.h | 30 +++++++++++-- src/dynarec/ppc64le/dynarec_ppc64le_private.h | 30 +++++++++++-- src/dynarec/rv64/dynarec_rv64_private.h | 44 +++++++++---------- 5 files changed, 82 insertions(+), 34 deletions(-) diff --git a/src/dynarec/arm64/dynarec_arm64_private.h b/src/dynarec/arm64/dynarec_arm64_private.h index 477ab5daf4..c00ce4bba4 100644 --- a/src/dynarec/arm64/dynarec_arm64_private.h +++ b/src/dynarec/arm64/dynarec_arm64_private.h @@ -225,10 +225,6 @@ int Table64(dynarec_arm_t *dyn, uint64_t val, int pass); // add a value to tabl void CreateJmpNext(void* addr, void* next); -// TODO: Save and restore the temp register. -#define SAVE_ACTIVE_SCRATCH_REGISTERS do{} while(0); -#define LOAD_ACTIVE_SCRATCH_REGISTERS do{} while(0); - #define GO_TRACE(A, B, s0) \ GETIP(addr); \ MOVx_REG(x1, xRIP); \ diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c index 75798ae6c8..e1852d62e7 100644 --- a/src/dynarec/dynarec_native_pass.c +++ b/src/dynarec/dynarec_native_pass.c @@ -257,11 +257,15 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int if((trace_end == 0) || ((ip >= trace_start) && (ip < trace_end))) { MESSAGE(LOG_DUMP, "TRACE ----\n"); - if (BOX64ENV(dynarec_nativeflags)) SAVE_ACTIVE_SCRATCH_REGISTERS; + #if defined (SPILL_NF_REGISTERS) + if (BOX64ENV(dynarec_nativeflags)) SPILL_NF_REGISTERS; + #endif fpu_reflectcache(dyn, ninst, x1, x2, x3); GO_TRACE(PrintTrace, 1, x5); fpu_unreflectcache(dyn, ninst, x1, x2, x3); - if (BOX64ENV(dynarec_nativeflags)) LOAD_ACTIVE_SCRATCH_REGISTERS; + #if defined (RESTORE_NF_REGISTERS) + if (BOX64ENV(dynarec_nativeflags)) RESTORE_NF_REGISTERS; + #endif MESSAGE(LOG_DUMP, "----------\n"); } } diff --git a/src/dynarec/la64/dynarec_la64_private.h b/src/dynarec/la64/dynarec_la64_private.h index e70be9c261..003f62ee2c 100644 --- a/src/dynarec/la64/dynarec_la64_private.h +++ b/src/dynarec/la64/dynarec_la64_private.h @@ -200,9 +200,33 @@ int Table64(dynarec_la64_t *dyn, uint64_t val, int pass); // add a value to tab void CreateJmpNext(void* addr, void* next); -// TODO: Save and restore the temp register. -#define SAVE_ACTIVE_SCRATCH_REGISTERS do{} while(0); -#define LOAD_ACTIVE_SCRATCH_REGISTERS do{} while(0); +// While we could theoretically traverse forward to find the flags-consuming x86 +// instruction and get the exact scratch registers to save, this is too complicated. +// So we went with the simpler approach of saving all scratch registers, this won't +// add noticeable performance overhead in trace mode. +#define SPILL_NF_REGISTERS \ + do { \ + ADDI_D(xSP, xSP, -64); \ + ST_D(x1, xSP, 0 * 8); \ + ST_D(x2, xSP, 1 * 8); \ + ST_D(x3, xSP, 2 * 8); \ + ST_D(x4, xSP, 3 * 8); \ + ST_D(x5, xSP, 4 * 8); \ + ST_D(x6, xSP, 5 * 8); \ + ST_D(x7, xSP, 6 * 8); \ + } while(0); + +#define RESTORE_NF_REGISTERS \ + do { \ + LD_D(x1, xSP, 0 * 8); \ + LD_D(x2, xSP, 1 * 8); \ + LD_D(x3, xSP, 2 * 8); \ + LD_D(x4, xSP, 3 * 8); \ + LD_D(x5, xSP, 4 * 8); \ + LD_D(x6, xSP, 5 * 8); \ + LD_D(x7, xSP, 6 * 8); \ + ADDI_D(xSP, xSP, 64); \ + } while(0); #define GO_TRACE(A, B, s0) \ GETIP(addr, s0); \ diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_private.h b/src/dynarec/ppc64le/dynarec_ppc64le_private.h index e3a1ab85a2..4dfe101d7c 100644 --- a/src/dynarec/ppc64le/dynarec_ppc64le_private.h +++ b/src/dynarec/ppc64le/dynarec_ppc64le_private.h @@ -218,9 +218,33 @@ int Table64(dynarec_ppc64le_t *dyn, uint64_t val, int pass); // add a value to void CreateJmpNext(void* addr, void* next); -// TODO: Save and restore the temp register. -#define SAVE_ACTIVE_SCRATCH_REGISTERS do{} while(0); -#define LOAD_ACTIVE_SCRATCH_REGISTERS do{} while(0); +// While we could theoretically traverse forward to find the flags-consuming x86 +// instruction and get the exact scratch registers to save, this is too complicated. +// So we went with the simpler approach of saving all scratch registers, this won't +// add noticeable performance overhead in trace mode. +#define SPILL_NF_REGISTERS \ + do { \ + ADDI(xSP, xSP, -64); \ + STD(x1, 0 * 8, xSP); \ + STD(x2, 1 * 8, xSP); \ + STD(x3, 2 * 8, xSP); \ + STD(x4, 3 * 8, xSP); \ + STD(x5, 4 * 8, xSP); \ + STD(x6, 5 * 8, xSP); \ + STD(x7, 6 * 8, xSP); \ + } while(0); + +#define RESTORE_NF_REGISTERS \ + do { \ + LD(x1, 0 * 8, xSP); \ + LD(x2, 1 * 8, xSP); \ + LD(x3, 2 * 8, xSP); \ + LD(x4, 3 * 8, xSP); \ + LD(x5, 4 * 8, xSP); \ + LD(x6, 5 * 8, xSP); \ + LD(x7, 6 * 8, xSP); \ + ADDI(xSP, xSP, 64); \ + } while(0); #define GO_TRACE(A, B, s0) \ GETIP(addr, s0); \ diff --git a/src/dynarec/rv64/dynarec_rv64_private.h b/src/dynarec/rv64/dynarec_rv64_private.h index 416832a2f7..160f6cec9f 100644 --- a/src/dynarec/rv64/dynarec_rv64_private.h +++ b/src/dynarec/rv64/dynarec_rv64_private.h @@ -216,30 +216,30 @@ int Table64(dynarec_rv64_t *dyn, uint64_t val, int pass); // add a value to tab void CreateJmpNext(void* addr, void* next); -#define SAVE_ACTIVE_SCRATCH_REGISTERS \ - do { \ - uint8_t n1 = dyn->insts[ninst].nat_flags_op1; \ - uint8_t n2 = dyn->insts[ninst].nat_flags_op2; \ - if (IS_SCRATCH(n1) || IS_SCRATCH(n2)) { \ - SUBI(xSP, xSP, 16); \ - if (IS_SCRATCH(n1)) \ - SD(n1, xSP, 0); \ - if (n1 != n2 && IS_SCRATCH(n2)) \ - SD(n2, xSP, 8); \ - } \ +// While we could theoretically traverse forward to find the flags-consuming x86 +// instruction and get the exact scratch registers to save, this is too complicated. +// So we went with the simpler approach of saving all scratch registers, this won't +// add noticeable performance overhead in trace mode. +#define SPILL_NF_REGISTERS \ + do { \ + SUBI(xSP, xSP, 6 * 8); \ + SD(x1, xSP, 0 * 8); \ + SD(x2, xSP, 1 * 8); \ + SD(x3, xSP, 2 * 8); \ + SD(x4, xSP, 3 * 8); \ + SD(x5, xSP, 4 * 8); \ + SD(x6, xSP, 5 * 8); \ } while(0); -#define LOAD_ACTIVE_SCRATCH_REGISTERS \ - do { \ - uint8_t n1 = dyn->insts[ninst].nat_flags_op1; \ - uint8_t n2 = dyn->insts[ninst].nat_flags_op2; \ - if (IS_SCRATCH(n1) || IS_SCRATCH(n2)) { \ - if (IS_SCRATCH(n1)) \ - LD(n1, xSP, 0); \ - if (n1 != n2 && IS_SCRATCH(n2)) \ - LD(n2, xSP, 8); \ - ADDI(xSP, xSP, 16); \ - } \ +#define RESTORE_NF_REGISTERS \ + do { \ + LD(x1, xSP, 0 * 8); \ + LD(x2, xSP, 1 * 8); \ + LD(x3, xSP, 2 * 8); \ + LD(x4, xSP, 3 * 8); \ + LD(x5, xSP, 4 * 8); \ + LD(x6, xSP, 5 * 8); \ + ADDI(xSP, xSP, 6 * 8); \ } while(0); #define GO_TRACE(A, B, s0) \