From: Julian Seward Date: Sun, 25 Mar 2007 04:14:58 +0000 (+0000) Subject: x86 back end: use 80-bit loads/stores for floating point spills rather X-Git-Tag: svn/VALGRIND_3_3_1^2~46 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=142fa90be87cd1efa89bd797de17715b2b410ae5;p=thirdparty%2Fvalgrind.git x86 back end: use 80-bit loads/stores for floating point spills rather than 64-bit ones, to reduce accuracy loss. To support this, in reg-alloc, allocate 2 64-bit spill slots for each HRcFlt64 vreg instead of just 1. git-svn-id: svn://svn.valgrind.org/vex/trunk@1744 --- diff --git a/VEX/priv/host-generic/h_generic_regs.h b/VEX/priv/host-generic/h_generic_regs.h index 8c5a006c26..82fc9ad40e 100644 --- a/VEX/priv/host-generic/h_generic_regs.h +++ b/VEX/priv/host-generic/h_generic_regs.h @@ -87,10 +87,17 @@ typedef UInt HReg; available on any specific host. For example on x86, the available classes are: Int32, Flt64, Vec128 only. - IMPORTANT NOTE: Vec128 is the only >= 128-bit-sized class, and - reg_alloc2.c handles it specially when assigning spill slots. If - you add another 128-bit or larger regclass, you must remember to - update reg_alloc2.c accordingly. + IMPORTANT NOTE: reg_alloc2.c needs to know how much space is needed to spill + each class of register. It has the following knowledge hardwired in: + + HRcInt32 32 bits + HRcInt64 64 bits + HRcFlt64 80 bits (on x86 these are spilled by fstpt/fldt) + HRcVec64 64 bits + HRcVec128 128 bits + + If you add another regclass, you must remember to update + reg_alloc2.c accordingly. */ typedef enum { diff --git a/VEX/priv/host-generic/reg_alloc2.c b/VEX/priv/host-generic/reg_alloc2.c index 9a6695e0a3..638570fe4e 100644 --- a/VEX/priv/host-generic/reg_alloc2.c +++ b/VEX/priv/host-generic/reg_alloc2.c @@ -778,8 +778,9 @@ HInstrArray* doRegisterAllocation ( /* --------- Stage 3: allocate spill slots. --------- */ - /* Each spill slot is 8 bytes long. For 128-bit vregs - we have to allocate two spill slots. 
+ /* Each spill slot is 8 bytes long. For vregs which take more than + 64 bits to spill (classes Flt64 and Vec128), we have to allocate + two spill slots. Do a rank-based allocation of vregs to spill slot numbers. We put as few values as possible in spill slows, but nevertheless @@ -799,43 +800,44 @@ HInstrArray* doRegisterAllocation ( continue; } - /* The spill slots are 64 bits in size. That means, to spill a - Vec128-class vreg, we'll need to find two adjacent spill - slots to use. Note, this special-casing needs to happen for - all 128-bit sized register classes. Currently though - HRcVector is the only such class. */ + /* The spill slots are 64 bits in size. As per the comment on + definition of HRegClass in h_generic_regs.h, that means, to + spill a vreg of class Flt64 or Vec128, we'll need to find two + adjacent spill slots to use. Note, this logic needs to be kept + in sync with the size info on the definition of HRegClass. */ - if (vreg_lrs[j].reg_class != HRcVec128) { + if (vreg_lrs[j].reg_class == HRcVec128 + || vreg_lrs[j].reg_class == HRcFlt64) { - /* The ordinary case -- just find a single spill slot. */ + /* Find two adjacent free slots which between them provide + up to 128 bits in which to spill the vreg. */ - /* Find the lowest-numbered spill slot which is available at - the start point of this interval, and assign the interval - to it. */ - for (k = 0; k < N_SPILL64S; k++) - if (ss_busy_until_before[k] <= vreg_lrs[j].live_after) + for (k = 0; k < N_SPILL64S-1; k++) + if (ss_busy_until_before[k] <= vreg_lrs[j].live_after + && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after) break; - if (k == N_SPILL64S) { + if (k == N_SPILL64S-1) { vpanic("LibVEX_N_SPILL_BYTES is too low. " "Increase and recompile."); } - ss_busy_until_before[k] = vreg_lrs[j].dead_before; + ss_busy_until_before[k+0] = vreg_lrs[j].dead_before; + ss_busy_until_before[k+1] = vreg_lrs[j].dead_before; } else { - /* Find two adjacent free slots in which to spill a 128-bit - vreg. 
*/ + /* The ordinary case -- just find a single spill slot. */ - for (k = 0; k < N_SPILL64S-1; k++) - if (ss_busy_until_before[k] <= vreg_lrs[j].live_after - && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after) + /* Find the lowest-numbered spill slot which is available at + the start point of this interval, and assign the interval + to it. */ + for (k = 0; k < N_SPILL64S; k++) + if (ss_busy_until_before[k] <= vreg_lrs[j].live_after) break; - if (k == N_SPILL64S-1) { + if (k == N_SPILL64S) { vpanic("LibVEX_N_SPILL_BYTES is too low. " "Increase and recompile."); } - ss_busy_until_before[k+0] = vreg_lrs[j].dead_before; - ss_busy_until_before[k+1] = vreg_lrs[j].dead_before; + ss_busy_until_before[k] = vreg_lrs[j].dead_before; } diff --git a/VEX/priv/host-x86/hdefs.c b/VEX/priv/host-x86/hdefs.c index 9f6157f1a9..127285cf64 100644 --- a/VEX/priv/host-x86/hdefs.c +++ b/VEX/priv/host-x86/hdefs.c @@ -737,7 +737,7 @@ X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) { i->Xin.FpLdSt.sz = sz; i->Xin.FpLdSt.reg = reg; i->Xin.FpLdSt.addr = addr; - vassert(sz == 4 || sz == 8); + vassert(sz == 4 || sz == 8 || sz == 10); return i; } X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz, @@ -1005,12 +1005,14 @@ void ppX86Instr ( X86Instr* i, Bool mode64 ) { break; case Xin_FpLdSt: if (i->Xin.FpLdSt.isLoad) { - vex_printf("gld%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F'); + vex_printf("gld%c " , i->Xin.FpLdSt.sz==10 ? 'T' + : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F')); ppX86AMode(i->Xin.FpLdSt.addr); vex_printf(", "); ppHRegX86(i->Xin.FpLdSt.reg); } else { - vex_printf("gst%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F'); + vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T' + : (i->Xin.FpLdSt.sz==8 ? 
'D' : 'F')); ppHRegX86(i->Xin.FpLdSt.reg); vex_printf(", "); ppX86AMode(i->Xin.FpLdSt.addr); @@ -1558,7 +1560,7 @@ X86Instr* genSpill_X86 ( HReg rreg, Int offsetB, Bool mode64 ) case HRcInt32: return X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am ); case HRcFlt64: - return X86Instr_FpLdSt ( False/*store*/, 8, rreg, am ); + return X86Instr_FpLdSt ( False/*store*/, 10, rreg, am ); case HRcVec128: return X86Instr_SseLdSt ( False/*store*/, rreg, am ); default: @@ -1578,7 +1580,7 @@ X86Instr* genReload_X86 ( HReg rreg, Int offsetB, Bool mode64 ) case HRcInt32: return X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg ); case HRcFlt64: - return X86Instr_FpLdSt ( True/*load*/, 8, rreg, am ); + return X86Instr_FpLdSt ( True/*load*/, 10, rreg, am ); case HRcVec128: return X86Instr_SseLdSt ( True/*load*/, rreg, am ); default: @@ -2497,14 +2499,27 @@ Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i, goto done; case Xin_FpLdSt: - vassert(i->Xin.FpLdSt.sz == 4 || i->Xin.FpLdSt.sz == 8); if (i->Xin.FpLdSt.isLoad) { /* Load from memory into %fakeN. - --> ffree %st(7) ; fld{s/l} amode ; fstp st(N+1) + --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1) */ p = do_ffree_st7(p); - *p++ = toUChar(i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD); - p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr); + switch (i->Xin.FpLdSt.sz) { + case 4: + *p++ = 0xD9; + p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr); + break; + case 8: + *p++ = 0xDD; + p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr); + break; + case 10: + *p++ = 0xDB; + p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr); + break; + default: + vpanic("emitX86Instr(FpLdSt,load)"); + } p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg)); goto done; } else { @@ -2513,8 +2528,22 @@ Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i, */ p = do_ffree_st7(p); p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg)); - *p++ = toUChar(i->Xin.FpLdSt.sz==4 ? 
0xD9 : 0xDD); - p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr); + switch (i->Xin.FpLdSt.sz) { + case 4: + *p++ = 0xD9; + p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr); + break; + case 8: + *p++ = 0xDD; + p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr); + break; + case 10: + *p++ = 0xDB; + p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr); + break; + default: + vpanic("emitX86Instr(FpLdSt,store)"); + } goto done; } break;