From: Julian Seward <jseward@acm.org>
Date: Sun, 25 Mar 2007 04:14:58 +0000 (+0000)
Subject: x86 back end: use 80-bit loads/stores for floating point spills rather
X-Git-Tag: svn/VALGRIND_3_3_1^2~46
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=142fa90be87cd1efa89bd797de17715b2b410ae5;p=thirdparty%2Fvalgrind.git

x86 back end: use 80-bit loads/stores for floating point spills rather
than 64-bit ones, to reduce accuracy loss.  To support this, in
reg-alloc, allocate 2 64-bit spill slots for each HRcFlt64 vreg
instead of just 1.


git-svn-id: svn://svn.valgrind.org/vex/trunk@1744
---

diff --git a/VEX/priv/host-generic/h_generic_regs.h b/VEX/priv/host-generic/h_generic_regs.h
index 8c5a006c26..82fc9ad40e 100644
--- a/VEX/priv/host-generic/h_generic_regs.h
+++ b/VEX/priv/host-generic/h_generic_regs.h
@@ -87,10 +87,17 @@ typedef UInt HReg;
    available on any specific host.  For example on x86, the available
    classes are: Int32, Flt64, Vec128 only.
 
-   IMPORTANT NOTE: Vec128 is the only >= 128-bit-sized class, and
-   reg_alloc2.c handles it specially when assigning spill slots.  If
-   you add another 128-bit or larger regclass, you must remember to
-   update reg_alloc2.c accordingly.
+   IMPORTANT NOTE: reg_alloc2.c needs to know how much space is needed
+   to spill each class of register.  These sizes are hardwired into it:
+
+      HRcInt32     32 bits
+      HRcInt64     64 bits
+      HRcFlt64     80 bits (on x86 these are spilled by fstpt/fldt)
+      HRcVec64     64 bits
+      HRcVec128    128 bits
+
+   If you add another regclass, you must remember to update
+   reg_alloc2.c accordingly.
 */
 typedef
    enum { 
diff --git a/VEX/priv/host-generic/reg_alloc2.c b/VEX/priv/host-generic/reg_alloc2.c
index 9a6695e0a3..638570fe4e 100644
--- a/VEX/priv/host-generic/reg_alloc2.c
+++ b/VEX/priv/host-generic/reg_alloc2.c
@@ -778,8 +778,9 @@ HInstrArray* doRegisterAllocation (
 
    /* --------- Stage 3: allocate spill slots. --------- */
 
-   /* Each spill slot is 8 bytes long.  For 128-bit vregs
-      we have to allocate two spill slots.
+   /* Each spill slot is 8 bytes long.  For vregs which take more than
+      64 bits to spill (classes Flt64 and Vec128), we have to allocate
+      two spill slots.
 
       Do a rank-based allocation of vregs to spill slot numbers.  We
       put as few values as possible in spill slows, but nevertheless
@@ -799,43 +800,44 @@ HInstrArray* doRegisterAllocation (
          continue;
       }
 
-      /* The spill slots are 64 bits in size.  That means, to spill a
-         Vec128-class vreg, we'll need to find two adjacent spill
-         slots to use.  Note, this special-casing needs to happen for
-         all 128-bit sized register classes.  Currently though
-         HRcVector is the only such class. */
+      /* The spill slots are 64 bits in size.  As per the comment on
+         definition of HRegClass in h_generic_regs.h, that means, to
+         spill a vreg of class Flt64 or Vec128, we'll need to find two
+         adjacent spill slots to use.  Note, this logic needs to be kept
+         in sync with the size info on the definition of HRegClass. */
 
-      if (vreg_lrs[j].reg_class != HRcVec128) {
+      if (vreg_lrs[j].reg_class == HRcVec128
+          || vreg_lrs[j].reg_class == HRcFlt64) {
 
-         /* The ordinary case -- just find a single spill slot. */
+         /* Find two adjacent free slots which between them provide
+            up to 128 bits in which to spill the vreg. */
 
-         /* Find the lowest-numbered spill slot which is available at
-            the start point of this interval, and assign the interval
-            to it. */
-         for (k = 0; k < N_SPILL64S; k++)
-            if (ss_busy_until_before[k] <= vreg_lrs[j].live_after)
+         for (k = 0; k < N_SPILL64S-1; k++)
+            if (ss_busy_until_before[k] <= vreg_lrs[j].live_after
+                && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
                break;
-         if (k == N_SPILL64S) {
+         if (k == N_SPILL64S-1) {
             vpanic("LibVEX_N_SPILL_BYTES is too low.  " 
                    "Increase and recompile.");
          }
-         ss_busy_until_before[k] = vreg_lrs[j].dead_before;
+         ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
+         ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
 
       } else {
 
-	/* Find two adjacent free slots in which to spill a 128-bit
-           vreg. */
+         /* The ordinary case -- just find a single spill slot. */
 
-         for (k = 0; k < N_SPILL64S-1; k++)
-            if (ss_busy_until_before[k] <= vreg_lrs[j].live_after
-                && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
+         /* Find the lowest-numbered spill slot which is available at
+            the start point of this interval, and assign the interval
+            to it. */
+         for (k = 0; k < N_SPILL64S; k++)
+            if (ss_busy_until_before[k] <= vreg_lrs[j].live_after)
                break;
-         if (k == N_SPILL64S-1) {
+         if (k == N_SPILL64S) {
             vpanic("LibVEX_N_SPILL_BYTES is too low.  " 
                    "Increase and recompile.");
          }
-         ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
-         ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
+         ss_busy_until_before[k] = vreg_lrs[j].dead_before;
 
       }
 
diff --git a/VEX/priv/host-x86/hdefs.c b/VEX/priv/host-x86/hdefs.c
index 9f6157f1a9..127285cf64 100644
--- a/VEX/priv/host-x86/hdefs.c
+++ b/VEX/priv/host-x86/hdefs.c
@@ -737,7 +737,7 @@ X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
    i->Xin.FpLdSt.sz     = sz;
    i->Xin.FpLdSt.reg    = reg;
    i->Xin.FpLdSt.addr   = addr;
-   vassert(sz == 4 || sz == 8);
+   vassert(sz == 4 || sz == 8 || sz == 10);
    return i;
 }
 X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,  
@@ -1005,12 +1005,14 @@ void ppX86Instr ( X86Instr* i, Bool mode64 ) {
          break;
       case Xin_FpLdSt:
          if (i->Xin.FpLdSt.isLoad) {
-            vex_printf("gld%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+            vex_printf("gld%c " ,  i->Xin.FpLdSt.sz==10 ? 'T'
+                                   : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
             ppX86AMode(i->Xin.FpLdSt.addr);
             vex_printf(", ");
             ppHRegX86(i->Xin.FpLdSt.reg);
          } else {
-            vex_printf("gst%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+            vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
+                                  : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
             ppHRegX86(i->Xin.FpLdSt.reg);
             vex_printf(", ");
             ppX86AMode(i->Xin.FpLdSt.addr);
@@ -1558,7 +1560,7 @@ X86Instr* genSpill_X86 ( HReg rreg, Int offsetB, Bool mode64 )
       case HRcInt32:
          return X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
       case HRcFlt64:
-         return X86Instr_FpLdSt ( False/*store*/, 8, rreg, am );
+         return X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
       case HRcVec128:
          return X86Instr_SseLdSt ( False/*store*/, rreg, am );
       default: 
@@ -1578,7 +1580,7 @@ X86Instr* genReload_X86 ( HReg rreg, Int offsetB, Bool mode64 )
       case HRcInt32:
          return X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
       case HRcFlt64:
-         return X86Instr_FpLdSt ( True/*load*/, 8, rreg, am );
+         return X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
       case HRcVec128:
          return X86Instr_SseLdSt ( True/*load*/, rreg, am );
       default: 
@@ -2497,14 +2499,27 @@ Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i,
       goto done;
 
    case Xin_FpLdSt:
-      vassert(i->Xin.FpLdSt.sz == 4 || i->Xin.FpLdSt.sz == 8);
       if (i->Xin.FpLdSt.isLoad) {
          /* Load from memory into %fakeN.  
-            --> ffree %st(7) ; fld{s/l} amode ; fstp st(N+1) 
+            --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1) 
          */
          p = do_ffree_st7(p);
-         *p++ = toUChar(i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD);
-	 p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+         switch (i->Xin.FpLdSt.sz) {
+            case 4:
+               *p++ = 0xD9;
+               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+               break;
+            case 8:
+               *p++ = 0xDD;
+               p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+               break;
+            case 10:
+               *p++ = 0xDB;
+               p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
+               break;
+            default:
+               vpanic("emitX86Instr(FpLdSt,load)");
+         }
          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
          goto done;
       } else {
@@ -2513,8 +2528,22 @@ Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i,
 	 */
          p = do_ffree_st7(p);
          p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
-         *p++ = toUChar(i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD);
-         p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+         switch (i->Xin.FpLdSt.sz) {
+            case 4:
+               *p++ = 0xD9;
+               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+               break;
+            case 8:
+               *p++ = 0xDD;
+               p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+               break;
+            case 10:
+               *p++ = 0xDB;
+               p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
+               break;
+            default:
+               vpanic("emitX86Instr(FpLdSt,store)");
+         }
          goto done;
       }
       break;