From: Julian Seward
Date: Thu, 3 Apr 2014 13:48:54 +0000 (+0000)
Subject: Implement TBL and TBX instructions.
X-Git-Tag: svn/VALGRIND_3_10_1^2~125
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6c3e57089f1b2585c7db9529d2dfd507d09f08a9;p=thirdparty%2Fvalgrind.git

Implement TBL and TBX instructions.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2845
---
diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index 6dde926b71..b8a2ecf9af 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -4987,6 +4987,96 @@ static IRTemp math_MINMAXV ( IRTemp src, IROp op )
 }
 
 
+/* Generate IR for TBL and TBX.  This deals with the 128 bit case
+   only. */
+static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
+                             IRTemp oor_values )
+{
+   vassert(len >= 0 && len <= 3);
+
+   /* Generate some useful constants as concisely as possible. */
+   IRTemp half15 = newTemp(Ity_I64);
+   assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
+   IRTemp half16 = newTemp(Ity_I64);
+   assign(half16, mkU64(0x1010101010101010ULL));
+
+   /* A zero vector */
+   IRTemp allZero = newTemp(Ity_V128);
+   assign(allZero, mkV128(0x0000));
+   /* A vector containing 15 in each 8-bit lane */
+   IRTemp all15 = newTemp(Ity_V128);
+   assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
+   /* A vector containing 16 in each 8-bit lane */
+   IRTemp all16 = newTemp(Ity_V128);
+   assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
+   /* A vector containing 32 in each 8-bit lane */
+   IRTemp all32 = newTemp(Ity_V128);
+   assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
+   /* A vector containing 48 in each 8-bit lane */
+   IRTemp all48 = newTemp(Ity_V128);
+   assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
+   /* A vector containing 64 in each 8-bit lane */
+   IRTemp all64 = newTemp(Ity_V128);
+   assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));
+
+   /* Group the 16/32/48/64 vectors so as to be indexable. */
+   IRTemp allXX[4] = { all16, all32, all48, all64 };
+
+   /* Compute the result for each table vector, with zeroes in places
+      where the index values are out of range, and OR them into the
+      running vector. */
+   IRTemp running_result = newTemp(Ity_V128);
+   assign(running_result, mkV128(0));
+
+   UInt tabent;
+   for (tabent = 0; tabent <= len; tabent++) {
+      vassert(tabent >= 0 && tabent < 4);
+      IRTemp bias = newTemp(Ity_V128);
+      assign(bias,
+             mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
+      IRTemp biased_indices = newTemp(Ity_V128);
+      assign(biased_indices,
+             binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
+      IRTemp valid_mask = newTemp(Ity_V128);
+      assign(valid_mask,
+             binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
+      IRTemp safe_biased_indices = newTemp(Ity_V128);
+      assign(safe_biased_indices,
+             binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
+      IRTemp results_or_junk = newTemp(Ity_V128);
+      assign(results_or_junk,
+             binop(Iop_Perm8x16, mkexpr(tab[tabent]),
+                                 mkexpr(safe_biased_indices)));
+      IRTemp results_or_zero = newTemp(Ity_V128);
+      assign(results_or_zero,
+             binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
+      /* And OR that into the running result. */
+      IRTemp tmp = newTemp(Ity_V128);
+      assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
+                        mkexpr(running_result)));
+      running_result = tmp;
+   }
+
+   /* So now running_result holds the overall result where the indices
+      are in range, and zero in out-of-range lanes.  Now we need to
+      compute an overall validity mask and use this to copy in the
+      lanes in the oor_values for out of range indices.  This is
+      unnecessary for TBL but will get folded out by iropt, so we lean
+      on that and generate the same code for TBL and TBX here. */
+   IRTemp overall_valid_mask = newTemp(Ity_V128);
+   assign(overall_valid_mask,
+          binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
+   IRTemp result = newTemp(Ity_V128);
+   assign(result,
+          binop(Iop_OrV128,
+                mkexpr(running_result),
+                binop(Iop_AndV128,
+                      mkexpr(oor_values),
+                      unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
+   return result;
+}
+
+
 static
 Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
 {
@@ -6734,6 +6824,43 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
       /* else fall through */
    }
 
+   /* -------------------- TBL, TBX -------------------- */
+   /* 31  28        20 15 14  12  9 4
+      0q0 01110 000 m  0  len 000 n d  TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
+      0q0 01110 000 m  0  len 100 n d  TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
+      where Ta = 16b(q=1) or 8b(q=0)
+   */
+   if (INSN(31,31) == 0 && INSN(29,21) == BITS9(0,0,1,1,1,0,0,0,0)
+       && INSN(15,15) == 0 && INSN(11,10) == BITS2(0,0)) {
+      Bool isQ   = INSN(30,30) == 1;
+      Bool isTBX = INSN(12,12) == 1;
+      UInt mm  = INSN(20,16);
+      UInt len = INSN(14,13);
+      UInt nn  = INSN(9,5);
+      UInt dd  = INSN(4,0);
+      /* The out-of-range values to use. */
+      IRTemp oor_values = newTemp(Ity_V128);
+      assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
+      /* src value */
+      IRTemp src = newTemp(Ity_V128);
+      assign(src, getQReg128(mm));
+      /* The table values */
+      IRTemp tab[4];
+      UInt   i;
+      for (i = 0; i <= len; i++) {
+         vassert(i < 4);
+         tab[i] = newTemp(Ity_V128);
+         assign(tab[i], getQReg128((nn + i) % 32));
+      }
+      IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
+      putQReg128(dd, isQ ? mkexpr(res)
+                         : unop(Iop_ZeroHI64ofV128, mkexpr(res)) );
+      const HChar* Ta = isQ ? "16b" : "8b";
+      const HChar* nm = isTBX ? "tbx" : "tbl";
+      DIP("%s %s.%s, {v%d.16b .. v%d.16b}, %s.%s\n",
+          nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
+      return True;
+   }
 
    /* FIXME Temporary hacks to get through ld.so FIXME */
 
    /* ------------------ movi vD.4s, #0x0 ------------------ */
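Not part of the patch: the following is a minimal plain-C sketch of the lane
selection that math_TBL_TBX above expresses in IR, for the 128-bit case it
handles.  The function and parameter names are illustrative only; 'len' is
the 0..3 register-count field from the decoder and 'oor' supplies the
out-of-range lanes (the old Vd for TBX, zeroes for TBL).

   #include <stdint.h>

   /* Reference model of the 128-bit TBL/TBX lane selection. */
   static void ref_tbl_tbx ( uint8_t dst[16], const uint8_t tab[4][16],
                             unsigned len, const uint8_t src[16],
                             const uint8_t oor[16] )
   {
      unsigned nTableBytes = 16 * (len + 1);   /* valid index range */
      for (unsigned lane = 0; lane < 16; lane++) {
         unsigned ix = src[lane];
         /* The IR gets the same effect per table register, without
            branches: subtract a bias of 0/16/32/48 (Sub8x16),
            range-check against 16 (CmpGT8Ux16), clamp to 0..15
            (AndV128 with all15), permute (Perm8x16), and OR the
            per-register results together.  The final CmpGT8Ux16
            against 16*(len+1) selects between that result and the
            oor_values lanes. */
         dst[lane] = (ix < nTableBytes) ? tab[ix / 16][ix % 16] : oor[lane];
      }
   }
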
diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
index dd37053a83..0400897d79 100644
--- a/VEX/priv/host_arm64_defs.c
+++ b/VEX/priv/host_arm64_defs.c
@@ -853,9 +853,11 @@ static void showARM64VecBinOp(/*OUT*/const HChar** nm,
       case ARM64vecb_ADD64x2:    *nm = "add ";   *ar = "2d";   return;
       case ARM64vecb_ADD32x4:    *nm = "add ";   *ar = "4s";   return;
       case ARM64vecb_ADD16x8:    *nm = "add ";   *ar = "8h";   return;
+      case ARM64vecb_ADD8x16:    *nm = "add ";   *ar = "16b";  return;
       case ARM64vecb_SUB64x2:    *nm = "sub ";   *ar = "2d";   return;
       case ARM64vecb_SUB32x4:    *nm = "sub ";   *ar = "4s";   return;
       case ARM64vecb_SUB16x8:    *nm = "sub ";   *ar = "8h";   return;
+      case ARM64vecb_SUB8x16:    *nm = "sub ";   *ar = "16b";  return;
       case ARM64vecb_MUL32x4:    *nm = "mul ";   *ar = "4s";   return;
       case ARM64vecb_MUL16x8:    *nm = "mul ";   *ar = "8h";   return;
       case ARM64vecb_FADD64x2:   *nm = "fadd";   *ar = "2d";   return;
@@ -891,6 +893,8 @@ static void showARM64VecBinOp(/*OUT*/const HChar** nm,
       case ARM64vecb_FCMGE32x4:  *nm = "fcmge";  *ar = "4s";   return;
       case ARM64vecb_FCMGT64x2:  *nm = "fcmgt";  *ar = "2d";   return;
       case ARM64vecb_FCMGT32x4:  *nm = "fcmgt";  *ar = "4s";   return;
+      case ARM64vecb_TBL1:       *nm = "tbl ";   *ar = "16b";  return;
+      case ARM64vecb_CMHI8x16:   *nm = "cmhi";   *ar = "16b";  return;
       default: vpanic("showARM64VecBinOp");
    }
 }
@@ -3337,6 +3341,7 @@ static inline UChar qregNo ( HReg r )
 #define X001000  BITS8(0,0, 0,0,1,0,0,0)
 #define X001001  BITS8(0,0, 0,0,1,0,0,1)
 #define X001010  BITS8(0,0, 0,0,1,0,1,0)
+#define X001101  BITS8(0,0, 0,0,1,1,0,1)
 #define X001111  BITS8(0,0, 0,0,1,1,1,1)
 #define X010000  BITS8(0,0, 0,1,0,0,0,0)
 #define X010001  BITS8(0,0, 0,1,0,0,0,1)
@@ -4916,10 +4921,12 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
             010 01110 11 1 m 100001 n d   ADD Vd.2d,  Vn.2d,  Vm.2d
             010 01110 10 1 m 100001 n d   ADD Vd.4s,  Vn.4s,  Vm.4s
             010 01110 01 1 m 100001 n d   ADD Vd.8h,  Vn.8h,  Vm.8h
+            010 01110 00 1 m 100001 n d   ADD Vd.16b, Vn.16b, Vm.16b
 
             011 01110 11 1 m 100001 n d   SUB Vd.2d,  Vn.2d,  Vm.2d
             011 01110 10 1 m 100001 n d   SUB Vd.4s,  Vn.4s,  Vm.4s
             011 01110 01 1 m 100001 n d   SUB Vd.8h,  Vn.8h,  Vm.8h
+            011 01110 00 1 m 100001 n d   SUB Vd.16b, Vn.16b, Vm.16b
 
             010 01110 10 1 m 100111 n d   MUL Vd.4s,  Vn.4s,  Vm.4s
             010 01110 01 1 m 100111 n d   MUL Vd.8h,  Vn.8h,  Vm.8h
@@ -4970,6 +4977,10 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
 
             011 01110 11 1 m 111001 n d   FCMGT Vd.2d, Vn.2d, Vm.2d
             011 01110 10 1 m 111001 n d   FCMGT Vd.4s, Vn.4s, Vm.4s
+
+            010 01110 00 0 m 000000 n d   TBL Vd.16b, {Vn.16b}, Vm.16b
+
+            011 01110 00 1 m 001101 n d   CMHI Vd.16b, Vn.16b, Vm.16b
          */
          UInt vD = qregNo(i->ARM64in.VBinV.dst);
          UInt vN = qregNo(i->ARM64in.VBinV.argL);
@@ -4984,6 +4995,9 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
          case ARM64vecb_ADD16x8:
             *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X100001, vN, vD);
             break;
+         case ARM64vecb_ADD8x16:
+            *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X100001, vN, vD);
+            break;
          case ARM64vecb_SUB64x2:
             *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X100001, vN, vD);
             break;
@@ -4993,6 +5007,9 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
          case ARM64vecb_SUB16x8:
            *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X100001, vN, vD);
            break;
+         case ARM64vecb_SUB8x16:
+            *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X100001, vN, vD);
+            break;
          case ARM64vecb_MUL32x4:
            *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X100111, vN, vD);
            break;
@@ -5107,6 +5124,14 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
          case ARM64vecb_FCMGT32x4:
            *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X111001, vN, vD);
            break;
+
+         case ARM64vecb_TBL1:
+            *p++ = X_3_8_5_6_5_5(X010, X01110000, vM, X000000, vN, vD);
+            break;
+
+         case ARM64vecb_CMHI8x16:
+            *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X001101, vN, vD);
+            break;
          default:
            goto bad;
       }
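Not part of the patch: a small standalone sketch of how the two new VBinV
cases pack the bit fields listed in the encoding comment above.  The
pack_3_8_5_6_5_5 helper below is a stand-in that mirrors the
3+8+5+6+5+5 = 32-bit field layout implied by the X_3_8_5_6_5_5 emitter
helper; the register numbers are arbitrary examples.

   #include <stdint.h>
   #include <stdio.h>

   /* Pack six fields of widths 3, 8, 5, 6, 5, 5 bits, MSB first. */
   static uint32_t pack_3_8_5_6_5_5 ( uint32_t f1, uint32_t f2, uint32_t f3,
                                      uint32_t f4, uint32_t f5, uint32_t f6 )
   {
      return (f1 << 29) | (f2 << 21) | (f3 << 16)
             | (f4 << 10) | (f5 << 5) | f6;
   }

   int main ( void )
   {
      uint32_t vM = 2, vN = 1, vD = 0;
      /* 010 01110 00 0 m 000000 n d   TBL  Vd.16b, {Vn.16b}, Vm.16b */
      uint32_t tbl  = pack_3_8_5_6_5_5(0x2 /*010*/, 0x70 /*01110000*/,
                                       vM, 0x00 /*000000*/, vN, vD);
      /* 011 01110 00 1 m 001101 n d   CMHI Vd.16b, Vn.16b, Vm.16b */
      uint32_t cmhi = pack_3_8_5_6_5_5(0x3 /*011*/, 0x71 /*01110001*/,
                                       vM, 0x0D /*001101*/, vN, vD);
      printf("tbl  v%u.16b, {v%u.16b}, v%u.16b -> 0x%08x\n", vD, vN, vM, tbl);
      printf("cmhi v%u.16b, v%u.16b, v%u.16b   -> 0x%08x\n", vD, vN, vM, cmhi);
      return 0;
   }
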
diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
index bee6d2ce10..3d27ecda0c 100644
--- a/VEX/priv/host_arm64_defs.h
+++ b/VEX/priv/host_arm64_defs.h
@@ -310,9 +310,11 @@ typedef
       ARM64vecb_ADD64x2=120,
       ARM64vecb_ADD32x4,
       ARM64vecb_ADD16x8,
+      ARM64vecb_ADD8x16,
       ARM64vecb_SUB64x2,
       ARM64vecb_SUB32x4,
       ARM64vecb_SUB16x8,
+      ARM64vecb_SUB8x16,
       ARM64vecb_MUL32x4,
       ARM64vecb_MUL16x8,
       ARM64vecb_FADD64x2,
@@ -348,6 +350,8 @@ typedef
       ARM64vecb_FCMGE32x4,
       ARM64vecb_FCMGT64x2,
       ARM64vecb_FCMGT32x4,
+      ARM64vecb_TBL1,
+      ARM64vecb_CMHI8x16,
       ARM64vecb_INVALID
    }
    ARM64VecBinOp;
diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c
index 712d161ccb..15476569b5 100644
--- a/VEX/priv/host_arm64_isel.c
+++ b/VEX/priv/host_arm64_isel.c
@@ -4918,9 +4918,11 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e )
          case Iop_Add64x2:
          case Iop_Add32x4:
          case Iop_Add16x8:
+         case Iop_Add8x16:
          case Iop_Sub64x2:
          case Iop_Sub32x4:
          case Iop_Sub16x8:
+         case Iop_Sub8x16:
          case Iop_Mul32x4:
          case Iop_Mul16x8:
          case Iop_CmpEQ64x2:
@@ -4930,6 +4932,8 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e )
          case Iop_CmpLE32Fx4:
          case Iop_CmpLT64Fx2:
          case Iop_CmpLT32Fx4:
+         case Iop_Perm8x16:
+         case Iop_CmpGT8Ux16:
          {
            HReg res  = newVRegV(env);
            HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
@@ -4955,9 +4959,11 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e )
               case Iop_Add64x2:     op = ARM64vecb_ADD64x2;   break;
               case Iop_Add32x4:     op = ARM64vecb_ADD32x4;   break;
               case Iop_Add16x8:     op = ARM64vecb_ADD16x8;   break;
+              case Iop_Add8x16:     op = ARM64vecb_ADD8x16;   break;
               case Iop_Sub64x2:     op = ARM64vecb_SUB64x2;   break;
               case Iop_Sub32x4:     op = ARM64vecb_SUB32x4;   break;
               case Iop_Sub16x8:     op = ARM64vecb_SUB16x8;   break;
+              case Iop_Sub8x16:     op = ARM64vecb_SUB8x16;   break;
               case Iop_Mul32x4:     op = ARM64vecb_MUL32x4;   break;
               case Iop_Mul16x8:     op = ARM64vecb_MUL16x8;   break;
               case Iop_CmpEQ64x2:   op = ARM64vecb_CMEQ64x2;  break;
@@ -4967,6 +4973,8 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e )
               case Iop_CmpLE32Fx4:  op = ARM64vecb_FCMGE32x4; sw = True; break;
               case Iop_CmpLT64Fx2:  op = ARM64vecb_FCMGT64x2; sw = True; break;
               case Iop_CmpLT32Fx4:  op = ARM64vecb_FCMGT32x4; sw = True; break;
+              case Iop_Perm8x16:    op = ARM64vecb_TBL1;      break;
+              case Iop_CmpGT8Ux16:  op = ARM64vecb_CMHI8x16;  break;
               default: vassert(0);
            }
            if (sw) {