From df320937705284e02e8babc19c95fa6b2ace48c7 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Thu, 14 Aug 2014 22:26:52 +0000 Subject: [PATCH] arm64: implement: {uqshl, sqshl, sqshlu} (vector, imm). git-svn-id: svn://svn.valgrind.org/vex/trunk@2922 --- VEX/priv/guest_arm64_toIR.c | 143 ++++++++++++++++++++++++++++++++++++ VEX/priv/host_arm64_defs.c | 114 +++++++++++++++++++--------- VEX/priv/host_arm64_defs.h | 14 +++- VEX/priv/host_arm64_isel.c | 137 +++++++++++++++++++++------------- 4 files changed, 321 insertions(+), 87 deletions(-) diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index e2fc6739f1..28176a0053 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -921,6 +921,28 @@ static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) { return ops[sizeNarrow]; } +static IROp mkVecQSHLNSATU2U ( UInt size ) { + const IROp ops[4] + = { Iop_QShlN8x16, Iop_QShlN16x8, Iop_QShlN32x4, Iop_QShlN64x2 }; + vassert(size < 4); + return ops[size]; +} + +static IROp mkVecQSHLNSATS2S ( UInt size ) { + const IROp ops[4] + = { Iop_QSalN8x16, Iop_QSalN16x8, Iop_QSalN32x4, Iop_QSalN64x2 }; + vassert(size < 4); + return ops[size]; +} + +static IROp mkVecQSHLNSATS2U ( UInt size ) { + const IROp ops[4] + = { Iop_QShlN8Sx16, Iop_QShlN16Sx8, Iop_QShlN32Sx4, Iop_QShlN64Sx2 }; + vassert(size < 4); + return ops[size]; +} + + /* Generate IR to create 'arg rotated right by imm', for sane values of 'ty' and 'imm'. */ static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm ) @@ -6569,6 +6591,91 @@ void math_SQDMULH ( /*OUT*/IRTemp* res, } +/* Generate IR for SQSHL, UQSHL, SQSHLU by imm. Put the result in + a new temp in *res, and the Q difference pair in new temps in + *qDiff1 and *qDiff2 respectively. |nm| denotes which of the + three operations it is. */ +static +void math_QSHL_IMM ( /*OUT*/IRTemp* res, + /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2, + IRTemp src, UInt size, UInt shift, const HChar* nm ) +{ + vassert(size <= 3); + UInt laneBits = 8 << size; + vassert(shift < laneBits); + newTempsV128_3(res, qDiff1, qDiff2); + IRTemp z128 = newTempV128(); + assign(z128, mkV128(0x0000)); + + /* UQSHL */ + if (vex_streq(nm, "uqshl")) { + IROp qop = mkVecQSHLNSATU2U(size); + assign(*res, binop(qop, mkexpr(src), mkU8(shift))); + if (shift == 0) { + /* No shift means no saturation. */ + assign(*qDiff1, mkexpr(z128)); + assign(*qDiff2, mkexpr(z128)); + } else { + /* Saturation has occurred if any of the shifted-out bits are + nonzero. We get the shifted-out bits by right-shifting the + original value. */ + UInt rshift = laneBits - shift; + vassert(rshift >= 1 && rshift < laneBits); + assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift))); + assign(*qDiff2, mkexpr(z128)); + } + return; + } + + /* SQSHL */ + if (vex_streq(nm, "sqshl")) { + IROp qop = mkVecQSHLNSATS2S(size); + assign(*res, binop(qop, mkexpr(src), mkU8(shift))); + if (shift == 0) { + /* No shift means no saturation. */ + assign(*qDiff1, mkexpr(z128)); + assign(*qDiff2, mkexpr(z128)); + } else { + /* Saturation has occurred if any of the shifted-out bits are + different from the top bit of the original value. */ + UInt rshift = laneBits - 1 - shift; + vassert(rshift >= 0 && rshift < laneBits-1); + /* qDiff1 is the shifted out bits, and the top bit of the original + value, preceded by zeroes. */ + assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift))); + /* qDiff2 is the top bit of the original value, cloned the + correct number of times. 
*/ + assign(*qDiff2, binop(mkVecSHRN(size), + binop(mkVecSARN(size), mkexpr(src), + mkU8(laneBits-1)), + mkU8(rshift))); + /* This also succeeds in comparing the top bit of the original + value to itself, which is a bit stupid, but not wrong. */ + } + return; + } + + /* SQSHLU */ + if (vex_streq(nm, "sqshlu")) { + IROp qop = mkVecQSHLNSATS2U(size); + assign(*res, binop(qop, mkexpr(src), mkU8(shift))); + /* This is different from the other two cases, in that + saturation can occur even if there is no shift. */ + /* Saturation has occurred if any of the shifted-out bits, or + the top bit of the original value, are nonzero. */ + UInt rshift = laneBits - 1 - shift; + vassert(rshift >= 0 && rshift < laneBits); + /* qDiff1 is the shifted out bits, and the top bit of the original + value, preceded by zeroes. */ + assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift))); + assign(*qDiff2, mkexpr(z128)); + return; + } + + vassert(0); +} + + /* QCFLAG tracks the SIMD sticky saturation status. Update the status thusly: if, after application of |opZHI| to both |qres| and |nres|, they have the same value, leave QCFLAG unchanged. Otherwise, set it @@ -8230,6 +8337,42 @@ Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (opcode == BITS5(0,1,1,1,0) + || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) { + /* -------- 0,01110 SQSHL std7_std7_#imm -------- */ + /* -------- 1,01110 UQSHL std7_std7_#imm -------- */ + /* -------- 1,01100 SQSHLU std7_std7_#imm -------- */ + UInt size = 0; + UInt shift = 0; + Bool isQ = bitQ == 1; + Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb); + if (!ok || (bitQ == 0 && size == X11)) return False; + vassert(size >= 0 && size <= 3); + /* The shift encoding has opposite sign for the leftwards case. + Adjust shift to compensate. */ + UInt lanebits = 8 << size; + shift = lanebits - shift; + vassert(shift >= 0 && shift < lanebits); + const HChar* nm = NULL; + /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl"; + else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl"; + else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu"; + else vassert(0); + IRTemp qDiff1 = IRTemp_INVALID; + IRTemp qDiff2 = IRTemp_INVALID; + IRTemp res = IRTemp_INVALID; + IRTemp src = newTempV128(); + assign(src, getQReg128(nn)); + math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm); + putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res)); + updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2, + isQ ? 
Iop_INVALID : Iop_ZeroHI64ofV128); + const HChar* arr = nameArr_Q_SZ(bitQ, size); + DIP("%s %s.%s, %s.%s, #%u\n", nm, + nameQReg128(dd), arr, nameQReg128(nn), arr, shift); + return True; + } + if (bitU == 0 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) { /* -------- 0,10000 SHRN{,2} #imm -------- */ diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index a94a82467c..71135b9a3e 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -1037,6 +1037,18 @@ static void showARM64VecShiftOp(/*OUT*/const HChar** nm, case ARM64vecsh_SQRSHRUN2SD: *nm = "sqrshrun"; *ar = "2sd"; return; case ARM64vecsh_SQRSHRUN4HS: *nm = "sqrshrun"; *ar = "4hs"; return; case ARM64vecsh_SQRSHRUN8BH: *nm = "sqrshrun"; *ar = "8bh"; return; + case ARM64vecsh_UQSHL64x2: *nm = "uqshl "; *ar = "2d"; return; + case ARM64vecsh_UQSHL32x4: *nm = "uqshl "; *ar = "4s"; return; + case ARM64vecsh_UQSHL16x8: *nm = "uqshl "; *ar = "8h"; return; + case ARM64vecsh_UQSHL8x16: *nm = "uqshl "; *ar = "16b"; return; + case ARM64vecsh_SQSHL64x2: *nm = "sqshl "; *ar = "2d"; return; + case ARM64vecsh_SQSHL32x4: *nm = "sqshl "; *ar = "4s"; return; + case ARM64vecsh_SQSHL16x8: *nm = "sqshl "; *ar = "8h"; return; + case ARM64vecsh_SQSHL8x16: *nm = "sqshl "; *ar = "16b"; return; + case ARM64vecsh_SQSHLU64x2: *nm = "sqshlu"; *ar = "2d"; return; + case ARM64vecsh_SQSHLU32x4: *nm = "sqshlu"; *ar = "4s"; return; + case ARM64vecsh_SQSHLU16x8: *nm = "sqshlu"; *ar = "8h"; return; + case ARM64vecsh_SQSHLU8x16: *nm = "sqshlu"; *ar = "16b"; return; default: vpanic("showARM64VecShiftOp"); } } @@ -1763,48 +1775,53 @@ ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, i->ARM64in.VShiftImmV.dst = dst; i->ARM64in.VShiftImmV.src = src; i->ARM64in.VShiftImmV.amt = amt; + UInt minSh = 0; UInt maxSh = 0; switch (op) { - /* NB: the comments below are wrong. Correct is: for right shifts, - the allowed shift amounts are 1 .. lane_size. For left shifts, - the allowed shift amoutns are 0 .. lane_size-1. */ - /* For these ordinary, non-saturating non-magical shifts, - the min shift value is actually zero, but we reject such cases - and instead only accept 1 as the minimum shift value. */ + /* For right shifts, the allowed shift amounts are 1 .. lane_size. + For left shifts, the allowed shift amounts are 0 .. lane_size-1. + */ case ARM64vecsh_USHR64x2: case ARM64vecsh_SSHR64x2: - case ARM64vecsh_SHL64x2: - maxSh = 63; break; - case ARM64vecsh_USHR32x4: case ARM64vecsh_SSHR32x4: - case ARM64vecsh_SHL32x4: - maxSh = 31; break; - case ARM64vecsh_USHR16x8: case ARM64vecsh_SSHR16x8: - case ARM64vecsh_SHL16x8: - maxSh = 15; break; - case ARM64vecsh_USHR8x16: case ARM64vecsh_SSHR8x16: - case ARM64vecsh_SHL8x16: - maxSh = 7; break; - /* Whereas for these shift right and narrow set, the min shift - value really is 1. 
*/ - case ARM64vecsh_UQSHRN2SD: case ARM64vecsh_SQSHRN2SD: + case ARM64vecsh_UQSHRN2SD: case ARM64vecsh_SQSHRN2SD: case ARM64vecsh_SQSHRUN2SD: case ARM64vecsh_UQRSHRN2SD: case ARM64vecsh_SQRSHRN2SD: case ARM64vecsh_SQRSHRUN2SD: - maxSh = 64; break; - case ARM64vecsh_UQSHRN4HS: case ARM64vecsh_SQSHRN4HS: + minSh = 1; maxSh = 64; break; + case ARM64vecsh_SHL64x2: + case ARM64vecsh_UQSHL64x2: case ARM64vecsh_SQSHL64x2: + case ARM64vecsh_SQSHLU64x2: + minSh = 0; maxSh = 63; break; + case ARM64vecsh_USHR32x4: case ARM64vecsh_SSHR32x4: + case ARM64vecsh_UQSHRN4HS: case ARM64vecsh_SQSHRN4HS: case ARM64vecsh_SQSHRUN4HS: case ARM64vecsh_UQRSHRN4HS: case ARM64vecsh_SQRSHRN4HS: case ARM64vecsh_SQRSHRUN4HS: - maxSh = 32; break; - case ARM64vecsh_UQSHRN8BH: case ARM64vecsh_SQSHRN8BH: + minSh = 1; maxSh = 32; break; + case ARM64vecsh_SHL32x4: + case ARM64vecsh_UQSHL32x4: case ARM64vecsh_SQSHL32x4: + case ARM64vecsh_SQSHLU32x4: + minSh = 0; maxSh = 31; break; + case ARM64vecsh_USHR16x8: case ARM64vecsh_SSHR16x8: + case ARM64vecsh_UQSHRN8BH: case ARM64vecsh_SQSHRN8BH: case ARM64vecsh_SQSHRUN8BH: case ARM64vecsh_UQRSHRN8BH: case ARM64vecsh_SQRSHRN8BH: case ARM64vecsh_SQRSHRUN8BH: - maxSh = 16; break; + minSh = 1; maxSh = 16; break; + case ARM64vecsh_SHL16x8: + case ARM64vecsh_UQSHL16x8: case ARM64vecsh_SQSHL16x8: + case ARM64vecsh_SQSHLU16x8: + minSh = 0; maxSh = 15; break; + case ARM64vecsh_USHR8x16: case ARM64vecsh_SSHR8x16: + minSh = 1; maxSh = 8; break; + case ARM64vecsh_SHL8x16: + case ARM64vecsh_UQSHL8x16: case ARM64vecsh_SQSHL8x16: + case ARM64vecsh_SQSHLU8x16: + minSh = 0; maxSh = 7; break; default: vassert(0); } vassert(maxSh > 0); - vassert(amt > 0 && amt <= maxSh); + vassert(amt >= minSh && amt <= maxSh); return i; } ARM64Instr* ARM64Instr_VExtV ( HReg dst, HReg srcLo, HReg srcHi, UInt amtB ) { @@ -3623,6 +3640,7 @@ static inline UChar qregNo ( HReg r ) #define X011001 BITS8(0,0, 0,1,1,0,0,1) #define X011010 BITS8(0,0, 0,1,1,0,1,0) #define X011011 BITS8(0,0, 0,1,1,0,1,1) +#define X011101 BITS8(0,0, 0,1,1,1,0,1) #define X011110 BITS8(0,0, 0,1,1,1,1,0) #define X011111 BITS8(0,0, 0,1,1,1,1,1) #define X100001 BITS8(0,0, 1,0,0,0,0,1) @@ -5888,6 +5906,7 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, 001 011110 immh immb 100111 n d UQRSHRN ,,#sh 000 011110 immh immb 100111 n d SQRSHRN ,,#sh 001 011110 immh immb 100011 n d SQRSHRUN ,,#sh + where immh:immb = case T of 2d | sh in 1..64 -> let xxxxxx = 64-sh in 1xxx:xxx @@ -5895,7 +5914,12 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, 8h | sh in 1..16 -> let xxxx = 16-sh in 001x:xxx 16b | sh in 1..8 -> let xxx = 8-sh in 0001:xxx - 010 011110 immh immb 010101 n d SHL Vd.T, Vn.T, #sh + 010 011110 immh immb 010101 n d SHL Vd.T, Vn.T, #sh + + 011 011110 immh immb 011101 n d UQSHL Vd.T, Vn.T, #sh + 010 011110 immh immb 011101 n d SQSHL Vd.T, Vn.T, #sh + 011 011110 immh immb 011001 n d SQSHLU Vd.T, Vn.T, #sh + where immh:immb = case T of 2d | sh in 0..63 -> let xxxxxx = sh in 1xxx:xxx @@ -5930,11 +5954,20 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, const UInt tmpl_SHL = X_3_6_7_6_5_5(X010, X011110, 0, X010101, vN, vD); + const UInt tmpl_UQSHL + = X_3_6_7_6_5_5(X011, X011110, 0, X011101, vN, vD); + const UInt tmpl_SQSHL + = X_3_6_7_6_5_5(X010, X011110, 0, X011101, vN, vD); + const UInt tmpl_SQSHLU + = X_3_6_7_6_5_5(X011, X011110, 0, X011001, vN, vD); + switch (i->ARM64in.VShiftImmV.op) { case ARM64vecsh_SSHR64x2: tmpl = tmpl_SSHR; goto right64x2; case ARM64vecsh_USHR64x2: tmpl = tmpl_USHR; goto right64x2; case ARM64vecsh_SHL64x2: tmpl = 
tmpl_SHL; goto left64x2; - + case ARM64vecsh_UQSHL64x2: tmpl = tmpl_UQSHL; goto left64x2; + case ARM64vecsh_SQSHL64x2: tmpl = tmpl_SQSHL; goto left64x2; + case ARM64vecsh_SQSHLU64x2: tmpl = tmpl_SQSHLU; goto left64x2; case ARM64vecsh_SSHR32x4: tmpl = tmpl_SSHR; goto right32x4; case ARM64vecsh_USHR32x4: tmpl = tmpl_USHR; goto right32x4; case ARM64vecsh_UQSHRN2SD: tmpl = tmpl_UQSHRN; goto right32x4; @@ -5944,7 +5977,9 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, case ARM64vecsh_SQRSHRN2SD: tmpl = tmpl_SQRSHRN; goto right32x4; case ARM64vecsh_SQRSHRUN2SD: tmpl = tmpl_SQRSHRUN; goto right32x4; case ARM64vecsh_SHL32x4: tmpl = tmpl_SHL; goto left32x4; - + case ARM64vecsh_UQSHL32x4: tmpl = tmpl_UQSHL; goto left32x4; + case ARM64vecsh_SQSHL32x4: tmpl = tmpl_SQSHL; goto left32x4; + case ARM64vecsh_SQSHLU32x4: tmpl = tmpl_SQSHLU; goto left32x4; case ARM64vecsh_SSHR16x8: tmpl = tmpl_SSHR; goto right16x8; case ARM64vecsh_USHR16x8: tmpl = tmpl_USHR; goto right16x8; case ARM64vecsh_UQSHRN4HS: tmpl = tmpl_UQSHRN; goto right16x8; @@ -5954,7 +5989,9 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, case ARM64vecsh_SQRSHRN4HS: tmpl = tmpl_SQRSHRN; goto right16x8; case ARM64vecsh_SQRSHRUN4HS: tmpl = tmpl_SQRSHRUN; goto right16x8; case ARM64vecsh_SHL16x8: tmpl = tmpl_SHL; goto left16x8; - + case ARM64vecsh_UQSHL16x8: tmpl = tmpl_UQSHL; goto left16x8; + case ARM64vecsh_SQSHL16x8: tmpl = tmpl_SQSHL; goto left16x8; + case ARM64vecsh_SQSHLU16x8: tmpl = tmpl_SQSHLU; goto left16x8; case ARM64vecsh_SSHR8x16: tmpl = tmpl_SSHR; goto right8x16; case ARM64vecsh_USHR8x16: tmpl = tmpl_USHR; goto right8x16; case ARM64vecsh_UQSHRN8BH: tmpl = tmpl_UQSHRN; goto right8x16; @@ -5964,6 +6001,9 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, case ARM64vecsh_SQRSHRN8BH: tmpl = tmpl_SQRSHRN; goto right8x16; case ARM64vecsh_SQRSHRUN8BH: tmpl = tmpl_SQRSHRUN; goto right8x16; case ARM64vecsh_SHL8x16: tmpl = tmpl_SHL; goto left8x16; + case ARM64vecsh_UQSHL8x16: tmpl = tmpl_UQSHL; goto left8x16; + case ARM64vecsh_SQSHL8x16: tmpl = tmpl_SQSHL; goto left8x16; + case ARM64vecsh_SQSHLU8x16: tmpl = tmpl_SQSHLU; goto left8x16; default: break; @@ -5993,25 +6033,25 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, break; left64x2: - if (sh >= 1 && sh <= 63) { + if (sh >= 0 && sh <= 63) { *p++ = tmpl | X_3_6_7_6_5_5(0,0, X1000000 | sh, 0,0,0); goto done; } break; left32x4: - if (sh >= 1 && sh <= 31) { + if (sh >= 0 && sh <= 31) { *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0100000 | sh, 0,0,0); goto done; } break; left16x8: - if (sh >= 1 && sh <= 15) { + if (sh >= 0 && sh <= 15) { *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0010000 | sh, 0,0,0); goto done; } break; left8x16: - if (sh >= 1 && sh <= 7) { + if (sh >= 0 && sh <= 7) { *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0001000 | sh, 0,0,0); goto done; } @@ -7024,6 +7064,12 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, HReg rD = i->ARM64in.VMov.dst; HReg rN = i->ARM64in.VMov.src; switch (i->ARM64in.VMov.szB) { + case 16: { + UInt dd = qregNo(rD); + UInt nn = qregNo(rN); + *p++ = X_3_8_5_6_5_5(X010, X01110101, nn, X000111, nn, dd); + goto done; + } case 8: { UInt dd = dregNo(rD); UInt nn = dregNo(rN); diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 868c120f04..8c07bdae1d 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -410,6 +410,13 @@ typedef ARM64vecsh_SQRSHRN2SD, ARM64vecsh_SQRSHRN4HS, ARM64vecsh_SQRSHRN8BH, ARM64vecsh_UQRSHRN2SD, ARM64vecsh_UQRSHRN4HS, ARM64vecsh_UQRSHRN8BH, ARM64vecsh_SQRSHRUN2SD, ARM64vecsh_SQRSHRUN4HS, 
ARM64vecsh_SQRSHRUN8BH, + /* Saturating left shifts, of various flavours. */ + ARM64vecsh_UQSHL64x2, ARM64vecsh_UQSHL32x4, + ARM64vecsh_UQSHL16x8, ARM64vecsh_UQSHL8x16, + ARM64vecsh_SQSHL64x2, ARM64vecsh_SQSHL32x4, + ARM64vecsh_SQSHL16x8, ARM64vecsh_SQSHL8x16, + ARM64vecsh_SQSHLU64x2, ARM64vecsh_SQSHLU32x4, + ARM64vecsh_SQSHLU16x8, ARM64vecsh_SQSHLU8x16, ARM64vecsh_INVALID } ARM64VecShiftOp; @@ -746,9 +753,10 @@ typedef HReg dst; // Q reg HReg src; // Q reg } VNarrowV; - /* Vector shift by immediate. |amt| needs to be > 0 and < - implied lane size of |op|. Zero shifts and out of range - shifts are not allowed. */ + /* Vector shift by immediate. For left shifts, |amt| must be + >= 0 and < implied lane size of |op|. For right shifts, + |amt| must be > 0 and <= implied lane size of |op|. Shifts + beyond these ranges are not allowed. */ struct { ARM64VecShiftOp op; HReg dst; diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index e22cd59312..71ee7a64cf 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -5604,68 +5604,105 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) //ZZ res, argL, size, True)); //ZZ return res; //ZZ } - case Iop_ShrN64x2: - case Iop_ShrN32x4: - case Iop_ShrN16x8: - case Iop_ShrN8x16: - case Iop_SarN64x2: - case Iop_SarN32x4: - case Iop_SarN16x8: - case Iop_SarN8x16: - case Iop_ShlN64x2: - case Iop_ShlN32x4: - case Iop_ShlN16x8: - case Iop_ShlN8x16: + case Iop_ShrN64x2: case Iop_ShrN32x4: + case Iop_ShrN16x8: case Iop_ShrN8x16: + case Iop_SarN64x2: case Iop_SarN32x4: + case Iop_SarN16x8: case Iop_SarN8x16: + case Iop_ShlN64x2: case Iop_ShlN32x4: + case Iop_ShlN16x8: case Iop_ShlN8x16: + case Iop_QShlN64x2: case Iop_QShlN32x4: + case Iop_QShlN16x8: case Iop_QShlN8x16: + case Iop_QSalN64x2: case Iop_QSalN32x4: + case Iop_QSalN16x8: case Iop_QSalN8x16: + case Iop_QShlN64Sx2: case Iop_QShlN32Sx4: + case Iop_QShlN16Sx8: case Iop_QShlN8Sx16: { IRExpr* argL = e->Iex.Binop.arg1; IRExpr* argR = e->Iex.Binop.arg2; if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) { UInt amt = argR->Iex.Const.con->Ico.U8; - UInt limit = 0; + UInt limLo = 0; + UInt limHi = 0; ARM64VecShiftOp op = ARM64vecsh_INVALID; + /* Establish the instruction to use. 
*/ switch (e->Iex.Binop.op) { - case Iop_ShrN64x2: - op = ARM64vecsh_USHR64x2; limit = 63; break; - case Iop_ShrN32x4: - op = ARM64vecsh_USHR32x4; limit = 31; break; - case Iop_ShrN16x8: - op = ARM64vecsh_USHR16x8; limit = 15; break; - case Iop_ShrN8x16: - op = ARM64vecsh_USHR8x16; limit = 7; break; - case Iop_SarN64x2: - op = ARM64vecsh_SSHR64x2; limit = 63; break; - case Iop_SarN32x4: - op = ARM64vecsh_SSHR32x4; limit = 31; break; - case Iop_SarN16x8: - op = ARM64vecsh_SSHR16x8; limit = 15; break; - case Iop_SarN8x16: - op = ARM64vecsh_SSHR8x16; limit = 7; break; - case Iop_ShlN64x2: - op = ARM64vecsh_SHL64x2; limit = 63; break; - case Iop_ShlN32x4: - op = ARM64vecsh_SHL32x4; limit = 31; break; - case Iop_ShlN16x8: - op = ARM64vecsh_SHL16x8; limit = 15; break; - case Iop_ShlN8x16: - op = ARM64vecsh_SHL8x16; limit = 7; break; - default: - vassert(0); + case Iop_ShrN64x2: op = ARM64vecsh_USHR64x2; break; + case Iop_ShrN32x4: op = ARM64vecsh_USHR32x4; break; + case Iop_ShrN16x8: op = ARM64vecsh_USHR16x8; break; + case Iop_ShrN8x16: op = ARM64vecsh_USHR8x16; break; + case Iop_SarN64x2: op = ARM64vecsh_SSHR64x2; break; + case Iop_SarN32x4: op = ARM64vecsh_SSHR32x4; break; + case Iop_SarN16x8: op = ARM64vecsh_SSHR16x8; break; + case Iop_SarN8x16: op = ARM64vecsh_SSHR8x16; break; + case Iop_ShlN64x2: op = ARM64vecsh_SHL64x2; break; + case Iop_ShlN32x4: op = ARM64vecsh_SHL32x4; break; + case Iop_ShlN16x8: op = ARM64vecsh_SHL16x8; break; + case Iop_ShlN8x16: op = ARM64vecsh_SHL8x16; break; + case Iop_QShlN64x2: op = ARM64vecsh_UQSHL64x2; break; + case Iop_QShlN32x4: op = ARM64vecsh_UQSHL32x4; break; + case Iop_QShlN16x8: op = ARM64vecsh_UQSHL16x8; break; + case Iop_QShlN8x16: op = ARM64vecsh_UQSHL8x16; break; + case Iop_QSalN64x2: op = ARM64vecsh_SQSHL64x2; break; + case Iop_QSalN32x4: op = ARM64vecsh_SQSHL32x4; break; + case Iop_QSalN16x8: op = ARM64vecsh_SQSHL16x8; break; + case Iop_QSalN8x16: op = ARM64vecsh_SQSHL8x16; break; + case Iop_QShlN64Sx2: op = ARM64vecsh_SQSHLU64x2; break; + case Iop_QShlN32Sx4: op = ARM64vecsh_SQSHLU32x4; break; + case Iop_QShlN16Sx8: op = ARM64vecsh_SQSHLU16x8; break; + case Iop_QShlN8Sx16: op = ARM64vecsh_SQSHLU8x16; break; + default: vassert(0); } - if (op != ARM64vecsh_INVALID && amt >= 0 && amt <= limit) { + /* Establish the shift limits, for sanity check purposes only. 
*/ + switch (e->Iex.Binop.op) { + case Iop_ShrN64x2: limLo = 1; limHi = 64; break; + case Iop_ShrN32x4: limLo = 1; limHi = 32; break; + case Iop_ShrN16x8: limLo = 1; limHi = 16; break; + case Iop_ShrN8x16: limLo = 1; limHi = 8; break; + case Iop_SarN64x2: limLo = 1; limHi = 64; break; + case Iop_SarN32x4: limLo = 1; limHi = 32; break; + case Iop_SarN16x8: limLo = 1; limHi = 16; break; + case Iop_SarN8x16: limLo = 1; limHi = 8; break; + case Iop_ShlN64x2: limLo = 0; limHi = 63; break; + case Iop_ShlN32x4: limLo = 0; limHi = 31; break; + case Iop_ShlN16x8: limLo = 0; limHi = 15; break; + case Iop_ShlN8x16: limLo = 0; limHi = 7; break; + case Iop_QShlN64x2: limLo = 0; limHi = 63; break; + case Iop_QShlN32x4: limLo = 0; limHi = 31; break; + case Iop_QShlN16x8: limLo = 0; limHi = 15; break; + case Iop_QShlN8x16: limLo = 0; limHi = 7; break; + case Iop_QSalN64x2: limLo = 0; limHi = 63; break; + case Iop_QSalN32x4: limLo = 0; limHi = 31; break; + case Iop_QSalN16x8: limLo = 0; limHi = 15; break; + case Iop_QSalN8x16: limLo = 0; limHi = 7; break; + case Iop_QShlN64Sx2: limLo = 0; limHi = 63; break; + case Iop_QShlN32Sx4: limLo = 0; limHi = 31; break; + case Iop_QShlN16Sx8: limLo = 0; limHi = 15; break; + case Iop_QShlN8Sx16: limLo = 0; limHi = 7; break; + default: vassert(0); + } + /* For left shifts, the allowable amt values are + 0 .. lane_bits-1. For right shifts the allowable + values are 1 .. lane_bits. */ + if (op != ARM64vecsh_INVALID && amt >= limLo && amt <= limHi) { HReg src = iselV128Expr(env, argL); HReg dst = newVRegV(env); - if (amt > 0) { - /* For left shifts, the allowable amt values are - 0 .. lane_bits-1. For right shifts the allowable - values are 1 .. lane_bits. By restricting it to - 1 .. lane_bits-1, we are guaranteed to create a - valid instruction. */ - addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt)); - } else { - dst = src; - } + addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt)); return dst; } + /* Special case some no-op shifts that the arm64 front end + throws at us. We can't generate any instructions for these, + but we don't need to either. */ + switch (e->Iex.Binop.op) { + case Iop_ShrN64x2: case Iop_ShrN32x4: + case Iop_ShrN16x8: case Iop_ShrN8x16: + if (amt == 0) { + return iselV128Expr(env, argL); + } + break; + default: + break; + } + /* otherwise unhandled */ } /* else fall out; this is unhandled */ break; -- 2.47.2