From: Julian Seward Date: Mon, 11 Aug 2014 14:02:47 +0000 (+0000) Subject: arm64: implement: X-Git-Tag: svn/VALGRIND_3_10_1^2~53 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9737edba6523b6714f6a9ec2a2ebb710606f4f55;p=thirdparty%2Fvalgrind.git arm64: implement: uqshrn{2}, sqrshrun{2}, sqshrun{2} (vector, imm) sqxtn{2}, uqxtn{2}, sqxtun{2} (vector and scalar) git-svn-id: svn://svn.valgrind.org/vex/trunk@2918 --- diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index b01c760807..5430bed507 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -809,8 +809,7 @@ static IROp mkVecQRDMULHIS ( UInt size ) { return ops[size]; } -static IROp mkVecQANDUQSH ( UInt size ) -{ +static IROp mkVecQANDUQSH ( UInt size ) { const IROp ops[4] = { Iop_QandUQsh8x16, Iop_QandUQsh16x8, Iop_QandUQsh32x4, Iop_QandUQsh64x2 }; @@ -818,8 +817,7 @@ static IROp mkVecQANDUQSH ( UInt size ) return ops[size]; } -static IROp mkVecQANDSQSH ( UInt size ) -{ +static IROp mkVecQANDSQSH ( UInt size ) { const IROp ops[4] = { Iop_QandSQsh8x16, Iop_QandSQsh16x8, Iop_QandSQsh32x4, Iop_QandSQsh64x2 }; @@ -827,8 +825,7 @@ static IROp mkVecQANDSQSH ( UInt size ) return ops[size]; } -static IROp mkVecQANDUQRSH ( UInt size ) -{ +static IROp mkVecQANDUQRSH ( UInt size ) { const IROp ops[4] = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8, Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 }; @@ -836,8 +833,7 @@ static IROp mkVecQANDUQRSH ( UInt size ) return ops[size]; } -static IROp mkVecQANDSQRSH ( UInt size ) -{ +static IROp mkVecQANDSQRSH ( UInt size ) { const IROp ops[4] = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8, Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 }; @@ -845,6 +841,86 @@ static IROp mkVecQANDSQRSH ( UInt size ) return ops[size]; } +static IROp mkVecNARROWUN ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4, + Iop_NarrowUn64to32x2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QNarrowUn16Sto8Ux8, Iop_QNarrowUn32Sto16Ux4, + Iop_QNarrowUn64Sto32Ux2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QNarrowUn16Sto8Sx8, Iop_QNarrowUn32Sto16Sx4, + Iop_QNarrowUn64Sto32Sx2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QNarrowUn16Uto8Ux8, Iop_QNarrowUn32Uto16Ux4, + Iop_QNarrowUn64Uto32Ux2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4, + Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QandQSarNnarrow16Sto8Sx8, Iop_QandQSarNnarrow32Sto16Sx4, + Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QandQSarNnarrow16Sto8Ux8, Iop_QandQSarNnarrow32Sto16Ux4, + Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QandQRShrNnarrow16Uto8Ux8, 
Iop_QandQRShrNnarrow32Uto16Ux4, + Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QandQRSarNnarrow16Sto8Sx8, Iop_QandQRSarNnarrow32Sto16Sx4, + Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + +static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) { + const IROp ops[4] + = { Iop_QandQRSarNnarrow16Sto8Ux8, Iop_QandQRSarNnarrow32Sto16Ux4, + Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID }; + vassert(sizeNarrow < 4); + return ops[sizeNarrow]; +} + /* Generate IR to create 'arg rotated right by imm', for sane values of 'ty' and 'imm'. */ static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm ) @@ -6137,6 +6213,22 @@ IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow ) } +/* Generate IR that takes an I64 and sign- or zero- widens each + lane, giving a V128 value. */ +static +IRTemp math_WIDEN_LANES ( Bool zWiden, UInt sizeNarrow, IRExpr* srcE ) +{ + IRTemp src = newTemp(Ity_I64); + assign(src, srcE); + return math_WIDEN_LO_OR_HI_LANES( + zWiden, + False/*!fromUpperHalf*/, + sizeNarrow, + binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)) + ); +} + + /* Return a temp which holds the vector dup of the lane of width (1 << size) obtained from src[laneNo]. */ static @@ -6328,6 +6420,8 @@ void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg, } +/* Zero all except the least significant lane of |srcE|, where |size| + indicates the lane size in the usual way. */ static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE ) { vassert(size < 4); @@ -7780,6 +7874,43 @@ Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (opcode == BITS5(1,0,1,0,0) + || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) { + /* -------- 0,xx,10100: SQXTN -------- */ + /* -------- 1,xx,10100: UQXTN -------- */ + /* -------- 1,xx,10010: SQXTUN -------- */ + if (size == X11) return False; + vassert(size < 3); + IROp opN = Iop_INVALID; + Bool zWiden = True; + const HChar* nm = "??"; + /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) { + opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False; + } + else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) { + opN = mkVecQNARROWUNUU(size); nm = "uqxtn"; + } + else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) { + opN = mkVecQNARROWUNSU(size); nm = "sqxtun"; + } + else vassert(0); + IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE( + size+1, getQReg128(nn)); + IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE( + size, unop(Iop_64UtoV128, unop(opN, mkexpr(src)))); + putQReg128(dd, mkexpr(resN)); + /* This widens zero lanes to zero, and compares it against zero, so all + of the non-participating lanes make no contribution to the + Q flag state. 
*/ + IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/, + size, mkexpr(resN)); + updateQCFLAGwithDifference(src, resW); + const HChar arrNarrow = "bhsd"[size]; + const HChar arrWide = "bhsd"[size+1]; + DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn); + return True; + } + # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) return False; # undef INSN @@ -8087,6 +8218,66 @@ Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1) + || (bitU == 1 + && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) { + /* -------- 0,10010 SQSHRN{,2} #imm -------- */ + /* -------- 1,10010 UQSHRN{,2} #imm -------- */ + /* -------- 0,10011 SQRSHRN{,2} #imm -------- */ + /* -------- 1,10011 UQRSHRN{,2} #imm -------- */ + /* -------- 1,10000 SQSHRUN{,2} #imm -------- */ + /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */ + UInt size = 0; + UInt shift = 0; + Bool is2 = bitQ == 1; + Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb); + if (!ok || size == X11) return False; + vassert(shift >= 1 && shift <= (8 << size)); + const HChar* nm = "??"; + IROp op = Iop_INVALID; + /* Decide on the name and the operation. */ + /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) { + nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size); + } + else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) { + nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size); + } + else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) { + nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size); + } + else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) { + nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size); + } + else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) { + nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size); + } + else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) { + nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size); + } + else vassert(0); + /* Compute the result (Q, shifted value) pair. */ + IRTemp src128 = newTempV128(); + assign(src128, getQReg128(nn)); + IRTemp pair = newTempV128(); + assign(pair, binop(op, mkexpr(src128), mkU8(shift))); + /* Update the result reg */ + IRTemp res64in128 = newTempV128(); + assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair))); + putLO64andZUorPutHI64(is2, dd, res64in128); + /* Update the Q flag. */ + IRTemp q64q64 = newTempV128(); + assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair))); + IRTemp z128 = newTempV128(); + assign(z128, mkV128(0x0000)); + updateQCFLAGwithDifference(q64q64, z128); + /* */ + const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); + const HChar* arrWide = nameArr_Q_SZ(1, size+1); + DIP("%s %s.%s, %s.%s, #%u\n", nm, + nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift); + return True; + } + if (opcode == BITS5(1,0,1,0,0)) { /* -------- 0,10100 SSHLL{,2} #imm -------- */ /* -------- 1,10100 USHLL{,2} #imm -------- */ @@ -9248,34 +9439,55 @@ Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn) if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) { /* -------- 0,xx,10010: XTN{,2} -------- */ - /* 31 28 23 21 15 9 4 XTN{,2} Vd.Tb, Vn.Ta - 0q0 01110 size 100001 001010 n d - */ - Bool isQ = bitQ == 1; - IROp op = Iop_INVALID; - const HChar* tb = NULL; - const HChar* ta = NULL; - switch ((size << 1) | (isQ ? 
1 : 0)) { - case 0: tb = "8b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; - case 1: tb = "16b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; - case 2: tb = "4h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; - case 3: tb = "8h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; - case 4: tb = "2s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; - case 5: tb = "4s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; - case 6: break; - case 7: break; - default: vassert(0); + if (size == X11) return False; + vassert(size < 3); + Bool is2 = bitQ == 1; + IROp opN = mkVecNARROWUN(size); + IRTemp resN = newTempV128(); + assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn)))); + putLO64andZUorPutHI64(is2, dd, resN); + const HChar* nm = "xtn"; + const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); + const HChar* arrWide = nameArr_Q_SZ(1, size+1); + DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm, + nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide); + return True; + } + + if (opcode == BITS5(1,0,1,0,0) + || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) { + /* -------- 0,xx,10100: SQXTN{,2} -------- */ + /* -------- 1,xx,10100: UQXTN{,2} -------- */ + /* -------- 1,xx,10010: SQXTUN{,2} -------- */ + if (size == X11) return False; + vassert(size < 3); + Bool is2 = bitQ == 1; + IROp opN = Iop_INVALID; + Bool zWiden = True; + const HChar* nm = "??"; + /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) { + opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False; } - if (op != Iop_INVALID) { - if (!isQ) { - putQRegLane(dd, 1, mkU64(0)); - } - putQRegLane(dd, isQ ? 1 : 0, unop(op, getQReg128(nn))); - DIP("xtn%s %s.%s, %s.%s\n", isQ ? "2" : "", - nameQReg128(dd), tb, nameQReg128(nn), ta); - return True; + else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) { + opN = mkVecQNARROWUNUU(size); nm = "uqxtn"; } - return False; + else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) { + opN = mkVecQNARROWUNSU(size); nm = "sqxtun"; + } + else vassert(0); + IRTemp src = newTempV128(); + assign(src, getQReg128(nn)); + IRTemp resN = newTempV128(); + assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src)))); + putLO64andZUorPutHI64(is2, dd, resN); + IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/, + size, mkexpr(resN)); + updateQCFLAGwithDifference(src, resW); + const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); + const HChar* arrWide = nameArr_Q_SZ(1, size+1); + DIP("%s%s %s.%s, %s.%s\n", is2 ? 
"2" : "", nm, + nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide); + return True; } if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) { diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index fc57794e24..a94a82467c 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -1007,19 +1007,47 @@ static void showARM64VecShiftOp(/*OUT*/const HChar** nm, ARM64VecShiftOp op ) { switch (op) { - case ARM64vecsh_USHR64x2: *nm = "ushr "; *ar = "2d"; return; - case ARM64vecsh_USHR32x4: *nm = "ushr "; *ar = "4s"; return; - case ARM64vecsh_USHR16x8: *nm = "ushr "; *ar = "8h"; return; - case ARM64vecsh_USHR8x16: *nm = "ushr "; *ar = "16b"; return; - case ARM64vecsh_SSHR64x2: *nm = "sshr "; *ar = "2d"; return; - case ARM64vecsh_SSHR32x4: *nm = "sshr "; *ar = "4s"; return; - case ARM64vecsh_SSHR16x8: *nm = "sshr "; *ar = "8h"; return; - case ARM64vecsh_SSHR8x16: *nm = "sshr "; *ar = "16b"; return; - case ARM64vecsh_SHL64x2: *nm = "shl "; *ar = "2d"; return; - case ARM64vecsh_SHL32x4: *nm = "shl "; *ar = "4s"; return; - case ARM64vecsh_SHL16x8: *nm = "shl "; *ar = "8h"; return; - case ARM64vecsh_SHL8x16: *nm = "shl "; *ar = "16b"; return; - default: vpanic("showARM64VecShiftImmOp"); + case ARM64vecsh_USHR64x2: *nm = "ushr "; *ar = "2d"; return; + case ARM64vecsh_USHR32x4: *nm = "ushr "; *ar = "4s"; return; + case ARM64vecsh_USHR16x8: *nm = "ushr "; *ar = "8h"; return; + case ARM64vecsh_USHR8x16: *nm = "ushr "; *ar = "16b"; return; + case ARM64vecsh_SSHR64x2: *nm = "sshr "; *ar = "2d"; return; + case ARM64vecsh_SSHR32x4: *nm = "sshr "; *ar = "4s"; return; + case ARM64vecsh_SSHR16x8: *nm = "sshr "; *ar = "8h"; return; + case ARM64vecsh_SSHR8x16: *nm = "sshr "; *ar = "16b"; return; + case ARM64vecsh_SHL64x2: *nm = "shl "; *ar = "2d"; return; + case ARM64vecsh_SHL32x4: *nm = "shl "; *ar = "4s"; return; + case ARM64vecsh_SHL16x8: *nm = "shl "; *ar = "8h"; return; + case ARM64vecsh_SHL8x16: *nm = "shl "; *ar = "16b"; return; + case ARM64vecsh_SQSHRN2SD: *nm = "sqshrn"; *ar = "2sd"; return; + case ARM64vecsh_SQSHRN4HS: *nm = "sqshrn"; *ar = "4hs"; return; + case ARM64vecsh_SQSHRN8BH: *nm = "sqshrn"; *ar = "8bh"; return; + case ARM64vecsh_UQSHRN2SD: *nm = "uqshrn"; *ar = "2sd"; return; + case ARM64vecsh_UQSHRN4HS: *nm = "uqshrn"; *ar = "4hs"; return; + case ARM64vecsh_UQSHRN8BH: *nm = "uqshrn"; *ar = "8bh"; return; + case ARM64vecsh_SQSHRUN2SD: *nm = "sqshrun"; *ar = "2sd"; return; + case ARM64vecsh_SQSHRUN4HS: *nm = "sqshrun"; *ar = "4hs"; return; + case ARM64vecsh_SQSHRUN8BH: *nm = "sqshrun"; *ar = "8bh"; return; + case ARM64vecsh_SQRSHRN2SD: *nm = "sqrshrn"; *ar = "2sd"; return; + case ARM64vecsh_SQRSHRN4HS: *nm = "sqrshrn"; *ar = "4hs"; return; + case ARM64vecsh_SQRSHRN8BH: *nm = "sqrshrn"; *ar = "8bh"; return; + case ARM64vecsh_UQRSHRN2SD: *nm = "uqrshrn"; *ar = "2sd"; return; + case ARM64vecsh_UQRSHRN4HS: *nm = "uqrshrn"; *ar = "4hs"; return; + case ARM64vecsh_UQRSHRN8BH: *nm = "uqrshrn"; *ar = "8bh"; return; + case ARM64vecsh_SQRSHRUN2SD: *nm = "sqrshrun"; *ar = "2sd"; return; + case ARM64vecsh_SQRSHRUN4HS: *nm = "sqrshrun"; *ar = "4hs"; return; + case ARM64vecsh_SQRSHRUN8BH: *nm = "sqrshrun"; *ar = "8bh"; return; + default: vpanic("showARM64VecShiftOp"); + } +} + +static const HChar* showARM64VecNarrowOp(ARM64VecNarrowOp op) { + switch (op) { + case ARM64vecna_XTN: return "xtn "; + case ARM64vecna_SQXTN: return "sqxtn "; + case ARM64vecna_UQXTN: return "uqxtn "; + case ARM64vecna_SQXTUN: return "sqxtun"; + default: vpanic("showARM64VecNarrowOp"); } } @@ -1716,9 +1744,11 
@@ ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg dst, HReg arg ) { i->ARM64in.VUnaryV.arg = arg; return i; } -ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ) { +ARM64Instr* ARM64Instr_VNarrowV ( ARM64VecNarrowOp op, + UInt dszBlg2, HReg dst, HReg src ) { ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); i->tag = ARM64in_VNarrowV; + i->ARM64in.VNarrowV.op = op; i->ARM64in.VNarrowV.dszBlg2 = dszBlg2; i->ARM64in.VNarrowV.dst = dst; i->ARM64in.VNarrowV.src = src; @@ -1735,6 +1765,12 @@ ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, i->ARM64in.VShiftImmV.amt = amt; UInt maxSh = 0; switch (op) { + /* NB: the comments below are wrong. Correct is: for right shifts, + the allowed shift amounts are 1 .. lane_size. For left shifts, + the allowed shift amoutns are 0 .. lane_size-1. */ + /* For these ordinary, non-saturating non-magical shifts, + the min shift value is actually zero, but we reject such cases + and instead only accept 1 as the minimum shift value. */ case ARM64vecsh_USHR64x2: case ARM64vecsh_SSHR64x2: case ARM64vecsh_SHL64x2: maxSh = 63; break; @@ -1747,6 +1783,23 @@ ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, case ARM64vecsh_USHR8x16: case ARM64vecsh_SSHR8x16: case ARM64vecsh_SHL8x16: maxSh = 7; break; + /* Whereas for these shift right and narrow set, the min shift + value really is 1. */ + case ARM64vecsh_UQSHRN2SD: case ARM64vecsh_SQSHRN2SD: + case ARM64vecsh_SQSHRUN2SD: + case ARM64vecsh_UQRSHRN2SD: case ARM64vecsh_SQRSHRN2SD: + case ARM64vecsh_SQRSHRUN2SD: + maxSh = 64; break; + case ARM64vecsh_UQSHRN4HS: case ARM64vecsh_SQSHRN4HS: + case ARM64vecsh_SQSHRUN4HS: + case ARM64vecsh_UQRSHRN4HS: case ARM64vecsh_SQRSHRN4HS: + case ARM64vecsh_SQRSHRUN4HS: + maxSh = 32; break; + case ARM64vecsh_UQSHRN8BH: case ARM64vecsh_SQSHRN8BH: + case ARM64vecsh_SQSHRUN8BH: + case ARM64vecsh_UQRSHRN8BH: case ARM64vecsh_SQRSHRN8BH: + case ARM64vecsh_SQRSHRUN8BH: + maxSh = 16; break; default: vassert(0); } @@ -2408,7 +2461,8 @@ void ppARM64Instr ( ARM64Instr* i ) { UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; const HChar* darr[3] = { "8b", "4h", "2s" }; const HChar* sarr[3] = { "8h", "4s", "2d" }; - vex_printf("xtn "); + const HChar* nm = showARM64VecNarrowOp(i->ARM64in.VNarrowV.op); + vex_printf("%s ", nm); ppHRegARM64(i->ARM64in.VNarrowV.dst); vex_printf(".%s, ", dszBlg2 < 3 ? 
darr[dszBlg2] : "??"); ppHRegARM64(i->ARM64in.VNarrowV.src); @@ -5783,114 +5837,185 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, 000 01110 00 1,00001 001010 n d XTN Vd.8b, Vn.8h 000 01110 01 1,00001 001010 n d XTN Vd.4h, Vn.4s 000 01110 10 1,00001 001010 n d XTN Vd.2s, Vn.2d + + 001 01110 00 1,00001 001010 n d SQXTUN Vd.8b, Vn.8h + 001 01110 01 1,00001 001010 n d SQXTUN Vd.4h, Vn.4s + 001 01110 10 1,00001 001010 n d SQXTUN Vd.2s, Vn.2d + + 000 01110 00 1,00001 010010 n d SQXTN Vd.8b, Vn.8h + 000 01110 01 1,00001 010010 n d SQXTN Vd.4h, Vn.4s + 000 01110 10 1,00001 010010 n d SQXTN Vd.2s, Vn.2d + + 001 01110 00 1,00001 010010 n d UQXTN Vd.8b, Vn.8h + 001 01110 01 1,00001 010010 n d UQXTN Vd.4h, Vn.4s + 001 01110 10 1,00001 010010 n d UQXTN Vd.2s, Vn.2d */ UInt vD = qregNo(i->ARM64in.VNarrowV.dst); UInt vN = qregNo(i->ARM64in.VNarrowV.src); UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; vassert(dszBlg2 >= 0 && dszBlg2 <= 2); - *p++ = X_3_8_5_6_5_5(X000, X01110001 | (dszBlg2 << 1), - X00001, X001010, vN, vD); - goto done; + switch (i->ARM64in.VNarrowV.op) { + case ARM64vecna_XTN: + *p++ = X_3_8_5_6_5_5(X000, X01110001 | (dszBlg2 << 1), + X00001, X001010, vN, vD); + goto done; + case ARM64vecna_SQXTUN: + *p++ = X_3_8_5_6_5_5(X001, X01110001 | (dszBlg2 << 1), + X00001, X001010, vN, vD); + goto done; + case ARM64vecna_SQXTN: + *p++ = X_3_8_5_6_5_5(X000, X01110001 | (dszBlg2 << 1), + X00001, X010010, vN, vD); + goto done; + case ARM64vecna_UQXTN: + *p++ = X_3_8_5_6_5_5(X001, X01110001 | (dszBlg2 << 1), + X00001, X010010, vN, vD); + goto done; + default: + break; + } + goto bad; } case ARM64in_VShiftImmV: { /* - 011 011110 immh immb 000001 n d USHR Vd.T, Vn.T, #sh - 010 011110 immh immb 000001 n d SSHR Vd.T, Vn.T, #sh + 011 011110 immh immb 000001 n d USHR Vd.T, Vn.T, #sh + 010 011110 immh immb 000001 n d SSHR Vd.T, Vn.T, #sh + + 001 011110 immh immb 100101 n d UQSHRN ,,#sh + 000 011110 immh immb 100101 n d SQSHRN ,,#sh + 001 011110 immh immb 100001 n d SQSHRUN ,,#sh + + 001 011110 immh immb 100111 n d UQRSHRN ,,#sh + 000 011110 immh immb 100111 n d SQRSHRN ,,#sh + 001 011110 immh immb 100011 n d SQRSHRUN ,,#sh where immh:immb = case T of - 2d | sh in 1..63 -> let xxxxxx = 64-sh in 1xxx:xxx - 4s | sh in 1..31 -> let xxxxx = 32-sh in 01xx:xxx - 8h | sh in 1..15 -> let xxxx = 16-sh in 001x:xxx - 16b | sh in 1..7 -> let xxx = 8-sh in 0001:xxx + 2d | sh in 1..64 -> let xxxxxx = 64-sh in 1xxx:xxx + 4s | sh in 1..32 -> let xxxxx = 32-sh in 01xx:xxx + 8h | sh in 1..16 -> let xxxx = 16-sh in 001x:xxx + 16b | sh in 1..8 -> let xxx = 8-sh in 0001:xxx 010 011110 immh immb 010101 n d SHL Vd.T, Vn.T, #sh where immh:immb = case T of - 2d | sh in 1..63 -> let xxxxxx = sh in 1xxx:xxx - 4s | sh in 1..31 -> let xxxxx = sh in 01xx:xxx - 8h | sh in 1..15 -> let xxxx = sh in 001x:xxx - 16b | sh in 1..7 -> let xxx = sh in 0001:xxx + 2d | sh in 0..63 -> let xxxxxx = sh in 1xxx:xxx + 4s | sh in 0..31 -> let xxxxx = sh in 01xx:xxx + 8h | sh in 0..15 -> let xxxx = sh in 001x:xxx + 16b | sh in 0..7 -> let xxx = sh in 0001:xxx */ - UInt vD = qregNo(i->ARM64in.VShiftImmV.dst); - UInt vN = qregNo(i->ARM64in.VShiftImmV.src); - UInt sh = i->ARM64in.VShiftImmV.amt; - ARM64VecShiftOp op = i->ARM64in.VShiftImmV.op; - Bool syned = False; - switch (op) { - /* 64x2 cases */ - case ARM64vecsh_SSHR64x2: syned = True; - case ARM64vecsh_USHR64x2: /* fallthrough */ + UInt vD = qregNo(i->ARM64in.VShiftImmV.dst); + UInt vN = qregNo(i->ARM64in.VShiftImmV.src); + UInt sh = i->ARM64in.VShiftImmV.amt; + UInt tmpl = 0; /* invalid */ + + 
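         /* Worked example (illustrative only; the switch below is the real
            encoder).  Every right-shift arm builds immh:immb as
            laneBits | (laneBits - sh), i.e. 2*laneBits - sh, where laneBits
            is the lane width of the matching row in the comment above (the
            narrow destination lane for the *QSHRN* forms) and sh is in
            1 .. laneBits.  For instance "uqshrn Vd.4h, Vn.4s, #5" is emitted
            as
               tmpl_UQSHRN | X_3_6_7_6_5_5(0,0, X0010000 | (16-5), 0,0,0)
            giving immh:immb = 0011:011, per the "8h | sh in 1..16" row. */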
const UInt tmpl_USHR + = X_3_6_7_6_5_5(X011, X011110, 0, X000001, vN, vD); + const UInt tmpl_SSHR + = X_3_6_7_6_5_5(X010, X011110, 0, X000001, vN, vD); + + const UInt tmpl_UQSHRN + = X_3_6_7_6_5_5(X001, X011110, 0, X100101, vN, vD); + const UInt tmpl_SQSHRN + = X_3_6_7_6_5_5(X000, X011110, 0, X100101, vN, vD); + const UInt tmpl_SQSHRUN + = X_3_6_7_6_5_5(X001, X011110, 0, X100001, vN, vD); + + const UInt tmpl_UQRSHRN + = X_3_6_7_6_5_5(X001, X011110, 0, X100111, vN, vD); + const UInt tmpl_SQRSHRN + = X_3_6_7_6_5_5(X000, X011110, 0, X100111, vN, vD); + const UInt tmpl_SQRSHRUN + = X_3_6_7_6_5_5(X001, X011110, 0, X100011, vN, vD); + + const UInt tmpl_SHL + = X_3_6_7_6_5_5(X010, X011110, 0, X010101, vN, vD); + + switch (i->ARM64in.VShiftImmV.op) { + case ARM64vecsh_SSHR64x2: tmpl = tmpl_SSHR; goto right64x2; + case ARM64vecsh_USHR64x2: tmpl = tmpl_USHR; goto right64x2; + case ARM64vecsh_SHL64x2: tmpl = tmpl_SHL; goto left64x2; + + case ARM64vecsh_SSHR32x4: tmpl = tmpl_SSHR; goto right32x4; + case ARM64vecsh_USHR32x4: tmpl = tmpl_USHR; goto right32x4; + case ARM64vecsh_UQSHRN2SD: tmpl = tmpl_UQSHRN; goto right32x4; + case ARM64vecsh_SQSHRN2SD: tmpl = tmpl_SQSHRN; goto right32x4; + case ARM64vecsh_SQSHRUN2SD: tmpl = tmpl_SQSHRUN; goto right32x4; + case ARM64vecsh_UQRSHRN2SD: tmpl = tmpl_UQRSHRN; goto right32x4; + case ARM64vecsh_SQRSHRN2SD: tmpl = tmpl_SQRSHRN; goto right32x4; + case ARM64vecsh_SQRSHRUN2SD: tmpl = tmpl_SQRSHRUN; goto right32x4; + case ARM64vecsh_SHL32x4: tmpl = tmpl_SHL; goto left32x4; + + case ARM64vecsh_SSHR16x8: tmpl = tmpl_SSHR; goto right16x8; + case ARM64vecsh_USHR16x8: tmpl = tmpl_USHR; goto right16x8; + case ARM64vecsh_UQSHRN4HS: tmpl = tmpl_UQSHRN; goto right16x8; + case ARM64vecsh_SQSHRN4HS: tmpl = tmpl_SQSHRN; goto right16x8; + case ARM64vecsh_SQSHRUN4HS: tmpl = tmpl_SQSHRUN; goto right16x8; + case ARM64vecsh_UQRSHRN4HS: tmpl = tmpl_UQRSHRN; goto right16x8; + case ARM64vecsh_SQRSHRN4HS: tmpl = tmpl_SQRSHRN; goto right16x8; + case ARM64vecsh_SQRSHRUN4HS: tmpl = tmpl_SQRSHRUN; goto right16x8; + case ARM64vecsh_SHL16x8: tmpl = tmpl_SHL; goto left16x8; + + case ARM64vecsh_SSHR8x16: tmpl = tmpl_SSHR; goto right8x16; + case ARM64vecsh_USHR8x16: tmpl = tmpl_USHR; goto right8x16; + case ARM64vecsh_UQSHRN8BH: tmpl = tmpl_UQSHRN; goto right8x16; + case ARM64vecsh_SQSHRN8BH: tmpl = tmpl_SQSHRN; goto right8x16; + case ARM64vecsh_SQSHRUN8BH: tmpl = tmpl_SQSHRUN; goto right8x16; + case ARM64vecsh_UQRSHRN8BH: tmpl = tmpl_UQRSHRN; goto right8x16; + case ARM64vecsh_SQRSHRN8BH: tmpl = tmpl_SQRSHRN; goto right8x16; + case ARM64vecsh_SQRSHRUN8BH: tmpl = tmpl_SQRSHRUN; goto right8x16; + case ARM64vecsh_SHL8x16: tmpl = tmpl_SHL; goto left8x16; + + default: break; + + right64x2: if (sh >= 1 && sh <= 63) { - UInt xxxxxx = 64-sh; - *p++ = X_3_6_7_6_5_5(syned ? X010 : X011, X011110, - X1000000 | xxxxxx, X000001, vN, vD); + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X1000000 | (64-sh), 0,0,0); goto done; } break; - case ARM64vecsh_SHL64x2: - if (sh >= 1 && sh <= 63) { - UInt xxxxxx = sh; - *p++ = X_3_6_7_6_5_5(X010, X011110, - X1000000 | xxxxxx, X010101, vN, vD); + right32x4: + if (sh >= 1 && sh <= 32) { + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0100000 | (32-sh), 0,0,0); goto done; } break; - /* 32x4 cases */ - case ARM64vecsh_SSHR32x4: syned = True; - case ARM64vecsh_USHR32x4: /* fallthrough */ - if (sh >= 1 && sh <= 31) { - UInt xxxxx = 32-sh; - *p++ = X_3_6_7_6_5_5(syned ? 
X010 : X011, X011110, - X0100000 | xxxxx, X000001, vN, vD); + right16x8: + if (sh >= 1 && sh <= 16) { + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0010000 | (16-sh), 0,0,0); goto done; } break; - case ARM64vecsh_SHL32x4: - if (sh >= 1 && sh <= 31) { - UInt xxxxx = sh; - *p++ = X_3_6_7_6_5_5(X010, X011110, - X0100000 | xxxxx, X010101, vN, vD); + right8x16: + if (sh >= 1 && sh <= 8) { + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0001000 | (8-sh), 0,0,0); goto done; } break; - /* 16x8 cases */ - case ARM64vecsh_SSHR16x8: syned = True; - case ARM64vecsh_USHR16x8: /* fallthrough */ - if (sh >= 1 && sh <= 15) { - UInt xxxx = 16-sh; - *p++ = X_3_6_7_6_5_5(syned ? X010 : X011, X011110, - X0010000 | xxxx, X000001, vN, vD); + + left64x2: + if (sh >= 1 && sh <= 63) { + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X1000000 | sh, 0,0,0); goto done; } break; - case ARM64vecsh_SHL16x8: - if (sh >= 1 && sh <= 15) { - UInt xxxx = sh; - *p++ = X_3_6_7_6_5_5(X010, X011110, - X0010000 | xxxx, X010101, vN, vD); + left32x4: + if (sh >= 1 && sh <= 31) { + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0100000 | sh, 0,0,0); goto done; } break; - /* 8x16 cases */ - case ARM64vecsh_SSHR8x16: syned = True; - case ARM64vecsh_USHR8x16: /* fallthrough */ - if (sh >= 1 && sh <= 7) { - UInt xxx = 8-sh; - *p++ = X_3_6_7_6_5_5(syned ? X010 : X011, X011110, - X0001000 | xxx, X000001, vN, vD); + left16x8: + if (sh >= 1 && sh <= 15) { + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0010000 | sh, 0,0,0); goto done; } break; - case ARM64vecsh_SHL8x16: + left8x16: if (sh >= 1 && sh <= 7) { - UInt xxx = sh; - *p++ = X_3_6_7_6_5_5(X010, X011110, - X0001000 | xxx, X010101, vN, vD); + *p++ = tmpl | X_3_6_7_6_5_5(0,0, X0001000 | sh, 0,0,0); goto done; } break; - default: - break; } goto bad; } diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index aaf839ff26..868c120f04 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -402,10 +402,28 @@ typedef ARM64vecsh_SSHR16x8, ARM64vecsh_SSHR8x16, ARM64vecsh_SHL64x2, ARM64vecsh_SHL32x4, ARM64vecsh_SHL16x8, ARM64vecsh_SHL8x16, + /* These narrowing shifts zero out the top half of the destination + register. */ + ARM64vecsh_SQSHRN2SD, ARM64vecsh_SQSHRN4HS, ARM64vecsh_SQSHRN8BH, + ARM64vecsh_UQSHRN2SD, ARM64vecsh_UQSHRN4HS, ARM64vecsh_UQSHRN8BH, + ARM64vecsh_SQSHRUN2SD, ARM64vecsh_SQSHRUN4HS, ARM64vecsh_SQSHRUN8BH, + ARM64vecsh_SQRSHRN2SD, ARM64vecsh_SQRSHRN4HS, ARM64vecsh_SQRSHRN8BH, + ARM64vecsh_UQRSHRN2SD, ARM64vecsh_UQRSHRN4HS, ARM64vecsh_UQRSHRN8BH, + ARM64vecsh_SQRSHRUN2SD, ARM64vecsh_SQRSHRUN4HS, ARM64vecsh_SQRSHRUN8BH, ARM64vecsh_INVALID } ARM64VecShiftOp; +typedef + enum { + ARM64vecna_XTN=400, + ARM64vecna_SQXTN, + ARM64vecna_UQXTN, + ARM64vecna_SQXTUN, + ARM64vecna_INVALID + } + ARM64VecNarrowOp; + typedef enum { /* baseline */ @@ -720,11 +738,13 @@ typedef HReg arg; } VUnaryV; /* vector narrowing, Q -> Q. Result goes in the bottom half - of dst and the top half is zeroed out. Iow is XTN. */ + of dst and the top half is zeroed out. Iow one of the + XTN family. */ struct { - UInt dszBlg2; // 0: 16to8_x8 1: 32to16_x4 2: 64to32_x2 - HReg dst; // Q reg - HReg src; // Q reg + ARM64VecNarrowOp op; + UInt dszBlg2; // 0: 16to8_x8 1: 32to16_x4 2: 64to32_x2 + HReg dst; // Q reg + HReg src; // Q reg } VNarrowV; /* Vector shift by immediate. |amt| needs to be > 0 and < implied lane size of |op|. 
Zero shifts and out of range @@ -836,7 +856,8 @@ extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ); extern ARM64Instr* ARM64Instr_FPSR ( Bool toFPSR, HReg iReg ); extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg, HReg ); -extern ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ); +extern ARM64Instr* ARM64Instr_VNarrowV ( ARM64VecNarrowOp op, UInt dszBlg2, + HReg dst, HReg src ); extern ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, HReg dst, HReg src, UInt amt ); extern ARM64Instr* ARM64Instr_VExtV ( HReg dst, diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index b7523a16ff..e22cd59312 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -2146,18 +2146,51 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) } case Iop_NarrowUn16to8x8: case Iop_NarrowUn32to16x4: - case Iop_NarrowUn64to32x2: { + case Iop_NarrowUn64to32x2: + case Iop_QNarrowUn16Sto8Sx8: + case Iop_QNarrowUn32Sto16Sx4: + case Iop_QNarrowUn64Sto32Sx2: + case Iop_QNarrowUn16Uto8Ux8: + case Iop_QNarrowUn32Uto16Ux4: + case Iop_QNarrowUn64Uto32Ux2: + case Iop_QNarrowUn16Sto8Ux8: + case Iop_QNarrowUn32Sto16Ux4: + case Iop_QNarrowUn64Sto32Ux2: + { HReg src = iselV128Expr(env, e->Iex.Unop.arg); HReg tmp = newVRegV(env); HReg dst = newVRegI(env); UInt dszBlg2 = 3; /* illegal */ + ARM64VecNarrowOp op = ARM64vecna_INVALID; switch (e->Iex.Unop.op) { - case Iop_NarrowUn16to8x8: dszBlg2 = 0; break; // 16to8_x8 - case Iop_NarrowUn32to16x4: dszBlg2 = 1; break; // 32to16_x4 - case Iop_NarrowUn64to32x2: dszBlg2 = 2; break; // 64to32_x2 - default: vassert(0); + case Iop_NarrowUn16to8x8: + dszBlg2 = 0; op = ARM64vecna_XTN; break; + case Iop_NarrowUn32to16x4: + dszBlg2 = 1; op = ARM64vecna_XTN; break; + case Iop_NarrowUn64to32x2: + dszBlg2 = 2; op = ARM64vecna_XTN; break; + case Iop_QNarrowUn16Sto8Sx8: + dszBlg2 = 0; op = ARM64vecna_SQXTN; break; + case Iop_QNarrowUn32Sto16Sx4: + dszBlg2 = 1; op = ARM64vecna_SQXTN; break; + case Iop_QNarrowUn64Sto32Sx2: + dszBlg2 = 2; op = ARM64vecna_SQXTN; break; + case Iop_QNarrowUn16Uto8Ux8: + dszBlg2 = 0; op = ARM64vecna_UQXTN; break; + case Iop_QNarrowUn32Uto16Ux4: + dszBlg2 = 1; op = ARM64vecna_UQXTN; break; + case Iop_QNarrowUn64Uto32Ux2: + dszBlg2 = 2; op = ARM64vecna_UQXTN; break; + case Iop_QNarrowUn16Sto8Ux8: + dszBlg2 = 0; op = ARM64vecna_SQXTUN; break; + case Iop_QNarrowUn32Sto16Ux4: + dszBlg2 = 1; op = ARM64vecna_SQXTUN; break; + case Iop_QNarrowUn64Sto32Ux2: + dszBlg2 = 2; op = ARM64vecna_SQXTUN; break; + default: + vassert(0); } - addInstr(env, ARM64Instr_VNarrowV(dszBlg2, tmp, src)); + addInstr(env, ARM64Instr_VNarrowV(op, dszBlg2, tmp, src)); addInstr(env, ARM64Instr_VXfromQ(dst, tmp, 0/*laneNo*/)); return dst; } @@ -4489,6 +4522,12 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) iselV256Expr(&vHi, &vLo, env, e->Iex.Unop.arg); return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo; } + case Iop_64UtoV128: { + HReg res = newVRegV(env); + HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg); + addInstr(env, ARM64Instr_VQfromX(res, arg)); + return res; + } //ZZ case Iop_NotV128: { //ZZ DECLARE_PATTERN(p_veqz_8x16); @@ -5616,6 +5655,11 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) HReg src = iselV128Expr(env, argL); HReg dst = newVRegV(env); if (amt > 0) { + /* For left shifts, the allowable amt values are + 0 .. lane_bits-1. For right shifts the allowable + values are 1 .. lane_bits. 
By restricting it to + 1 .. lane_bits-1, we are guaranteed to create a + valid instruction. */ addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt)); } else { dst = src; @@ -5627,6 +5671,118 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) break; } + /* uu */ + case Iop_QandQShrNnarrow16Uto8Ux8: + case Iop_QandQShrNnarrow32Uto16Ux4: + case Iop_QandQShrNnarrow64Uto32Ux2: + /* ss */ + case Iop_QandQSarNnarrow16Sto8Sx8: + case Iop_QandQSarNnarrow32Sto16Sx4: + case Iop_QandQSarNnarrow64Sto32Sx2: + /* su */ + case Iop_QandQSarNnarrow16Sto8Ux8: + case Iop_QandQSarNnarrow32Sto16Ux4: + case Iop_QandQSarNnarrow64Sto32Ux2: + /* ruu */ + case Iop_QandQRShrNnarrow16Uto8Ux8: + case Iop_QandQRShrNnarrow32Uto16Ux4: + case Iop_QandQRShrNnarrow64Uto32Ux2: + /* rss */ + case Iop_QandQRSarNnarrow16Sto8Sx8: + case Iop_QandQRSarNnarrow32Sto16Sx4: + case Iop_QandQRSarNnarrow64Sto32Sx2: + /* rsu */ + case Iop_QandQRSarNnarrow16Sto8Ux8: + case Iop_QandQRSarNnarrow32Sto16Ux4: + case Iop_QandQRSarNnarrow64Sto32Ux2: + { + IRExpr* argL = e->Iex.Binop.arg1; + IRExpr* argR = e->Iex.Binop.arg2; + if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) { + UInt amt = argR->Iex.Const.con->Ico.U8; + UInt limit = 0; + ARM64VecShiftOp op = ARM64vecsh_INVALID; + switch (e->Iex.Binop.op) { + /* uu */ + case Iop_QandQShrNnarrow64Uto32Ux2: + op = ARM64vecsh_UQSHRN2SD; limit = 64; break; + case Iop_QandQShrNnarrow32Uto16Ux4: + op = ARM64vecsh_UQSHRN4HS; limit = 32; break; + case Iop_QandQShrNnarrow16Uto8Ux8: + op = ARM64vecsh_UQSHRN8BH; limit = 16; break; + /* ss */ + case Iop_QandQSarNnarrow64Sto32Sx2: + op = ARM64vecsh_SQSHRN2SD; limit = 64; break; + case Iop_QandQSarNnarrow32Sto16Sx4: + op = ARM64vecsh_SQSHRN4HS; limit = 32; break; + case Iop_QandQSarNnarrow16Sto8Sx8: + op = ARM64vecsh_SQSHRN8BH; limit = 16; break; + /* su */ + case Iop_QandQSarNnarrow64Sto32Ux2: + op = ARM64vecsh_SQSHRUN2SD; limit = 64; break; + case Iop_QandQSarNnarrow32Sto16Ux4: + op = ARM64vecsh_SQSHRUN4HS; limit = 32; break; + case Iop_QandQSarNnarrow16Sto8Ux8: + op = ARM64vecsh_SQSHRUN8BH; limit = 16; break; + /* ruu */ + case Iop_QandQRShrNnarrow64Uto32Ux2: + op = ARM64vecsh_UQRSHRN2SD; limit = 64; break; + case Iop_QandQRShrNnarrow32Uto16Ux4: + op = ARM64vecsh_UQRSHRN4HS; limit = 32; break; + case Iop_QandQRShrNnarrow16Uto8Ux8: + op = ARM64vecsh_UQRSHRN8BH; limit = 16; break; + /* rss */ + case Iop_QandQRSarNnarrow64Sto32Sx2: + op = ARM64vecsh_SQRSHRN2SD; limit = 64; break; + case Iop_QandQRSarNnarrow32Sto16Sx4: + op = ARM64vecsh_SQRSHRN4HS; limit = 32; break; + case Iop_QandQRSarNnarrow16Sto8Sx8: + op = ARM64vecsh_SQRSHRN8BH; limit = 16; break; + /* rsu */ + case Iop_QandQRSarNnarrow64Sto32Ux2: + op = ARM64vecsh_SQRSHRUN2SD; limit = 64; break; + case Iop_QandQRSarNnarrow32Sto16Ux4: + op = ARM64vecsh_SQRSHRUN4HS; limit = 32; break; + case Iop_QandQRSarNnarrow16Sto8Ux8: + op = ARM64vecsh_SQRSHRUN8BH; limit = 16; break; + /**/ + default: + vassert(0); + } + if (op != ARM64vecsh_INVALID && amt >= 1 && amt <= limit) { + HReg src = iselV128Expr(env, argL); + HReg dst = newVRegV(env); + HReg fpsr = newVRegI(env); + /* Clear FPSR.Q, do the operation, and return both its + result and the new value of FPSR.Q. We can simply + zero out FPSR since all the other bits have no relevance + in VEX generated code. 
*/ + addInstr(env, ARM64Instr_Imm64(fpsr, 0)); + addInstr(env, ARM64Instr_FPSR(True/*toFPSR*/, fpsr)); + addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt)); + addInstr(env, ARM64Instr_FPSR(False/*!toFPSR*/, fpsr)); + addInstr(env, ARM64Instr_Shift(fpsr, fpsr, ARM64RI6_I6(27), + ARM64sh_SHR)); + ARM64RIL* ril_one = mb_mkARM64RIL_I(1); + vassert(ril_one); + addInstr(env, ARM64Instr_Logic(fpsr, + fpsr, ril_one, ARM64lo_AND)); + /* Now we have: the main (shift) result in the bottom half + of |dst|, and the Q bit at the bottom of |fpsr|. + Combining them with a "InterleaveLO64x2" style operation + produces a 128 bit value, dst[63:0]:fpsr[63:0], + which is what we want. */ + HReg scratch = newVRegV(env); + addInstr(env, ARM64Instr_VQfromX(scratch, fpsr)); + addInstr(env, ARM64Instr_VBinV(ARM64vecb_UZP164x2, + dst, dst, scratch)); + return dst; + } + } + /* else fall out; this is unhandled */ + break; + } + case Iop_ShlV128: case Iop_ShrV128: { Bool isSHR = e->Iex.Binop.op == Iop_ShrV128; @@ -6510,9 +6666,9 @@ static void iselV256Expr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, default: vassert(0); } /* Clear FPSR.Q, do the operation, and return both its result - and the new value of FPSR.Q. We can simply zero the whole - thing out since FPSR is essentially a scratch status register - on the host. */ + and the new value of FPSR.Q. We can simply zero out FPSR + since all the other bits have no relevance in VEX generated + code. */ addInstr(env, ARM64Instr_Imm64(fpsr, 0)); addInstr(env, ARM64Instr_FPSR(True/*toFPSR*/, fpsr)); addInstr(env, ARM64Instr_VBinV(op, resLo, argL, argR)); diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 6fa27ea611..ed23fd9415 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -920,6 +920,43 @@ void ppIROp ( IROp op ) case Iop_QandSQRsh32x4: vex_printf("QandSQRsh32x4"); return; case Iop_QandSQRsh64x2: vex_printf("QandSQRsh64x2"); return; + case Iop_QandQShrNnarrow16Uto8Ux8: + vex_printf("QandQShrNnarrow16Uto8Ux8"); return; + case Iop_QandQShrNnarrow32Uto16Ux4: + vex_printf("QandQShrNnarrow32Uto16Ux4"); return; + case Iop_QandQShrNnarrow64Uto32Ux2: + vex_printf("QandQShrNnarrow64Uto32Ux2"); return; + case Iop_QandQSarNnarrow16Sto8Sx8: + vex_printf("QandQSarNnarrow16Sto8Sx8"); return; + case Iop_QandQSarNnarrow32Sto16Sx4: + vex_printf("QandQSarNnarrow32Sto16Sx4"); return; + case Iop_QandQSarNnarrow64Sto32Sx2: + vex_printf("QandQSarNnarrow64Sto32Sx2"); return; + case Iop_QandQSarNnarrow16Sto8Ux8: + vex_printf("QandQSarNnarrow16Sto8Ux8"); return; + case Iop_QandQSarNnarrow32Sto16Ux4: + vex_printf("QandQSarNnarrow32Sto16Ux4"); return; + case Iop_QandQSarNnarrow64Sto32Ux2: + vex_printf("QandQSarNnarrow64Sto32Ux2"); return; + case Iop_QandQRShrNnarrow16Uto8Ux8: + vex_printf("QandQRShrNnarrow16Uto8Ux8"); return; + case Iop_QandQRShrNnarrow32Uto16Ux4: + vex_printf("QandQRShrNnarrow32Uto16Ux4"); return; + case Iop_QandQRShrNnarrow64Uto32Ux2: + vex_printf("QandQRShrNnarrow64Uto32Ux2"); return; + case Iop_QandQRSarNnarrow16Sto8Sx8: + vex_printf("QandQRSarNnarrow16Sto8Sx8"); return; + case Iop_QandQRSarNnarrow32Sto16Sx4: + vex_printf("QandQRSarNnarrow32Sto16Sx4"); return; + case Iop_QandQRSarNnarrow64Sto32Sx2: + vex_printf("QandQRSarNnarrow64Sto32Sx2"); return; + case Iop_QandQRSarNnarrow16Sto8Ux8: + vex_printf("QandQRSarNnarrow16Sto8Ux8"); return; + case Iop_QandQRSarNnarrow32Sto16Ux4: + vex_printf("QandQRSarNnarrow32Sto16Ux4"); return; + case Iop_QandQRSarNnarrow64Sto32Ux2: + vex_printf("QandQRSarNnarrow64Sto32Ux2"); return; + case Iop_NarrowBin16to8x16: 
vex_printf("NarrowBin16to8x16"); return; case Iop_NarrowBin32to16x8: vex_printf("NarrowBin32to16x8"); return; case Iop_QNarrowBin16Uto8Ux16: vex_printf("QNarrowBin16Uto8Ux16"); return; @@ -2953,6 +2990,24 @@ void typeOfPrimop ( IROp op, case Iop_QSalN8x16: case Iop_QSalN16x8: case Iop_QSalN32x4: case Iop_QSalN64x2: case Iop_SHA256: case Iop_SHA512: + case Iop_QandQShrNnarrow16Uto8Ux8: + case Iop_QandQShrNnarrow32Uto16Ux4: + case Iop_QandQShrNnarrow64Uto32Ux2: + case Iop_QandQSarNnarrow16Sto8Sx8: + case Iop_QandQSarNnarrow32Sto16Sx4: + case Iop_QandQSarNnarrow64Sto32Sx2: + case Iop_QandQSarNnarrow16Sto8Ux8: + case Iop_QandQSarNnarrow32Sto16Ux4: + case Iop_QandQSarNnarrow64Sto32Ux2: + case Iop_QandQRShrNnarrow16Uto8Ux8: + case Iop_QandQRShrNnarrow32Uto16Ux4: + case Iop_QandQRShrNnarrow64Uto32Ux2: + case Iop_QandQRSarNnarrow16Sto8Sx8: + case Iop_QandQRSarNnarrow32Sto16Sx4: + case Iop_QandQRSarNnarrow64Sto32Sx2: + case Iop_QandQRSarNnarrow16Sto8Ux8: + case Iop_QandQRSarNnarrow32Sto16Ux4: + case Iop_QandQRSarNnarrow64Sto32Ux2: BINARY(Ity_V128,Ity_I8, Ity_V128); case Iop_F32ToFixed32Ux4_RZ: diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 48a9911dc7..b494afa885 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1566,6 +1566,43 @@ typedef Iop_QandSQRsh8x16, Iop_QandSQRsh16x8, Iop_QandSQRsh32x4, Iop_QandSQRsh64x2, + /* VECTOR x SCALAR SATURATING (& MAYBE ROUNDING) NARROWING SHIFT RIGHT */ + /* All of type (V128, I8) -> V128 */ + /* The first argument is shifted right, then narrowed to half the width + by saturating it. The second argument is a scalar shift amount that + applies to all lanes, and must be a value in the range 1 to lane_width. + The shift may be done signedly (Sar variants) or unsignedly (Shr + variants). The saturation is done according to the two signedness + indicators at the end of the name. For example 64Sto32U means a + signed 64 bit value is saturated into an unsigned 32 bit value. + Additionally, the QRS variants do rounding, that is, they add the + value (1 << (shift_amount-1)) to each source lane before shifting. + + These operations return 65 bits: one bit ("Q") indicating whether + saturation occurred, and the shift result. The result type is V128, + of which the lower half is the shift result, and Q occupies the + least significant bit of the upper half. All other bits of the + upper half are zero. */ + // No rounding, sat U->U + Iop_QandQShrNnarrow16Uto8Ux8, + Iop_QandQShrNnarrow32Uto16Ux4, Iop_QandQShrNnarrow64Uto32Ux2, + // No rounding, sat S->S + Iop_QandQSarNnarrow16Sto8Sx8, + Iop_QandQSarNnarrow32Sto16Sx4, Iop_QandQSarNnarrow64Sto32Sx2, + // No rounding, sat S->U + Iop_QandQSarNnarrow16Sto8Ux8, + Iop_QandQSarNnarrow32Sto16Ux4, Iop_QandQSarNnarrow64Sto32Ux2, + + // Rounding, sat U->U + Iop_QandQRShrNnarrow16Uto8Ux8, + Iop_QandQRShrNnarrow32Uto16Ux4, Iop_QandQRShrNnarrow64Uto32Ux2, + // Rounding, sat S->S + Iop_QandQRSarNnarrow16Sto8Sx8, + Iop_QandQRSarNnarrow32Sto16Sx4, Iop_QandQRSarNnarrow64Sto32Sx2, + // Rounding, sat S->U + Iop_QandQRSarNnarrow16Sto8Ux8, + Iop_QandQRSarNnarrow32Sto16Ux4, Iop_QandQRSarNnarrow64Sto32Ux2, + /* NARROWING (binary) -- narrow 2xV128 into 1xV128, hi half from left arg */ /* See comments above w.r.t. U vs S issues in saturated narrowing. */
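
For reference, a minimal sketch of one way a front end can use these new ops, mirroring the dis_AdvSIMD_shift_by_immediate code above. The helper name and signature are hypothetical, and it assumes the usual IR construction helpers from guest_arm64_toIR.c (newTempV128, assign, binop, unop, mkexpr, mkU8):

   /* Hypothetical helper, for illustration only.  Narrows the four signed
      32-bit lanes of |src| to signed 16-bit lanes with a saturating
      arithmetic right shift by |sh| (1 .. 16), then splits the "65 bit"
      result described above into its data and Q parts. */
   static void sketch_narrowing_shift ( IRTemp src, UInt sh,
                                        /*OUT*/IRTemp* narrowed,
                                        /*OUT*/IRTemp* qDup )
   {
      vassert(sh >= 1 && sh <= 16);
      IRTemp pair = newTempV128();
      assign(pair, binop(Iop_QandQSarNnarrow32Sto16Sx4,
                         mkexpr(src), mkU8(sh)));
      /* The low 64 bits of |pair| hold the shifted, narrowed lanes. */
      *narrowed = newTempV128();
      assign(*narrowed, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
      /* Bit 64 of |pair| is the Q (saturation) bit; the rest of the upper
         half is zero.  Duplicating the upper half into both halves gives a
         value that is nonzero iff saturation occurred. */
      *qDup = newTempV128();
      assign(*qDup, binop(Iop_InterleaveHI64x2,
                          mkexpr(pair), mkexpr(pair)));
   }

The AArch64 decoder above then passes the duplicated Q halves to updateQCFLAGwithDifference against an all-zero V128, so that any saturation in the shift sets the sticky QC flag.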