From: Julian Seward Date: Mon, 4 Aug 2014 08:09:47 +0000 (+0000) Subject: arm64: add support for: sqshl, uqshl, sqrshl, uqrshl (reg) (vector and scalar) X-Git-Tag: svn/VALGRIND_3_10_1^2~58 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=93ceb74d09acba4a8801e154061c820c55c52fcf;p=thirdparty%2Fvalgrind.git arm64: add support for: sqshl, uqshl, sqrshl, uqrshl (reg) (vector and scalar) git-svn-id: svn://svn.valgrind.org/vex/trunk@2913 --- diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 44decc5f70..b01c760807 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -809,6 +809,42 @@ static IROp mkVecQRDMULHIS ( UInt size ) { return ops[size]; } +static IROp mkVecQANDUQSH ( UInt size ) +{ + const IROp ops[4] + = { Iop_QandUQsh8x16, Iop_QandUQsh16x8, + Iop_QandUQsh32x4, Iop_QandUQsh64x2 }; + vassert(size < 4); + return ops[size]; +} + +static IROp mkVecQANDSQSH ( UInt size ) +{ + const IROp ops[4] + = { Iop_QandSQsh8x16, Iop_QandSQsh16x8, + Iop_QandSQsh32x4, Iop_QandSQsh64x2 }; + vassert(size < 4); + return ops[size]; +} + +static IROp mkVecQANDUQRSH ( UInt size ) +{ + const IROp ops[4] + = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8, + Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 }; + vassert(size < 4); + return ops[size]; +} + +static IROp mkVecQANDSQRSH ( UInt size ) +{ + const IROp ops[4] + = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8, + Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 }; + vassert(size < 4); + return ops[size]; +} + /* Generate IR to create 'arg rotated right by imm', for sane values of 'ty' and 'imm'. */ static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm ) @@ -7503,8 +7539,7 @@ Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn) const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd") : (isU ? "uqsub" : "sqsub"); const HChar arr = "bhsd"[size]; - DIP("%s %s.%c, %s.%c, %s.%c\n", nm, - nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm); return True; } @@ -7542,6 +7577,41 @@ Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) { + /* -------- 0,xx,01001 SQSHL std4_std4_std4 -------- */ + /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */ + /* -------- 1,xx,01001 UQSHL std4_std4_std4 -------- */ + /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */ + Bool isU = bitU == 1; + Bool isR = opcode == BITS5(0,1,0,1,1); + IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size)) + : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size)); + /* This is a bit tricky. Since we're only interested in the lowest + lane of the result, we zero out all the rest in the operands, so + as to ensure that other lanes don't pollute the returned Q value. + This works because it means, for the lanes we don't care about, we + are shifting zero by zero, which can never saturate. */ + IRTemp res256 = newTemp(Ity_V256); + IRTemp resSH = newTempV128(); + IRTemp resQ = newTempV128(); + IRTemp zero = newTempV128(); + assign( + res256, + binop(op, + mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))), + mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm))))); + assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256))); + assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256))); + assign(zero, mkV128(0x0000)); + putQReg128(dd, mkexpr(resSH)); + updateQCFLAGwithDifference(resQ, zero); + const HChar* nm = isR ? (isU ? 
"uqrshl" : "sqrshl") + : (isU ? "uqshl" : "sqshl"); + const HChar arr = "bhsd"[size]; + DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm); + return True; + } + if (size == X11 && opcode == BITS5(1,0,0,0,0)) { /* -------- 0,11,10000 ADD d_d_d -------- */ /* -------- 1,11,10000 SUB d_d_d -------- */ @@ -8542,6 +8612,43 @@ Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) { + /* -------- 0,xx,01001 SQSHL std7_std7_std7 -------- */ + /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */ + /* -------- 1,xx,01001 UQSHL std7_std7_std7 -------- */ + /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */ + if (bitQ == 0 && size == X11) return False; // implied 1d case + Bool isU = bitU == 1; + Bool isR = opcode == BITS5(0,1,0,1,1); + IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size)) + : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size)); + /* This is a bit tricky. If we're only interested in the lowest 64 bits + of the result (viz, bitQ == 0), then we must adjust the operands to + ensure that the upper part of the result, that we don't care about, + doesn't pollute the returned Q value. To do this, zero out the upper + operand halves beforehand. This works because it means, for the + lanes we don't care about, we are shifting zero by zero, which can + never saturate. */ + IRTemp res256 = newTemp(Ity_V256); + IRTemp resSH = newTempV128(); + IRTemp resQ = newTempV128(); + IRTemp zero = newTempV128(); + assign(res256, binop(op, + math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)), + math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm)))); + assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256))); + assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256))); + assign(zero, mkV128(0x0000)); + putQReg128(dd, mkexpr(resSH)); + updateQCFLAGwithDifference(resQ, zero); + const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl") + : (isU ? 
"uqshl" : "sqshl"); + const HChar* arr = nameArr_Q_SZ(bitQ, size); + DIP("%s %s.%s, %s.%s, %s.%s\n", nm, + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) { /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */ /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */ diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index ef451a3409..fc57794e24 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -854,103 +854,119 @@ static const HChar* showARM64FpUnaryOp ( ARM64FpUnaryOp op ) { static void showARM64VecBinOp(/*OUT*/const HChar** nm, /*OUT*/const HChar** ar, ARM64VecBinOp op ) { switch (op) { - case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return; - case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return; - case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return; - case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return; - case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return; - case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return; - case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return; - case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return; - case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return; - case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return; - case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return; - case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return; - case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return; - case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return; - case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return; - case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return; - case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return; - case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return; - case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return; - case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return; - case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return; - case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return; - case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return; - case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return; - case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return; - case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return; - case ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return; - case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return; - case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return; - case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return; - case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return; - case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return; - case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return; - case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return; - case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return; - case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return; - case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return; - case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return; - case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return; - case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return; - case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return; - case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return; - case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return; - case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return; - case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return; - case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = 
"16b"; return; - case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return; - case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return; - case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return; - case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return; - case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return; - case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return; - case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return; - case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return; - case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return; - case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return; - case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return; - case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return; - case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return; - case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return; - case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return; - case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return; - case ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return; - case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return; - case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return; - case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return; - case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return; - case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return; - case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hbb"; return; - case ARM64vecb_UMULL2DSS: *nm = "umull"; *ar = "2dss"; return; - case ARM64vecb_UMULL4SHH: *nm = "umull"; *ar = "4shh"; return; - case ARM64vecb_UMULL8HBB: *nm = "umull"; *ar = "8hbb"; return; - case ARM64vecb_SMULL2DSS: *nm = "smull"; *ar = "2dss"; return; - case ARM64vecb_SMULL4SHH: *nm = "smull"; *ar = "4shh"; return; - case ARM64vecb_SMULL8HBB: *nm = "smull"; *ar = "8hbb"; return; - case ARM64vecb_SQADD64x2: *nm = "sqadd"; *ar = "2d"; return; - case ARM64vecb_SQADD32x4: *nm = "sqadd"; *ar = "4s"; return; - case ARM64vecb_SQADD16x8: *nm = "sqadd"; *ar = "8h"; return; - case ARM64vecb_SQADD8x16: *nm = "sqadd"; *ar = "16b"; return; - case ARM64vecb_UQADD64x2: *nm = "uqadd"; *ar = "2d"; return; - case ARM64vecb_UQADD32x4: *nm = "uqadd"; *ar = "4s"; return; - case ARM64vecb_UQADD16x8: *nm = "uqadd"; *ar = "8h"; return; - case ARM64vecb_UQADD8x16: *nm = "uqadd"; *ar = "16b"; return; - case ARM64vecb_SQSUB64x2: *nm = "sqsub"; *ar = "2d"; return; - case ARM64vecb_SQSUB32x4: *nm = "sqsub"; *ar = "4s"; return; - case ARM64vecb_SQSUB16x8: *nm = "sqsub"; *ar = "8h"; return; - case ARM64vecb_SQSUB8x16: *nm = "sqsub"; *ar = "16b"; return; - case ARM64vecb_UQSUB64x2: *nm = "uqsub"; *ar = "2d"; return; - case ARM64vecb_UQSUB32x4: *nm = "uqsub"; *ar = "4s"; return; - case ARM64vecb_UQSUB16x8: *nm = "uqsub"; *ar = "8h"; return; - case ARM64vecb_UQSUB8x16: *nm = "uqsub"; *ar = "16b"; return; - case ARM64vecb_SQDMULL2DSS: *nm = "sqdmull"; *ar = "2dss"; return; - case ARM64vecb_SQDMULL4SHH: *nm = "sqdmull"; *ar = "4shh"; return; - case ARM64vecb_SQDMULH32x4: *nm = "sqdmulh"; *ar = "4s"; return; - case ARM64vecb_SQDMULH16x8: *nm = "sqdmulh"; *ar = "8h"; return; - case ARM64vecb_SQRDMULH32x4: *nm = "sqrdmulh"; *ar = "4s"; return; - case ARM64vecb_SQRDMULH16x8: *nm = "sqrdmulh"; *ar = "8h"; return; + case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return; + case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return; + case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return; + case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return; + case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; 
return; + case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return; + case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return; + case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return; + case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return; + case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return; + case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return; + case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return; + case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return; + case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return; + case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return; + case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return; + case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return; + case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return; + case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return; + case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return; + case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return; + case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return; + case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return; + case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return; + case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return; + case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return; + case ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return; + case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return; + case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return; + case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return; + case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return; + case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return; + case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return; + case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return; + case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return; + case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return; + case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return; + case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return; + case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return; + case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return; + case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return; + case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return; + case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return; + case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return; + case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return; + case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = "16b"; return; + case ARM64vecb_FCMEQ64x2: *nm = "fcmeq "; *ar = "2d"; return; + case ARM64vecb_FCMEQ32x4: *nm = "fcmeq "; *ar = "4s"; return; + case ARM64vecb_FCMGE64x2: *nm = "fcmge "; *ar = "2d"; return; + case ARM64vecb_FCMGE32x4: *nm = "fcmge "; *ar = "4s"; return; + case ARM64vecb_FCMGT64x2: *nm = "fcmgt "; *ar = "2d"; return; + case ARM64vecb_FCMGT32x4: *nm = "fcmgt "; *ar = "4s"; return; + case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return; + case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return; + case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return; + case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return; + case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return; + case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return; + case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return; + case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return; + case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return; + case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return; + case 
ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return; + case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return; + case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return; + case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return; + case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return; + case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return; + case ARM64vecb_PMULL8x8: *nm = "pmull "; *ar = "8hbb"; return; + case ARM64vecb_UMULL2DSS: *nm = "umull "; *ar = "2dss"; return; + case ARM64vecb_UMULL4SHH: *nm = "umull "; *ar = "4shh"; return; + case ARM64vecb_UMULL8HBB: *nm = "umull "; *ar = "8hbb"; return; + case ARM64vecb_SMULL2DSS: *nm = "smull "; *ar = "2dss"; return; + case ARM64vecb_SMULL4SHH: *nm = "smull "; *ar = "4shh"; return; + case ARM64vecb_SMULL8HBB: *nm = "smull "; *ar = "8hbb"; return; + case ARM64vecb_SQADD64x2: *nm = "sqadd "; *ar = "2d"; return; + case ARM64vecb_SQADD32x4: *nm = "sqadd "; *ar = "4s"; return; + case ARM64vecb_SQADD16x8: *nm = "sqadd "; *ar = "8h"; return; + case ARM64vecb_SQADD8x16: *nm = "sqadd "; *ar = "16b"; return; + case ARM64vecb_UQADD64x2: *nm = "uqadd "; *ar = "2d"; return; + case ARM64vecb_UQADD32x4: *nm = "uqadd "; *ar = "4s"; return; + case ARM64vecb_UQADD16x8: *nm = "uqadd "; *ar = "8h"; return; + case ARM64vecb_UQADD8x16: *nm = "uqadd "; *ar = "16b"; return; + case ARM64vecb_SQSUB64x2: *nm = "sqsub "; *ar = "2d"; return; + case ARM64vecb_SQSUB32x4: *nm = "sqsub "; *ar = "4s"; return; + case ARM64vecb_SQSUB16x8: *nm = "sqsub "; *ar = "8h"; return; + case ARM64vecb_SQSUB8x16: *nm = "sqsub "; *ar = "16b"; return; + case ARM64vecb_UQSUB64x2: *nm = "uqsub "; *ar = "2d"; return; + case ARM64vecb_UQSUB32x4: *nm = "uqsub "; *ar = "4s"; return; + case ARM64vecb_UQSUB16x8: *nm = "uqsub "; *ar = "8h"; return; + case ARM64vecb_UQSUB8x16: *nm = "uqsub "; *ar = "16b"; return; + case ARM64vecb_SQDMULL2DSS: *nm = "sqdmull"; *ar = "2dss"; return; + case ARM64vecb_SQDMULL4SHH: *nm = "sqdmull"; *ar = "4shh"; return; + case ARM64vecb_SQDMULH32x4: *nm = "sqdmulh"; *ar = "4s"; return; + case ARM64vecb_SQDMULH16x8: *nm = "sqdmulh"; *ar = "8h"; return; + case ARM64vecb_SQRDMULH32x4: *nm = "sqrdmulh"; *ar = "4s"; return; + case ARM64vecb_SQRDMULH16x8: *nm = "sqrdmulh"; *ar = "8h"; return; + case ARM64vecb_SQSHL64x2: *nm = "sqshl "; *ar = "2d"; return; + case ARM64vecb_SQSHL32x4: *nm = "sqshl "; *ar = "4s"; return; + case ARM64vecb_SQSHL16x8: *nm = "sqshl "; *ar = "8h"; return; + case ARM64vecb_SQSHL8x16: *nm = "sqshl "; *ar = "16b"; return; + case ARM64vecb_UQSHL64x2: *nm = "uqshl "; *ar = "2d"; return; + case ARM64vecb_UQSHL32x4: *nm = "uqshl "; *ar = "4s"; return; + case ARM64vecb_UQSHL16x8: *nm = "uqshl "; *ar = "8h"; return; + case ARM64vecb_UQSHL8x16: *nm = "uqshl "; *ar = "16b"; return; + case ARM64vecb_SQRSHL64x2: *nm = "sqrshl"; *ar = "2d"; return; + case ARM64vecb_SQRSHL32x4: *nm = "sqrshl"; *ar = "4s"; return; + case ARM64vecb_SQRSHL16x8: *nm = "sqrshl"; *ar = "8h"; return; + case ARM64vecb_SQRSHL8x16: *nm = "sqrshl"; *ar = "16b"; return; + case ARM64vecb_UQRSHL64x2: *nm = "uqrshl"; *ar = "2d"; return; + case ARM64vecb_UQRSHL32x4: *nm = "uqrshl"; *ar = "4s"; return; + case ARM64vecb_UQRSHL16x8: *nm = "uqrshl"; *ar = "8h"; return; + case ARM64vecb_UQRSHL8x16: *nm = "uqrshl"; *ar = "16b"; return; default: vpanic("showARM64VecBinOp"); } } @@ -1675,6 +1691,13 @@ ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ) { i->ARM64in.FPCR.iReg = iReg; return i; } +ARM64Instr* ARM64Instr_FPSR ( Bool toFPSR, HReg iReg ) { + ARM64Instr* i = 
LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_FPSR; + i->ARM64in.FPSR.toFPSR = toFPSR; + i->ARM64in.FPSR.iReg = iReg; + return i; +} ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg dst, HReg argL, HReg argR ) { ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); @@ -1868,6 +1891,13 @@ ARM64Instr* ARM64Instr_VDfromX ( HReg rD, HReg rX ) { i->ARM64in.VDfromX.rX = rX; return i; } +ARM64Instr* ARM64Instr_VQfromX ( HReg rQ, HReg rXlo ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VQfromX; + i->ARM64in.VQfromX.rQ = rQ; + i->ARM64in.VQfromX.rXlo = rXlo; + return i; +} ARM64Instr* ARM64Instr_VQfromXX ( HReg rQ, HReg rXhi, HReg rXlo ) { ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); i->tag = ARM64in_VQfromXX; @@ -2340,11 +2370,21 @@ void ppARM64Instr ( ARM64Instr* i ) { vex_printf(", fpcr"); } return; + case ARM64in_FPSR: + if (i->ARM64in.FPSR.toFPSR) { + vex_printf("msr fpsr, "); + ppHRegARM64(i->ARM64in.FPSR.iReg); + } else { + vex_printf("mrs "); + ppHRegARM64(i->ARM64in.FPSR.iReg); + vex_printf(", fpsr"); + } + return; case ARM64in_VBinV: { const HChar* nm = "??"; const HChar* ar = "??"; showARM64VecBinOp(&nm, &ar, i->ARM64in.VBinV.op); - vex_printf("%s ", nm); + vex_printf("%s ", nm); ppHRegARM64(i->ARM64in.VBinV.dst); vex_printf(".%s, ", ar); ppHRegARM64(i->ARM64in.VBinV.argL); @@ -2557,6 +2597,12 @@ void ppARM64Instr ( ARM64Instr* i ) { vex_printf(", "); ppHRegARM64(i->ARM64in.VDfromX.rX); return; + case ARM64in_VQfromX: + vex_printf("fmov "); + ppHRegARM64(i->ARM64in.VQfromX.rQ); + vex_printf(".d[0], "); + ppHRegARM64(i->ARM64in.VQfromX.rXlo); + return; case ARM64in_VQfromXX: vex_printf("qFromXX "); ppHRegARM64(i->ARM64in.VQfromXX.rQ); @@ -2862,6 +2908,12 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, ARM64Instr* i, Bool mode64 ) else addHRegUse(u, HRmWrite, i->ARM64in.FPCR.iReg); return; + case ARM64in_FPSR: + if (i->ARM64in.FPSR.toFPSR) + addHRegUse(u, HRmRead, i->ARM64in.FPSR.iReg); + else + addHRegUse(u, HRmWrite, i->ARM64in.FPSR.iReg); + return; case ARM64in_VBinV: addHRegUse(u, HRmWrite, i->ARM64in.VBinV.dst); addHRegUse(u, HRmRead, i->ARM64in.VBinV.argL); @@ -2963,6 +3015,10 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, ARM64Instr* i, Bool mode64 ) addHRegUse(u, HRmWrite, i->ARM64in.VDfromX.rD); addHRegUse(u, HRmRead, i->ARM64in.VDfromX.rX); return; + case ARM64in_VQfromX: + addHRegUse(u, HRmWrite, i->ARM64in.VQfromX.rQ); + addHRegUse(u, HRmRead, i->ARM64in.VQfromX.rXlo); + return; case ARM64in_VQfromXX: addHRegUse(u, HRmWrite, i->ARM64in.VQfromXX.rQ); addHRegUse(u, HRmRead, i->ARM64in.VQfromXX.rXhi); @@ -3160,6 +3216,9 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) case ARM64in_FPCR: i->ARM64in.FPCR.iReg = lookupHRegRemap(m, i->ARM64in.FPCR.iReg); return; + case ARM64in_FPSR: + i->ARM64in.FPSR.iReg = lookupHRegRemap(m, i->ARM64in.FPSR.iReg); + return; case ARM64in_VBinV: i->ARM64in.VBinV.dst = lookupHRegRemap(m, i->ARM64in.VBinV.dst); i->ARM64in.VBinV.argL = lookupHRegRemap(m, i->ARM64in.VBinV.argL); @@ -3247,6 +3306,12 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) i->ARM64in.VDfromX.rX = lookupHRegRemap(m, i->ARM64in.VDfromX.rX); return; + case ARM64in_VQfromX: + i->ARM64in.VQfromX.rQ + = lookupHRegRemap(m, i->ARM64in.VQfromX.rQ); + i->ARM64in.VQfromX.rXlo + = lookupHRegRemap(m, i->ARM64in.VQfromX.rXlo); + return; case ARM64in_VQfromXX: i->ARM64in.VQfromXX.rQ = lookupHRegRemap(m, i->ARM64in.VQfromXX.rQ); @@ -3497,8 +3562,10 @@ static inline UChar qregNo ( HReg r ) #define X010000 
BITS8(0,0, 0,1,0,0,0,0) #define X010001 BITS8(0,0, 0,1,0,0,0,1) #define X010010 BITS8(0,0, 0,1,0,0,1,0) +#define X010011 BITS8(0,0, 0,1,0,0,1,1) #define X010101 BITS8(0,0, 0,1,0,1,0,1) #define X010110 BITS8(0,0, 0,1,0,1,1,0) +#define X010111 BITS8(0,0, 0,1,0,1,1,1) #define X011001 BITS8(0,0, 0,1,1,0,0,1) #define X011010 BITS8(0,0, 0,1,1,0,1,0) #define X011011 BITS8(0,0, 0,1,1,0,1,1) @@ -5076,6 +5143,18 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, } goto bad; // FPCR -> iReg case currently ATC } + case ARM64in_FPSR: { + Bool toFPSR = i->ARM64in.FPSR.toFPSR; + UInt iReg = iregNo(i->ARM64in.FPSR.iReg); + if (toFPSR) { + /* 0xD51B44 001 Rt MSR fpsr, rT */ + *p++ = 0xD51B4420 | (iReg & 0x1F); + } else { + /* 0xD53B44 001 Rt MRS rT, fpsr */ + *p++ = 0xD53B4420 | (iReg & 0x1F); + } + goto done; + } case ARM64in_VBinV: { /* 31 23 20 15 9 4 010 01110 11 1 m 100001 n d ADD Vd.2d, Vn.2d, Vm.2d @@ -5205,6 +5284,11 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, 010 01110 01 1 m 101101 n d SQDMULH Vd.8h, Vn.8h, Vm.8h 011 01110 10 1 m 101101 n d SQRDMULH Vd.4s, Vn.4s, Vm.4s 011 01110 10 1 m 101101 n d SQRDMULH Vd.8h, Vn.8h, Vm.8h + + 010 01110 sz 1 m 010011 n d SQSHL@sz Vd, Vn, Vm + 010 01110 sz 1 m 010111 n d SQRSHL@sz Vd, Vn, Vm + 011 01110 sz 1 m 010011 n d UQSHL@sz Vd, Vn, Vm + 011 01110 sz 1 m 010111 n d URQSHL@sz Vd, Vn, Vm */ UInt vD = qregNo(i->ARM64in.VBinV.dst); UInt vN = qregNo(i->ARM64in.VBinV.argL); @@ -5528,6 +5612,58 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X101101, vN, vD); break; + case ARM64vecb_SQSHL64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X010011, vN, vD); + break; + case ARM64vecb_SQSHL32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X010011, vN, vD); + break; + case ARM64vecb_SQSHL16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X010011, vN, vD); + break; + case ARM64vecb_SQSHL8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X010011, vN, vD); + break; + + case ARM64vecb_SQRSHL64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X010111, vN, vD); + break; + case ARM64vecb_SQRSHL32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X010111, vN, vD); + break; + case ARM64vecb_SQRSHL16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X010111, vN, vD); + break; + case ARM64vecb_SQRSHL8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X010111, vN, vD); + break; + + case ARM64vecb_UQSHL64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X010011, vN, vD); + break; + case ARM64vecb_UQSHL32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X010011, vN, vD); + break; + case ARM64vecb_UQSHL16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X010011, vN, vD); + break; + case ARM64vecb_UQSHL8x16: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010011, vN, vD); + break; + + case ARM64vecb_UQRSHL64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X010111, vN, vD); + break; + case ARM64vecb_UQRSHL32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X010111, vN, vD); + break; + case ARM64vecb_UQRSHL16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X010111, vN, vD); + break; + case ARM64vecb_UQRSHL8x16: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010111, vN, vD); + break; + default: goto bad; } @@ -6700,6 +6836,18 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, goto done; } + case ARM64in_VQfromX: { + /* FMOV D, X + 1001 1110 0110 0111 0000 00 nn dd FMOV Vd.D[0], Xn + I think this zeroes out the top half of the destination, which + is what we need. TODO: can we do VDfromX and VQfromXX better? 
*/ + UInt dd = qregNo(i->ARM64in.VQfromX.rQ); + UInt xx = iregNo(i->ARM64in.VQfromX.rXlo); + vassert(xx < 31); + *p++ = 0x9E670000 | X_2_6_2_12_5_5(0,0,0,0,xx,dd); + goto done; + } + case ARM64in_VQfromXX: { /* What we really generate is a two insn sequence: INS Vd.D[0], Xlo; INS Vd.D[1], Xhi diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index f2f5bea2b4..aaf839ff26 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -364,6 +364,14 @@ typedef ARM64vecb_SQDMULH16x8, ARM64vecb_SQRDMULH32x4, ARM64vecb_SQRDMULH16x8, + ARM64vecb_SQSHL64x2, ARM64vecb_SQSHL32x4, + ARM64vecb_SQSHL16x8, ARM64vecb_SQSHL8x16, + ARM64vecb_UQSHL64x2, ARM64vecb_UQSHL32x4, + ARM64vecb_UQSHL16x8, ARM64vecb_UQSHL8x16, + ARM64vecb_SQRSHL64x2, ARM64vecb_SQRSHL32x4, + ARM64vecb_SQRSHL16x8, ARM64vecb_SQRSHL8x16, + ARM64vecb_UQRSHL64x2, ARM64vecb_UQRSHL32x4, + ARM64vecb_UQRSHL16x8, ARM64vecb_UQRSHL8x16, ARM64vecb_INVALID } ARM64VecBinOp; @@ -438,6 +446,7 @@ typedef ARM64in_VCmpD, ARM64in_VCmpS, ARM64in_FPCR, + ARM64in_FPSR, /* ARM64in_V*V: vector ops on vector registers */ ARM64in_VBinV, ARM64in_VUnaryV, @@ -446,6 +455,7 @@ typedef ARM64in_VExtV, ARM64in_VImmQ, ARM64in_VDfromX, /* Move an Xreg to a Dreg */ + ARM64in_VQfromX, /* Move an Xreg to a Qreg lo64, and zero hi64 */ ARM64in_VQfromXX, /* Move 2 Xregs to a Qreg */ ARM64in_VXfromQ, /* Move half a Qreg to an Xreg */ ARM64in_VXfromDorS, /* Move Dreg or Sreg(ZX) to an Xreg */ @@ -691,6 +701,11 @@ typedef Bool toFPCR; HReg iReg; } FPCR; + /* Move a 32-bit value to/from the FPSR */ + struct { + Bool toFPSR; + HReg iReg; + } FPSR; /* binary vector operation on vector registers */ struct { ARM64VecBinOp op; @@ -734,6 +749,10 @@ typedef HReg rD; HReg rX; } VDfromX; + struct { + HReg rQ; + HReg rXlo; + } VQfromX; struct { HReg rQ; HReg rXhi; @@ -814,6 +833,7 @@ extern ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ); +extern ARM64Instr* ARM64Instr_FPSR ( Bool toFPSR, HReg iReg ); extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg, HReg ); extern ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ); @@ -823,6 +843,7 @@ extern ARM64Instr* ARM64Instr_VExtV ( HReg dst, HReg srcLo, HReg srcHi, UInt amtB ); extern ARM64Instr* ARM64Instr_VImmQ ( HReg, UShort ); extern ARM64Instr* ARM64Instr_VDfromX ( HReg rD, HReg rX ); +extern ARM64Instr* ARM64Instr_VQfromX ( HReg rQ, HReg rXlo ); extern ARM64Instr* ARM64Instr_VQfromXX( HReg rQ, HReg rXhi, HReg rXlo ); extern ARM64Instr* ARM64Instr_VXfromQ ( HReg rX, HReg rQ, UInt laneNo ); extern ARM64Instr* ARM64Instr_VXfromDorS ( HReg rX, HReg rDorS, Bool fromD ); diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 6f2d8bcd56..b7523a16ff 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -136,6 +136,16 @@ static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp ) return env->vregmap[tmp]; } +static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO, + ISelEnv* env, IRTemp tmp ) +{ + vassert(tmp >= 0); + vassert(tmp < env->n_vregmap); + vassert(! 
hregIsInvalid(env->vregmapHI[tmp])); + *vrLO = env->vregmap[tmp]; + *vrHI = env->vregmapHI[tmp]; +} + static void addInstr ( ISelEnv* env, ARM64Instr* instr ) { addHInstr(env->code, instr); @@ -230,24 +240,20 @@ static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e ); - -//ZZ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, -//ZZ ISelEnv* env, IRExpr* e ); -//ZZ static void iselInt64Expr ( HReg* rHi, HReg* rLo, -//ZZ ISelEnv* env, IRExpr* e ); - static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ); static HReg iselDblExpr ( ISelEnv* env, IRExpr* e ); static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ); static HReg iselFltExpr ( ISelEnv* env, IRExpr* e ); -//ZZ static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e ); -//ZZ static HReg iselNeon64Expr ( ISelEnv* env, IRExpr* e ); - static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ); static HReg iselV128Expr ( ISelEnv* env, IRExpr* e ); +static void iselV256Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ); +static void iselV256Expr ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ); + static ARM64RIL* mb_mkARM64RIL_I ( ULong imm64 ); @@ -4332,7 +4338,7 @@ static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, /*---------------------------------------------------------*/ -/*--- ISEL: Vector (NEON) expressions (128 bit) ---*/ +/*--- ISEL: Vector expressions (128 bit) ---*/ /*---------------------------------------------------------*/ static HReg iselV128Expr ( ISelEnv* env, IRExpr* e ) @@ -4389,7 +4395,7 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) if (e->tag == Iex_Unop) { - /* Iop_ZeroHIXXofV128 cases */ + /* Iop_ZeroHIXXofV128 cases */ UShort imm16 = 0; switch (e->Iex.Unop.op) { case Iop_ZeroHI64ofV128: imm16 = 0x00FF; break; @@ -4477,6 +4483,12 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) addInstr(env, ARM64Instr_VUnaryV(ARM64vecu_NOT, res, res)); return res; } + case Iop_V256toV128_0: + case Iop_V256toV128_1: { + HReg vHi, vLo; + iselV256Expr(&vHi, &vLo, env, e->Iex.Unop.arg); + return (e->Iex.Unop.op == Iop_V256toV128_1) ? 
vHi : vLo; + } //ZZ case Iop_NotV128: { //ZZ DECLARE_PATTERN(p_veqz_8x16); @@ -6424,6 +6436,111 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ) } +/*---------------------------------------------------------*/ +/*--- ISEL: Vector expressions (256 bit) ---*/ +/*---------------------------------------------------------*/ + +static void iselV256Expr ( /*OUT*/HReg* rHi, HReg* rLo, + ISelEnv* env, IRExpr* e ) +{ + iselV256Expr_wrk( rHi, rLo, env, e ); + vassert(hregClass(*rHi) == HRcVec128); + vassert(hregClass(*rLo) == HRcVec128); + vassert(hregIsVirtual(*rHi)); + vassert(hregIsVirtual(*rLo)); +} + +/* DO NOT CALL THIS DIRECTLY */ +static void iselV256Expr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, + ISelEnv* env, IRExpr* e ) +{ + vassert(e); + IRType ty = typeOfIRExpr(env->type_env,e); + vassert(ty == Ity_V256); + + /* read 256-bit IRTemp */ + if (e->tag == Iex_RdTmp) { + lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp); + return; + } + + if (e->tag == Iex_Binop) { + switch (e->Iex.Binop.op) { + + case Iop_QandSQsh64x2: + case Iop_QandSQsh32x4: + case Iop_QandSQsh16x8: + case Iop_QandSQsh8x16: + case Iop_QandUQsh64x2: + case Iop_QandUQsh32x4: + case Iop_QandUQsh16x8: + case Iop_QandUQsh8x16: + case Iop_QandSQRsh64x2: + case Iop_QandSQRsh32x4: + case Iop_QandSQRsh16x8: + case Iop_QandSQRsh8x16: + case Iop_QandUQRsh64x2: + case Iop_QandUQRsh32x4: + case Iop_QandUQRsh16x8: + case Iop_QandUQRsh8x16: + { + HReg argL = iselV128Expr(env, e->Iex.Binop.arg1); + HReg argR = iselV128Expr(env, e->Iex.Binop.arg2); + HReg fpsr = newVRegI(env); + HReg resHi = newVRegV(env); + HReg resLo = newVRegV(env); + ARM64VecBinOp op = ARM64vecb_INVALID; + switch (e->Iex.Binop.op) { + case Iop_QandSQsh64x2: op = ARM64vecb_SQSHL64x2; break; + case Iop_QandSQsh32x4: op = ARM64vecb_SQSHL32x4; break; + case Iop_QandSQsh16x8: op = ARM64vecb_SQSHL16x8; break; + case Iop_QandSQsh8x16: op = ARM64vecb_SQSHL8x16; break; + case Iop_QandUQsh64x2: op = ARM64vecb_UQSHL64x2; break; + case Iop_QandUQsh32x4: op = ARM64vecb_UQSHL32x4; break; + case Iop_QandUQsh16x8: op = ARM64vecb_UQSHL16x8; break; + case Iop_QandUQsh8x16: op = ARM64vecb_UQSHL8x16; break; + case Iop_QandSQRsh64x2: op = ARM64vecb_SQRSHL64x2; break; + case Iop_QandSQRsh32x4: op = ARM64vecb_SQRSHL32x4; break; + case Iop_QandSQRsh16x8: op = ARM64vecb_SQRSHL16x8; break; + case Iop_QandSQRsh8x16: op = ARM64vecb_SQRSHL8x16; break; + case Iop_QandUQRsh64x2: op = ARM64vecb_UQRSHL64x2; break; + case Iop_QandUQRsh32x4: op = ARM64vecb_UQRSHL32x4; break; + case Iop_QandUQRsh16x8: op = ARM64vecb_UQRSHL16x8; break; + case Iop_QandUQRsh8x16: op = ARM64vecb_UQRSHL8x16; break; + default: vassert(0); + } + /* Clear FPSR.Q, do the operation, and return both its result + and the new value of FPSR.Q. We can simply zero the whole + thing out since FPSR is essentially a scratch status register + on the host. */ + addInstr(env, ARM64Instr_Imm64(fpsr, 0)); + addInstr(env, ARM64Instr_FPSR(True/*toFPSR*/, fpsr)); + addInstr(env, ARM64Instr_VBinV(op, resLo, argL, argR)); + addInstr(env, ARM64Instr_FPSR(False/*!toFPSR*/, fpsr)); + addInstr(env, ARM64Instr_Shift(fpsr, fpsr, ARM64RI6_I6(27), + ARM64sh_SHR)); + ARM64RIL* ril_one = mb_mkARM64RIL_I(1); + vassert(ril_one); + addInstr(env, ARM64Instr_Logic(fpsr, fpsr, ril_one, ARM64lo_AND)); + /* Now we have: the main (shift) result in |resLo|, and the + Q bit at the bottom of |fpsr|. */ + addInstr(env, ARM64Instr_VQfromX(resHi, fpsr)); + *rHi = resHi; + *rLo = resLo; + return; + } + + /* ... 
*/ + default: + break; + } /* switch on the binop */ + } /* if (e->tag == Iex_Binop) */ + + ppIRExpr(e); + vpanic("iselV256Expr_wrk"); +} + + /*---------------------------------------------------------*/ /*--- ISEL: Statements ---*/ /*---------------------------------------------------------*/ @@ -6763,6 +6880,14 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) addInstr(env, ARM64Instr_VMov(16, dst, src)); return; } + if (ty == Ity_V256) { + HReg srcHi, srcLo, dstHi, dstLo; + iselV256Expr(&srcHi,&srcLo, env, stmt->Ist.WrTmp.data); + lookupIRTempPair( &dstHi, &dstLo, env, tmp); + addInstr(env, ARM64Instr_VMov(16, dstHi, srcHi)); + addInstr(env, ARM64Instr_VMov(16, dstLo, srcLo)); + return; + } break; } @@ -7155,6 +7280,10 @@ HInstrArray* iselSB_ARM64 ( IRSB* bb, case Ity_V128: hreg = mkHReg(j++, HRcVec128, True); break; + case Ity_V256: + hreg = mkHReg(j++, HRcVec128, True); + hregHI = mkHReg(j++, HRcVec128, True); + break; default: ppIRType(bb->tyenv->types[i]); vpanic("iselBB(arm64): IRTemp type"); diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 71c8f28e2d..6fa27ea611 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -903,6 +903,23 @@ void ppIROp ( IROp op ) case Iop_Rol32x4: vex_printf("Rol32x4"); return; case Iop_Rol64x2: vex_printf("Rol64x2"); return; + case Iop_QandUQsh8x16: vex_printf("QandUQsh8x16"); return; + case Iop_QandUQsh16x8: vex_printf("QandUQsh16x8"); return; + case Iop_QandUQsh32x4: vex_printf("QandUQsh32x4"); return; + case Iop_QandUQsh64x2: vex_printf("QandUQsh64x2"); return; + case Iop_QandSQsh8x16: vex_printf("QandSQsh8x16"); return; + case Iop_QandSQsh16x8: vex_printf("QandSQsh16x8"); return; + case Iop_QandSQsh32x4: vex_printf("QandSQsh32x4"); return; + case Iop_QandSQsh64x2: vex_printf("QandSQsh64x2"); return; + case Iop_QandUQRsh8x16: vex_printf("QandUQRsh8x16"); return; + case Iop_QandUQRsh16x8: vex_printf("QandUQRsh16x8"); return; + case Iop_QandUQRsh32x4: vex_printf("QandUQRsh32x4"); return; + case Iop_QandUQRsh64x2: vex_printf("QandUQRsh64x2"); return; + case Iop_QandSQRsh8x16: vex_printf("QandSQRsh8x16"); return; + case Iop_QandSQRsh16x8: vex_printf("QandSQRsh16x8"); return; + case Iop_QandSQRsh32x4: vex_printf("QandSQRsh32x4"); return; + case Iop_QandSQRsh64x2: vex_printf("QandSQRsh64x2"); return; + case Iop_NarrowBin16to8x16: vex_printf("NarrowBin16to8x16"); return; case Iop_NarrowBin32to16x8: vex_printf("NarrowBin32to16x8"); return; case Iop_QNarrowBin16Uto8Ux16: vex_printf("QNarrowBin16Uto8Ux16"); return; @@ -1037,16 +1054,16 @@ void ppIROp ( IROp op ) case Iop_DivD128: vex_printf("DivD128"); return; case Iop_ShlD128: vex_printf("ShlD128"); return; case Iop_ShrD128: vex_printf("ShrD128"); return; - case Iop_RoundD64toInt: vex_printf("Iop_RoundD64toInt"); return; - case Iop_RoundD128toInt: vex_printf("Iop_RoundD128toInt"); return; - case Iop_QuantizeD64: vex_printf("Iop_QuantizeD64"); return; - case Iop_QuantizeD128: vex_printf("Iop_QuantizeD128"); return; - case Iop_ExtractExpD64: vex_printf("Iop_ExtractExpD64"); return; - case Iop_ExtractExpD128: vex_printf("Iop_ExtractExpD128"); return; - case Iop_ExtractSigD64: vex_printf("Iop_ExtractSigD64"); return; - case Iop_ExtractSigD128: vex_printf("Iop_ExtractSigD128"); return; - case Iop_InsertExpD64: vex_printf("Iop_InsertExpD64"); return; - case Iop_InsertExpD128: vex_printf("Iop_InsertExpD128"); return; + case Iop_RoundD64toInt: vex_printf("RoundD64toInt"); return; + case Iop_RoundD128toInt: vex_printf("RoundD128toInt"); return; + case Iop_QuantizeD64: vex_printf("QuantizeD64"); 
return; + case Iop_QuantizeD128: vex_printf("QuantizeD128"); return; + case Iop_ExtractExpD64: vex_printf("ExtractExpD64"); return; + case Iop_ExtractExpD128: vex_printf("ExtractExpD128"); return; + case Iop_ExtractSigD64: vex_printf("ExtractSigD64"); return; + case Iop_ExtractSigD128: vex_printf("ExtractSigD128"); return; + case Iop_InsertExpD64: vex_printf("InsertExpD64"); return; + case Iop_InsertExpD128: vex_printf("InsertExpD128"); return; case Iop_CmpD64: vex_printf("CmpD64"); return; case Iop_CmpD128: vex_printf("CmpD128"); return; case Iop_CmpExpD64: vex_printf("CmpExpD64"); return; @@ -1054,9 +1071,9 @@ void ppIROp ( IROp op ) case Iop_D64HLtoD128: vex_printf("D64HLtoD128"); return; case Iop_D128HItoD64: vex_printf("D128HItoD64"); return; case Iop_D128LOtoD64: vex_printf("D128LOtoD64"); return; - case Iop_SignificanceRoundD64: vex_printf("Iop_SignificanceRoundD64"); + case Iop_SignificanceRoundD64: vex_printf("SignificanceRoundD64"); return; - case Iop_SignificanceRoundD128: vex_printf("Iop_SignificanceRoundD128"); + case Iop_SignificanceRoundD128: vex_printf("SignificanceRoundD128"); return; case Iop_ReinterpI64asD64: vex_printf("ReinterpI64asD64"); return; case Iop_ReinterpD64asI64: vex_printf("ReinterpD64asI64"); return; @@ -3260,6 +3277,14 @@ void typeOfPrimop ( IROp op, case Iop_V256toV128_1: case Iop_V256toV128_0: UNARY(Ity_V256, Ity_V128); + case Iop_QandUQsh8x16: case Iop_QandUQsh16x8: + case Iop_QandUQsh32x4: case Iop_QandUQsh64x2: + case Iop_QandSQsh8x16: case Iop_QandSQsh16x8: + case Iop_QandSQsh32x4: case Iop_QandSQsh64x2: + case Iop_QandUQRsh8x16: case Iop_QandUQRsh16x8: + case Iop_QandUQRsh32x4: case Iop_QandUQRsh64x2: + case Iop_QandSQRsh8x16: case Iop_QandSQRsh16x8: + case Iop_QandSQRsh32x4: case Iop_QandSQRsh64x2: case Iop_V128HLtoV256: BINARY(Ity_V128,Ity_V128, Ity_V256); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 8937b5e2b4..48a9911dc7 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1538,6 +1538,34 @@ typedef Iop_QShlN8x16, Iop_QShlN16x8, Iop_QShlN32x4, Iop_QShlN64x2, Iop_QSalN8x16, Iop_QSalN16x8, Iop_QSalN32x4, Iop_QSalN64x2, + /* VECTOR x VECTOR BIDIRECTIONAL SATURATING (& MAYBE ROUNDING) SHIFT */ + /* The least significant 8 bits of each lane of the second + operand are used as the shift amount, and interpreted signedly. + Positive values mean a shift left, negative a shift right. The + result is signedly or unsignedly saturated. There are also + rounding variants, which add 2^(shift_amount-1) to the value before + shifting, but only in the shift-right case. Vacated positions + are filled with zeroes. IOW, it's either SHR or SHL, but not SAR. + + These operations return 129 bits: one bit ("Q") indicating whether + saturation occurred, and the shift result. The result type is V256, + of which the lower V128 is the shift result, and Q occupies the + least significant bit of the upper V128. All other bits of the + upper V128 are zero. */ + // Unsigned saturation, no rounding + Iop_QandUQsh8x16, Iop_QandUQsh16x8, + Iop_QandUQsh32x4, Iop_QandUQsh64x2, + // Signed saturation, no rounding + Iop_QandSQsh8x16, Iop_QandSQsh16x8, + Iop_QandSQsh32x4, Iop_QandSQsh64x2, + + // Unsigned saturation, rounding + Iop_QandUQRsh8x16, Iop_QandUQRsh16x8, + Iop_QandUQRsh32x4, Iop_QandUQRsh64x2, + // Signed saturation, rounding + Iop_QandSQRsh8x16, Iop_QandSQRsh16x8, + Iop_QandSQRsh32x4, Iop_QandSQRsh64x2, + /* NARROWING (binary) -- narrow 2xV128 into 1xV128, hi half from left arg */ /* See comments above w.r.t. U vs S issues in saturated narrowing. */
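
As a rough, self-contained reference model of the per-lane semantics just described (an illustration written for this note, not code taken from VEX or from the ARM ARM), the C sketch below models one 32-bit lane of Iop_QandSQRsh32x4: the low byte of the shift operand is treated as a signed shift amount, right shifts are rounded by adding 2^(n-1) before shifting, and a Q flag records whether the result had to be saturated.

   #include <stdint.h>
   #include <stdio.h>

   /* Rough model of one 32-bit lane of Iop_QandSQRsh32x4 (signed,
      saturating, rounding, bidirectional shift).  The low 8 bits of the
      shift operand are a signed shift amount: positive = shift left,
      negative = rounding shift right.  *q is set to 1 if the result had
      to be saturated.  Assumes arithmetic >> on signed values. */
   static int32_t model_QandSQRsh32 ( int32_t x, int32_t shLane, int* q )
   {
      int     sh  = (int8_t)(shLane & 0xFF);
      int64_t acc = x;
      *q = 0;
      if (sh >= 32) {
         /* any nonzero value shifted left by >= 32 places must saturate */
         acc = (x == 0) ? 0 : (x > 0 ? (int64_t)INT32_MAX + 1
                                     : (int64_t)INT32_MIN - 1);
      } else if (sh >= 0) {
         acc = acc * ((int64_t)1 << sh);   /* |x| < 2^31, sh <= 31: fits in 64 bits */
      } else {
         int n = -sh;
         if (n > 32) n = 32;               /* rounded right shifts of >= 32 all give 0 */
         /* rounding: add 2^(n-1) before the right shift */
         acc = (acc + ((int64_t)1 << (n - 1))) >> n;
      }
      if (acc > INT32_MAX) { *q = 1; return INT32_MAX; }
      if (acc < INT32_MIN) { *q = 1; return INT32_MIN; }
      return (int32_t)acc;
   }

   int main ( void )
   {
      int q;
      printf("0x40000000 shl 1 -> 0x%08x, Q=%d\n",
             (uint32_t)model_QandSQRsh32(0x40000000, 1, &q), q);  /* saturates */
      printf("100 rshr 3       -> %d, Q=%d\n",
             model_QandSQRsh32(100, -3, &q), q);                  /* (100+4)>>3 = 13 */
      return 0;
   }

In the IR generated by the front end above, the V256 result of these ops is split with Iop_V256toV128_0/_1: the lower half is written to the destination register, and the upper half (Q in its least significant bit, all other bits zero) is compared against a zero vector by updateQCFLAGwithDifference.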
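
On the host side, iselV256Expr_wrk recovers that Q bit by zeroing FPSR, issuing the saturating shift, re-reading FPSR, and isolating bit 27 (the cumulative QC flag) with a shift right by 27 and an AND with 1. The aarch64-only sketch below demonstrates the same idea with NEON intrinsics; the helper names are invented for this example, and it assumes a GCC/Clang toolchain that does not move the intrinsic across the two FPSR accesses.

   /* Build on an aarch64 host only, e.g.: gcc -O0 qcdemo.c -o qcdemo */
   #include <arm_neon.h>
   #include <stdint.h>
   #include <stdio.h>

   static uint64_t read_fpsr ( void )
   {
      uint64_t v;
      __asm__ volatile("mrs %0, fpsr" : "=r"(v));
      return v;
   }

   static void write_fpsr ( uint64_t v )
   {
      __asm__ volatile("msr fpsr, %0" : : "r"(v));
   }

   int main ( void )
   {
      int32x4_t x  = vdupq_n_s32(0x40000000);
      int32x4_t sh = vdupq_n_s32(4);           /* left shift by 4: saturates */
      write_fpsr(0);                           /* clear the cumulative QC flag */
      int32x4_t r  = vqshlq_s32(x, sh);        /* SQSHL Vd.4s, Vn.4s, Vm.4s */
      int       q  = (int)((read_fpsr() >> 27) & 1);   /* FPSR.QC is bit 27 */
      printf("lane0 = 0x%08x, QC = %d\n", (unsigned)vgetq_lane_s32(r, 0), q);
      return 0;
   }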