return ops[size];
}
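+
+/* In the helpers below, |size| selects the lane width:
+   0 -> 8x16 lanes, 1 -> 16x8, 2 -> 32x4, 3 -> 64x2. */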
+static IROp mkVecQANDUQSH ( UInt size )
+{
+ const IROp ops[4]
+ = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
+ Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQANDSQSH ( UInt size )
+{
+ const IROp ops[4]
+ = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
+ Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQANDUQRSH ( UInt size )
+{
+ const IROp ops[4]
+ = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
+ Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQANDSQRSH ( UInt size )
+{
+ const IROp ops[4]
+ = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
+ Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
+ vassert(size < 4);
+ return ops[size];
+}
+
/* Generate IR to create 'arg rotated right by imm', for sane values
of 'ty' and 'imm'. */
static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
: (isU ? "uqsub" : "sqsub");
const HChar arr = "bhsd"[size];
- DIP("%s %s.%c, %s.%c, %s.%c\n", nm,
- nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
return True;
}
return True;
}
+ if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
+ /* -------- 0,xx,01001 SQSHL std4_std4_std4 -------- */
+ /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
+ /* -------- 1,xx,01001 UQSHL std4_std4_std4 -------- */
+ /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
+ Bool isU = bitU == 1;
+ Bool isR = opcode == BITS5(0,1,0,1,1);
+ IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
+ : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
+ /* This is a bit tricky. Since we're only interested in the lowest
+ lane of the result, we zero out all the rest in the operands, so
+ as to ensure that other lanes don't pollute the returned Q value.
+ This works because it means, for the lanes we don't care about, we
+ are shifting zero by zero, which can never saturate. */
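+ /* (Illustration: with size == X10, 32-bit lanes 1..3 of both operands
+ are zeroed, so those lanes shift zero by zero and can never set Q;
+ only lane 0 can.) */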
+ IRTemp res256 = newTemp(Ity_V256);
+ IRTemp resSH = newTempV128();
+ IRTemp resQ = newTempV128();
+ IRTemp zero = newTempV128();
+ assign(
+ res256,
+ binop(op,
+ mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
+ mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
+ assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
+ assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
+ assign(zero, mkV128(0x0000));
+ putQReg128(dd, mkexpr(resSH));
+ updateQCFLAGwithDifference(resQ, zero);
+ const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
+ : (isU ? "uqshl" : "sqshl");
+ const HChar arr = "bhsd"[size];
+ DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
+ return True;
+ }
+
if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
/* -------- 0,11,10000 ADD d_d_d -------- */
/* -------- 1,11,10000 SUB d_d_d -------- */
return True;
}
+ if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
+ /* -------- 0,xx,01001 SQSHL std7_std7_std7 -------- */
+ /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
+ /* -------- 1,xx,01001 UQSHL std7_std7_std7 -------- */
+ /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
+ if (bitQ == 0 && size == X11) return False; // implied 1d case
+ Bool isU = bitU == 1;
+ Bool isR = opcode == BITS5(0,1,0,1,1);
+ IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
+ : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
+ /* This is a bit tricky. If we're only interested in the lowest 64 bits
+ of the result (viz. bitQ == 0), then we must adjust the operands to
+ ensure that the upper part of the result, which we don't care about,
+ doesn't pollute the returned Q value. To do this, zero out the upper
+ operand halves beforehand. This works because it means, for the
+ lanes we don't care about, we are shifting zero by zero, which can
+ never saturate. */
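+ /* (Conversely, when bitQ == 1 the full 128-bit result is wanted, and
+ math_MAYBE_ZERO_HI64_fromE leaves the operands unchanged.) */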
+ IRTemp res256 = newTemp(Ity_V256);
+ IRTemp resSH = newTempV128();
+ IRTemp resQ = newTempV128();
+ IRTemp zero = newTempV128();
+ assign(res256, binop(op,
+ math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
+ math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
+ assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
+ assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
+ assign(zero, mkV128(0x0000));
+ putQReg128(dd, mkexpr(resSH));
+ updateQCFLAGwithDifference(resQ, zero);
+ const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
+ : (isU ? "uqshl" : "sqshl");
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+ DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
/* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
/* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
static void showARM64VecBinOp(/*OUT*/const HChar** nm,
/*OUT*/const HChar** ar, ARM64VecBinOp op ) {
switch (op) {
- case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return;
- case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return;
- case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return;
- case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return;
- case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return;
- case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return;
- case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return;
- case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return;
- case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return;
- case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return;
- case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return;
- case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return;
- case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return;
- case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return;
- case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return;
- case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return;
- case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return;
- case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return;
- case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return;
- case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return;
- case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return;
- case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return;
- case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return;
- case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return;
- case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return;
- case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return;
- case ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return;
- case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return;
- case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return;
- case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return;
- case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return;
- case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return;
- case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return;
- case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return;
- case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return;
- case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return;
- case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return;
- case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return;
- case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return;
- case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return;
- case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return;
- case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return;
- case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return;
- case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return;
- case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return;
- case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = "16b"; return;
- case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return;
- case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return;
- case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return;
- case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return;
- case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return;
- case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return;
- case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return;
- case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return;
- case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return;
- case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return;
- case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return;
- case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return;
- case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return;
- case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return;
- case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return;
- case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return;
- case ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return;
- case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return;
- case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return;
- case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return;
- case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return;
- case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return;
- case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hbb"; return;
- case ARM64vecb_UMULL2DSS: *nm = "umull"; *ar = "2dss"; return;
- case ARM64vecb_UMULL4SHH: *nm = "umull"; *ar = "4shh"; return;
- case ARM64vecb_UMULL8HBB: *nm = "umull"; *ar = "8hbb"; return;
- case ARM64vecb_SMULL2DSS: *nm = "smull"; *ar = "2dss"; return;
- case ARM64vecb_SMULL4SHH: *nm = "smull"; *ar = "4shh"; return;
- case ARM64vecb_SMULL8HBB: *nm = "smull"; *ar = "8hbb"; return;
- case ARM64vecb_SQADD64x2: *nm = "sqadd"; *ar = "2d"; return;
- case ARM64vecb_SQADD32x4: *nm = "sqadd"; *ar = "4s"; return;
- case ARM64vecb_SQADD16x8: *nm = "sqadd"; *ar = "8h"; return;
- case ARM64vecb_SQADD8x16: *nm = "sqadd"; *ar = "16b"; return;
- case ARM64vecb_UQADD64x2: *nm = "uqadd"; *ar = "2d"; return;
- case ARM64vecb_UQADD32x4: *nm = "uqadd"; *ar = "4s"; return;
- case ARM64vecb_UQADD16x8: *nm = "uqadd"; *ar = "8h"; return;
- case ARM64vecb_UQADD8x16: *nm = "uqadd"; *ar = "16b"; return;
- case ARM64vecb_SQSUB64x2: *nm = "sqsub"; *ar = "2d"; return;
- case ARM64vecb_SQSUB32x4: *nm = "sqsub"; *ar = "4s"; return;
- case ARM64vecb_SQSUB16x8: *nm = "sqsub"; *ar = "8h"; return;
- case ARM64vecb_SQSUB8x16: *nm = "sqsub"; *ar = "16b"; return;
- case ARM64vecb_UQSUB64x2: *nm = "uqsub"; *ar = "2d"; return;
- case ARM64vecb_UQSUB32x4: *nm = "uqsub"; *ar = "4s"; return;
- case ARM64vecb_UQSUB16x8: *nm = "uqsub"; *ar = "8h"; return;
- case ARM64vecb_UQSUB8x16: *nm = "uqsub"; *ar = "16b"; return;
- case ARM64vecb_SQDMULL2DSS: *nm = "sqdmull"; *ar = "2dss"; return;
- case ARM64vecb_SQDMULL4SHH: *nm = "sqdmull"; *ar = "4shh"; return;
- case ARM64vecb_SQDMULH32x4: *nm = "sqdmulh"; *ar = "4s"; return;
- case ARM64vecb_SQDMULH16x8: *nm = "sqdmulh"; *ar = "8h"; return;
- case ARM64vecb_SQRDMULH32x4: *nm = "sqrdmulh"; *ar = "4s"; return;
- case ARM64vecb_SQRDMULH16x8: *nm = "sqrdmulh"; *ar = "8h"; return;
+ case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return;
+ case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return;
+ case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return;
+ case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return;
+ case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return;
+ case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return;
+ case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return;
+ case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return;
+ case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return;
+ case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return;
+ case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return;
+ case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return;
+ case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return;
+ case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return;
+ case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return;
+ case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return;
+ case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return;
+ case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return;
+ case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return;
+ case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return;
+ case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return;
+ case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return;
+ case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return;
+ case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return;
+ case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return;
+ case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return;
+ case ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return;
+ case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return;
+ case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return;
+ case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return;
+ case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return;
+ case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return;
+ case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return;
+ case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return;
+ case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return;
+ case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return;
+ case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return;
+ case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return;
+ case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return;
+ case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return;
+ case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return;
+ case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return;
+ case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return;
+ case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return;
+ case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return;
+ case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = "16b"; return;
+ case ARM64vecb_FCMEQ64x2: *nm = "fcmeq "; *ar = "2d"; return;
+ case ARM64vecb_FCMEQ32x4: *nm = "fcmeq "; *ar = "4s"; return;
+ case ARM64vecb_FCMGE64x2: *nm = "fcmge "; *ar = "2d"; return;
+ case ARM64vecb_FCMGE32x4: *nm = "fcmge "; *ar = "4s"; return;
+ case ARM64vecb_FCMGT64x2: *nm = "fcmgt "; *ar = "2d"; return;
+ case ARM64vecb_FCMGT32x4: *nm = "fcmgt "; *ar = "4s"; return;
+ case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return;
+ case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return;
+ case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return;
+ case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return;
+ case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return;
+ case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return;
+ case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return;
+ case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return;
+ case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return;
+ case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return;
+ case ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return;
+ case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return;
+ case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return;
+ case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return;
+ case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return;
+ case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return;
+ case ARM64vecb_PMULL8x8: *nm = "pmull "; *ar = "8hbb"; return;
+ case ARM64vecb_UMULL2DSS: *nm = "umull "; *ar = "2dss"; return;
+ case ARM64vecb_UMULL4SHH: *nm = "umull "; *ar = "4shh"; return;
+ case ARM64vecb_UMULL8HBB: *nm = "umull "; *ar = "8hbb"; return;
+ case ARM64vecb_SMULL2DSS: *nm = "smull "; *ar = "2dss"; return;
+ case ARM64vecb_SMULL4SHH: *nm = "smull "; *ar = "4shh"; return;
+ case ARM64vecb_SMULL8HBB: *nm = "smull "; *ar = "8hbb"; return;
+ case ARM64vecb_SQADD64x2: *nm = "sqadd "; *ar = "2d"; return;
+ case ARM64vecb_SQADD32x4: *nm = "sqadd "; *ar = "4s"; return;
+ case ARM64vecb_SQADD16x8: *nm = "sqadd "; *ar = "8h"; return;
+ case ARM64vecb_SQADD8x16: *nm = "sqadd "; *ar = "16b"; return;
+ case ARM64vecb_UQADD64x2: *nm = "uqadd "; *ar = "2d"; return;
+ case ARM64vecb_UQADD32x4: *nm = "uqadd "; *ar = "4s"; return;
+ case ARM64vecb_UQADD16x8: *nm = "uqadd "; *ar = "8h"; return;
+ case ARM64vecb_UQADD8x16: *nm = "uqadd "; *ar = "16b"; return;
+ case ARM64vecb_SQSUB64x2: *nm = "sqsub "; *ar = "2d"; return;
+ case ARM64vecb_SQSUB32x4: *nm = "sqsub "; *ar = "4s"; return;
+ case ARM64vecb_SQSUB16x8: *nm = "sqsub "; *ar = "8h"; return;
+ case ARM64vecb_SQSUB8x16: *nm = "sqsub "; *ar = "16b"; return;
+ case ARM64vecb_UQSUB64x2: *nm = "uqsub "; *ar = "2d"; return;
+ case ARM64vecb_UQSUB32x4: *nm = "uqsub "; *ar = "4s"; return;
+ case ARM64vecb_UQSUB16x8: *nm = "uqsub "; *ar = "8h"; return;
+ case ARM64vecb_UQSUB8x16: *nm = "uqsub "; *ar = "16b"; return;
+ case ARM64vecb_SQDMULL2DSS: *nm = "sqdmull"; *ar = "2dss"; return;
+ case ARM64vecb_SQDMULL4SHH: *nm = "sqdmull"; *ar = "4shh"; return;
+ case ARM64vecb_SQDMULH32x4: *nm = "sqdmulh"; *ar = "4s"; return;
+ case ARM64vecb_SQDMULH16x8: *nm = "sqdmulh"; *ar = "8h"; return;
+ case ARM64vecb_SQRDMULH32x4: *nm = "sqrdmulh"; *ar = "4s"; return;
+ case ARM64vecb_SQRDMULH16x8: *nm = "sqrdmulh"; *ar = "8h"; return;
+ case ARM64vecb_SQSHL64x2: *nm = "sqshl "; *ar = "2d"; return;
+ case ARM64vecb_SQSHL32x4: *nm = "sqshl "; *ar = "4s"; return;
+ case ARM64vecb_SQSHL16x8: *nm = "sqshl "; *ar = "8h"; return;
+ case ARM64vecb_SQSHL8x16: *nm = "sqshl "; *ar = "16b"; return;
+ case ARM64vecb_UQSHL64x2: *nm = "uqshl "; *ar = "2d"; return;
+ case ARM64vecb_UQSHL32x4: *nm = "uqshl "; *ar = "4s"; return;
+ case ARM64vecb_UQSHL16x8: *nm = "uqshl "; *ar = "8h"; return;
+ case ARM64vecb_UQSHL8x16: *nm = "uqshl "; *ar = "16b"; return;
+ case ARM64vecb_SQRSHL64x2: *nm = "sqrshl"; *ar = "2d"; return;
+ case ARM64vecb_SQRSHL32x4: *nm = "sqrshl"; *ar = "4s"; return;
+ case ARM64vecb_SQRSHL16x8: *nm = "sqrshl"; *ar = "8h"; return;
+ case ARM64vecb_SQRSHL8x16: *nm = "sqrshl"; *ar = "16b"; return;
+ case ARM64vecb_UQRSHL64x2: *nm = "uqrshl"; *ar = "2d"; return;
+ case ARM64vecb_UQRSHL32x4: *nm = "uqrshl"; *ar = "4s"; return;
+ case ARM64vecb_UQRSHL16x8: *nm = "uqrshl"; *ar = "8h"; return;
+ case ARM64vecb_UQRSHL8x16: *nm = "uqrshl"; *ar = "16b"; return;
default: vpanic("showARM64VecBinOp");
}
}
i->ARM64in.FPCR.iReg = iReg;
return i;
}
+ARM64Instr* ARM64Instr_FPSR ( Bool toFPSR, HReg iReg ) {
+ ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
+ i->tag = ARM64in_FPSR;
+ i->ARM64in.FPSR.toFPSR = toFPSR;
+ i->ARM64in.FPSR.iReg = iReg;
+ return i;
+}
ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op,
HReg dst, HReg argL, HReg argR ) {
ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
i->ARM64in.VDfromX.rX = rX;
return i;
}
+ARM64Instr* ARM64Instr_VQfromX ( HReg rQ, HReg rXlo ) {
+ ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
+ i->tag = ARM64in_VQfromX;
+ i->ARM64in.VQfromX.rQ = rQ;
+ i->ARM64in.VQfromX.rXlo = rXlo;
+ return i;
+}
ARM64Instr* ARM64Instr_VQfromXX ( HReg rQ, HReg rXhi, HReg rXlo ) {
ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
i->tag = ARM64in_VQfromXX;
vex_printf(", fpcr");
}
return;
+ case ARM64in_FPSR:
+ if (i->ARM64in.FPSR.toFPSR) {
+ vex_printf("msr fpsr, ");
+ ppHRegARM64(i->ARM64in.FPSR.iReg);
+ } else {
+ vex_printf("mrs ");
+ ppHRegARM64(i->ARM64in.FPSR.iReg);
+ vex_printf(", fpsr");
+ }
+ return;
case ARM64in_VBinV: {
const HChar* nm = "??";
const HChar* ar = "??";
showARM64VecBinOp(&nm, &ar, i->ARM64in.VBinV.op);
- vex_printf("%s ", nm);
+ vex_printf("%s ", nm);
ppHRegARM64(i->ARM64in.VBinV.dst);
vex_printf(".%s, ", ar);
ppHRegARM64(i->ARM64in.VBinV.argL);
vex_printf(", ");
ppHRegARM64(i->ARM64in.VDfromX.rX);
return;
+ case ARM64in_VQfromX:
+ vex_printf("fmov ");
+ ppHRegARM64(i->ARM64in.VQfromX.rQ);
+ vex_printf(".d[0], ");
+ ppHRegARM64(i->ARM64in.VQfromX.rXlo);
+ return;
case ARM64in_VQfromXX:
vex_printf("qFromXX ");
ppHRegARM64(i->ARM64in.VQfromXX.rQ);
else
addHRegUse(u, HRmWrite, i->ARM64in.FPCR.iReg);
return;
+ case ARM64in_FPSR:
+ if (i->ARM64in.FPSR.toFPSR)
+ addHRegUse(u, HRmRead, i->ARM64in.FPSR.iReg);
+ else
+ addHRegUse(u, HRmWrite, i->ARM64in.FPSR.iReg);
+ return;
case ARM64in_VBinV:
addHRegUse(u, HRmWrite, i->ARM64in.VBinV.dst);
addHRegUse(u, HRmRead, i->ARM64in.VBinV.argL);
addHRegUse(u, HRmWrite, i->ARM64in.VDfromX.rD);
addHRegUse(u, HRmRead, i->ARM64in.VDfromX.rX);
return;
+ case ARM64in_VQfromX:
+ addHRegUse(u, HRmWrite, i->ARM64in.VQfromX.rQ);
+ addHRegUse(u, HRmRead, i->ARM64in.VQfromX.rXlo);
+ return;
case ARM64in_VQfromXX:
addHRegUse(u, HRmWrite, i->ARM64in.VQfromXX.rQ);
addHRegUse(u, HRmRead, i->ARM64in.VQfromXX.rXhi);
case ARM64in_FPCR:
i->ARM64in.FPCR.iReg = lookupHRegRemap(m, i->ARM64in.FPCR.iReg);
return;
+ case ARM64in_FPSR:
+ i->ARM64in.FPSR.iReg = lookupHRegRemap(m, i->ARM64in.FPSR.iReg);
+ return;
case ARM64in_VBinV:
i->ARM64in.VBinV.dst = lookupHRegRemap(m, i->ARM64in.VBinV.dst);
i->ARM64in.VBinV.argL = lookupHRegRemap(m, i->ARM64in.VBinV.argL);
i->ARM64in.VDfromX.rX
= lookupHRegRemap(m, i->ARM64in.VDfromX.rX);
return;
+ case ARM64in_VQfromX:
+ i->ARM64in.VQfromX.rQ
+ = lookupHRegRemap(m, i->ARM64in.VQfromX.rQ);
+ i->ARM64in.VQfromX.rXlo
+ = lookupHRegRemap(m, i->ARM64in.VQfromX.rXlo);
+ return;
case ARM64in_VQfromXX:
i->ARM64in.VQfromXX.rQ
= lookupHRegRemap(m, i->ARM64in.VQfromXX.rQ);
#define X010000 BITS8(0,0, 0,1,0,0,0,0)
#define X010001 BITS8(0,0, 0,1,0,0,0,1)
#define X010010 BITS8(0,0, 0,1,0,0,1,0)
+#define X010011 BITS8(0,0, 0,1,0,0,1,1)
#define X010101 BITS8(0,0, 0,1,0,1,0,1)
#define X010110 BITS8(0,0, 0,1,0,1,1,0)
+#define X010111 BITS8(0,0, 0,1,0,1,1,1)
#define X011001 BITS8(0,0, 0,1,1,0,0,1)
#define X011010 BITS8(0,0, 0,1,1,0,1,0)
#define X011011 BITS8(0,0, 0,1,1,0,1,1)
}
goto bad; // FPCR -> iReg case currently ATC
}
+ case ARM64in_FPSR: {
+ Bool toFPSR = i->ARM64in.FPSR.toFPSR;
+ UInt iReg = iregNo(i->ARM64in.FPSR.iReg);
+ if (toFPSR) {
+ /* 0xD51B44 001 Rt MSR fpsr, rT */
+ *p++ = 0xD51B4420 | (iReg & 0x1F);
+ } else {
+ /* 0xD53B44 001 Rt MRS rT, fpsr */
+ *p++ = 0xD53B4420 | (iReg & 0x1F);
+ }
+ goto done;
+ }
case ARM64in_VBinV: {
/* 31 23 20 15 9 4
010 01110 11 1 m 100001 n d ADD Vd.2d, Vn.2d, Vm.2d
010 01110 01 1 m 101101 n d SQDMULH Vd.8h, Vn.8h, Vm.8h
011 01110 10 1 m 101101 n d SQRDMULH Vd.4s, Vn.4s, Vm.4s
011 01110 01 1 m 101101 n d SQRDMULH Vd.8h, Vn.8h, Vm.8h
+
+ 010 01110 sz 1 m 010011 n d SQSHL@sz Vd, Vn, Vm
+ 010 01110 sz 1 m 010111 n d SQRSHL@sz Vd, Vn, Vm
+ 011 01110 sz 1 m 010011 n d UQSHL@sz Vd, Vn, Vm
+ 011 01110 sz 1 m 010111 n d UQRSHL@sz Vd, Vn, Vm
*/
UInt vD = qregNo(i->ARM64in.VBinV.dst);
UInt vN = qregNo(i->ARM64in.VBinV.argL);
*p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X101101, vN, vD);
break;
+ case ARM64vecb_SQSHL64x2:
+ *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X010011, vN, vD);
+ break;
+ case ARM64vecb_SQSHL32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X010011, vN, vD);
+ break;
+ case ARM64vecb_SQSHL16x8:
+ *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X010011, vN, vD);
+ break;
+ case ARM64vecb_SQSHL8x16:
+ *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X010011, vN, vD);
+ break;
+
+ case ARM64vecb_SQRSHL64x2:
+ *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X010111, vN, vD);
+ break;
+ case ARM64vecb_SQRSHL32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X010111, vN, vD);
+ break;
+ case ARM64vecb_SQRSHL16x8:
+ *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X010111, vN, vD);
+ break;
+ case ARM64vecb_SQRSHL8x16:
+ *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X010111, vN, vD);
+ break;
+
+ case ARM64vecb_UQSHL64x2:
+ *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X010011, vN, vD);
+ break;
+ case ARM64vecb_UQSHL32x4:
+ *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X010011, vN, vD);
+ break;
+ case ARM64vecb_UQSHL16x8:
+ *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X010011, vN, vD);
+ break;
+ case ARM64vecb_UQSHL8x16:
+ *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010011, vN, vD);
+ break;
+
+ case ARM64vecb_UQRSHL64x2:
+ *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X010111, vN, vD);
+ break;
+ case ARM64vecb_UQRSHL32x4:
+ *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X010111, vN, vD);
+ break;
+ case ARM64vecb_UQRSHL16x8:
+ *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X010111, vN, vD);
+ break;
+ case ARM64vecb_UQRSHL8x16:
+ *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X010111, vN, vD);
+ break;
+
default:
goto bad;
}
goto done;
}
+ case ARM64in_VQfromX: {
+ /* FMOV D, X
+ 1001 1110 0110 0111 0000 00 nn dd FMOV Vd.D[0], Xn
+ I think this zeroes out the top half of the destination, which
+ is what we need. TODO: can we do VDfromX and VQfromXX better? */
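+ /* (For what it's worth: this is the FMOV Dd, Xn encoding, and AArch64
+ writes to a D register zero bits [127:64] of the vector register, so
+ the upper half should indeed end up cleared.) */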
+ UInt dd = qregNo(i->ARM64in.VQfromX.rQ);
+ UInt xx = iregNo(i->ARM64in.VQfromX.rXlo);
+ vassert(xx < 31);
+ *p++ = 0x9E670000 | X_2_6_2_12_5_5(0,0,0,0,xx,dd);
+ goto done;
+ }
+
case ARM64in_VQfromXX: {
/* What we really generate is a two insn sequence:
INS Vd.D[0], Xlo; INS Vd.D[1], Xhi
ARM64vecb_SQDMULH16x8,
ARM64vecb_SQRDMULH32x4,
ARM64vecb_SQRDMULH16x8,
+ ARM64vecb_SQSHL64x2, ARM64vecb_SQSHL32x4,
+ ARM64vecb_SQSHL16x8, ARM64vecb_SQSHL8x16,
+ ARM64vecb_UQSHL64x2, ARM64vecb_UQSHL32x4,
+ ARM64vecb_UQSHL16x8, ARM64vecb_UQSHL8x16,
+ ARM64vecb_SQRSHL64x2, ARM64vecb_SQRSHL32x4,
+ ARM64vecb_SQRSHL16x8, ARM64vecb_SQRSHL8x16,
+ ARM64vecb_UQRSHL64x2, ARM64vecb_UQRSHL32x4,
+ ARM64vecb_UQRSHL16x8, ARM64vecb_UQRSHL8x16,
ARM64vecb_INVALID
}
ARM64VecBinOp;
ARM64in_VCmpD,
ARM64in_VCmpS,
ARM64in_FPCR,
+ ARM64in_FPSR,
/* ARM64in_V*V: vector ops on vector registers */
ARM64in_VBinV,
ARM64in_VUnaryV,
ARM64in_VExtV,
ARM64in_VImmQ,
ARM64in_VDfromX, /* Move an Xreg to a Dreg */
+ ARM64in_VQfromX, /* Move an Xreg to a Qreg lo64, and zero hi64 */
ARM64in_VQfromXX, /* Move 2 Xregs to a Qreg */
ARM64in_VXfromQ, /* Move half a Qreg to an Xreg */
ARM64in_VXfromDorS, /* Move Dreg or Sreg(ZX) to an Xreg */
Bool toFPCR;
HReg iReg;
} FPCR;
+ /* Move a 32-bit value to/from the FPSR */
+ struct {
+ Bool toFPSR;
+ HReg iReg;
+ } FPSR;
/* binary vector operation on vector registers */
struct {
ARM64VecBinOp op;
HReg rD;
HReg rX;
} VDfromX;
+ struct {
+ HReg rQ;
+ HReg rXlo;
+ } VQfromX;
struct {
HReg rQ;
HReg rXhi;
extern ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR );
extern ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR );
extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg );
+extern ARM64Instr* ARM64Instr_FPSR ( Bool toFPSR, HReg iReg );
extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg );
extern ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg, HReg );
extern ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src );
HReg srcLo, HReg srcHi, UInt amtB );
extern ARM64Instr* ARM64Instr_VImmQ ( HReg, UShort );
extern ARM64Instr* ARM64Instr_VDfromX ( HReg rD, HReg rX );
+extern ARM64Instr* ARM64Instr_VQfromX ( HReg rQ, HReg rXlo );
extern ARM64Instr* ARM64Instr_VQfromXX( HReg rQ, HReg rXhi, HReg rXlo );
extern ARM64Instr* ARM64Instr_VXfromQ ( HReg rX, HReg rQ, UInt laneNo );
extern ARM64Instr* ARM64Instr_VXfromDorS ( HReg rX, HReg rDorS, Bool fromD );
return env->vregmap[tmp];
}
+static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
+ ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ vassert(! hregIsInvalid(env->vregmapHI[tmp]));
+ *vrLO = env->vregmap[tmp];
+ *vrHI = env->vregmapHI[tmp];
+}
+
static void addInstr ( ISelEnv* env, ARM64Instr* instr )
{
addHInstr(env->code, instr);
static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
ISelEnv* env, IRExpr* e );
-
-//ZZ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
-//ZZ ISelEnv* env, IRExpr* e );
-//ZZ static void iselInt64Expr ( HReg* rHi, HReg* rLo,
-//ZZ ISelEnv* env, IRExpr* e );
-
static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg iselDblExpr ( ISelEnv* env, IRExpr* e );
static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg iselFltExpr ( ISelEnv* env, IRExpr* e );
-//ZZ static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e );
-//ZZ static HReg iselNeon64Expr ( ISelEnv* env, IRExpr* e );
-
static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e );
static HReg iselV128Expr ( ISelEnv* env, IRExpr* e );
+static void iselV256Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+static void iselV256Expr ( /*OUT*/HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+
static ARM64RIL* mb_mkARM64RIL_I ( ULong imm64 );
/*---------------------------------------------------------*/
-/*--- ISEL: Vector (NEON) expressions (128 bit) ---*/
+/*--- ISEL: Vector expressions (128 bit) ---*/
/*---------------------------------------------------------*/
static HReg iselV128Expr ( ISelEnv* env, IRExpr* e )
if (e->tag == Iex_Unop) {
- /* Iop_ZeroHIXXofV128 cases */
+ /* Iop_ZeroHIXXofV128 cases */
UShort imm16 = 0;
switch (e->Iex.Unop.op) {
case Iop_ZeroHI64ofV128: imm16 = 0x00FF; break;
addInstr(env, ARM64Instr_VUnaryV(ARM64vecu_NOT, res, res));
return res;
}
+ case Iop_V256toV128_0:
+ case Iop_V256toV128_1: {
+ HReg vHi, vLo;
+ iselV256Expr(&vHi, &vLo, env, e->Iex.Unop.arg);
+ return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
+ }
//ZZ case Iop_NotV128: {
//ZZ DECLARE_PATTERN(p_veqz_8x16);
}
+/*---------------------------------------------------------*/
+/*--- ISEL: Vector expressions (256 bit) ---*/
+/*---------------------------------------------------------*/
+
+static void iselV256Expr ( /*OUT*/HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+ iselV256Expr_wrk( rHi, rLo, env, e );
+ vassert(hregClass(*rHi) == HRcVec128);
+ vassert(hregClass(*rLo) == HRcVec128);
+ vassert(hregIsVirtual(*rHi));
+ vassert(hregIsVirtual(*rLo));
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static void iselV256Expr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+ vassert(e);
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_V256);
+
+ /* read 256-bit IRTemp */
+ if (e->tag == Iex_RdTmp) {
+ lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
+ return;
+ }
+
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+
+ case Iop_QandSQsh64x2:
+ case Iop_QandSQsh32x4:
+ case Iop_QandSQsh16x8:
+ case Iop_QandSQsh8x16:
+ case Iop_QandUQsh64x2:
+ case Iop_QandUQsh32x4:
+ case Iop_QandUQsh16x8:
+ case Iop_QandUQsh8x16:
+ case Iop_QandSQRsh64x2:
+ case Iop_QandSQRsh32x4:
+ case Iop_QandSQRsh16x8:
+ case Iop_QandSQRsh8x16:
+ case Iop_QandUQRsh64x2:
+ case Iop_QandUQRsh32x4:
+ case Iop_QandUQRsh16x8:
+ case Iop_QandUQRsh8x16:
+ {
+ HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselV128Expr(env, e->Iex.Binop.arg2);
+ HReg fpsr = newVRegI(env);
+ HReg resHi = newVRegV(env);
+ HReg resLo = newVRegV(env);
+ ARM64VecBinOp op = ARM64vecb_INVALID;
+ switch (e->Iex.Binop.op) {
+ case Iop_QandSQsh64x2: op = ARM64vecb_SQSHL64x2; break;
+ case Iop_QandSQsh32x4: op = ARM64vecb_SQSHL32x4; break;
+ case Iop_QandSQsh16x8: op = ARM64vecb_SQSHL16x8; break;
+ case Iop_QandSQsh8x16: op = ARM64vecb_SQSHL8x16; break;
+ case Iop_QandUQsh64x2: op = ARM64vecb_UQSHL64x2; break;
+ case Iop_QandUQsh32x4: op = ARM64vecb_UQSHL32x4; break;
+ case Iop_QandUQsh16x8: op = ARM64vecb_UQSHL16x8; break;
+ case Iop_QandUQsh8x16: op = ARM64vecb_UQSHL8x16; break;
+ case Iop_QandSQRsh64x2: op = ARM64vecb_SQRSHL64x2; break;
+ case Iop_QandSQRsh32x4: op = ARM64vecb_SQRSHL32x4; break;
+ case Iop_QandSQRsh16x8: op = ARM64vecb_SQRSHL16x8; break;
+ case Iop_QandSQRsh8x16: op = ARM64vecb_SQRSHL8x16; break;
+ case Iop_QandUQRsh64x2: op = ARM64vecb_UQRSHL64x2; break;
+ case Iop_QandUQRsh32x4: op = ARM64vecb_UQRSHL32x4; break;
+ case Iop_QandUQRsh16x8: op = ARM64vecb_UQRSHL16x8; break;
+ case Iop_QandUQRsh8x16: op = ARM64vecb_UQRSHL8x16; break;
+ default: vassert(0);
+ }
+ /* Clear FPSR.Q, do the operation, and return both its result
+ and the new value of FPSR.Q. We can simply zero the whole
+ thing out since FPSR is essentially a scratch status register
+ on the host. */
+ addInstr(env, ARM64Instr_Imm64(fpsr, 0));
+ addInstr(env, ARM64Instr_FPSR(True/*toFPSR*/, fpsr));
+ addInstr(env, ARM64Instr_VBinV(op, resLo, argL, argR));
+ addInstr(env, ARM64Instr_FPSR(False/*!toFPSR*/, fpsr));
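+ /* FPSR.QC (cumulative saturation) is bit 27; shift it down to bit 0
+ and mask off everything else. */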
+ addInstr(env, ARM64Instr_Shift(fpsr, fpsr, ARM64RI6_I6(27),
+ ARM64sh_SHR));
+ ARM64RIL* ril_one = mb_mkARM64RIL_I(1);
+ vassert(ril_one);
+ addInstr(env, ARM64Instr_Logic(fpsr, fpsr, ril_one, ARM64lo_AND));
+ /* Now we have: the main (shift) result in |resLo|, and the
+ Q bit at the bottom of |fpsr|. */
+ addInstr(env, ARM64Instr_VQfromX(resHi, fpsr));
+ *rHi = resHi;
+ *rLo = resLo;
+ return;
+ }
+
+ /* ... */
+ default:
+ break;
+ } /* switch on the binop */
+ } /* if (e->tag == Iex_Binop) */
+
+ ppIRExpr(e);
+ vpanic("iselV256Expr_wrk");
+}
+
+
/*---------------------------------------------------------*/
/*--- ISEL: Statements ---*/
/*---------------------------------------------------------*/
addInstr(env, ARM64Instr_VMov(16, dst, src));
return;
}
+ if (ty == Ity_V256) {
+ HReg srcHi, srcLo, dstHi, dstLo;
+ iselV256Expr(&srcHi,&srcLo, env, stmt->Ist.WrTmp.data);
+ lookupIRTempPair( &dstHi, &dstLo, env, tmp);
+ addInstr(env, ARM64Instr_VMov(16, dstHi, srcHi));
+ addInstr(env, ARM64Instr_VMov(16, dstLo, srcLo));
+ return;
+ }
break;
}
case Ity_V128:
hreg = mkHReg(j++, HRcVec128, True);
break;
+ case Ity_V256:
+ hreg = mkHReg(j++, HRcVec128, True);
+ hregHI = mkHReg(j++, HRcVec128, True);
+ break;
default:
ppIRType(bb->tyenv->types[i]);
vpanic("iselBB(arm64): IRTemp type");
case Iop_Rol32x4: vex_printf("Rol32x4"); return;
case Iop_Rol64x2: vex_printf("Rol64x2"); return;
+ case Iop_QandUQsh8x16: vex_printf("QandUQsh8x16"); return;
+ case Iop_QandUQsh16x8: vex_printf("QandUQsh16x8"); return;
+ case Iop_QandUQsh32x4: vex_printf("QandUQsh32x4"); return;
+ case Iop_QandUQsh64x2: vex_printf("QandUQsh64x2"); return;
+ case Iop_QandSQsh8x16: vex_printf("QandSQsh8x16"); return;
+ case Iop_QandSQsh16x8: vex_printf("QandSQsh16x8"); return;
+ case Iop_QandSQsh32x4: vex_printf("QandSQsh32x4"); return;
+ case Iop_QandSQsh64x2: vex_printf("QandSQsh64x2"); return;
+ case Iop_QandUQRsh8x16: vex_printf("QandUQRsh8x16"); return;
+ case Iop_QandUQRsh16x8: vex_printf("QandUQRsh16x8"); return;
+ case Iop_QandUQRsh32x4: vex_printf("QandUQRsh32x4"); return;
+ case Iop_QandUQRsh64x2: vex_printf("QandUQRsh64x2"); return;
+ case Iop_QandSQRsh8x16: vex_printf("QandSQRsh8x16"); return;
+ case Iop_QandSQRsh16x8: vex_printf("QandSQRsh16x8"); return;
+ case Iop_QandSQRsh32x4: vex_printf("QandSQRsh32x4"); return;
+ case Iop_QandSQRsh64x2: vex_printf("QandSQRsh64x2"); return;
+
case Iop_NarrowBin16to8x16: vex_printf("NarrowBin16to8x16"); return;
case Iop_NarrowBin32to16x8: vex_printf("NarrowBin32to16x8"); return;
case Iop_QNarrowBin16Uto8Ux16: vex_printf("QNarrowBin16Uto8Ux16"); return;
case Iop_DivD128: vex_printf("DivD128"); return;
case Iop_ShlD128: vex_printf("ShlD128"); return;
case Iop_ShrD128: vex_printf("ShrD128"); return;
- case Iop_RoundD64toInt: vex_printf("Iop_RoundD64toInt"); return;
- case Iop_RoundD128toInt: vex_printf("Iop_RoundD128toInt"); return;
- case Iop_QuantizeD64: vex_printf("Iop_QuantizeD64"); return;
- case Iop_QuantizeD128: vex_printf("Iop_QuantizeD128"); return;
- case Iop_ExtractExpD64: vex_printf("Iop_ExtractExpD64"); return;
- case Iop_ExtractExpD128: vex_printf("Iop_ExtractExpD128"); return;
- case Iop_ExtractSigD64: vex_printf("Iop_ExtractSigD64"); return;
- case Iop_ExtractSigD128: vex_printf("Iop_ExtractSigD128"); return;
- case Iop_InsertExpD64: vex_printf("Iop_InsertExpD64"); return;
- case Iop_InsertExpD128: vex_printf("Iop_InsertExpD128"); return;
+ case Iop_RoundD64toInt: vex_printf("RoundD64toInt"); return;
+ case Iop_RoundD128toInt: vex_printf("RoundD128toInt"); return;
+ case Iop_QuantizeD64: vex_printf("QuantizeD64"); return;
+ case Iop_QuantizeD128: vex_printf("QuantizeD128"); return;
+ case Iop_ExtractExpD64: vex_printf("ExtractExpD64"); return;
+ case Iop_ExtractExpD128: vex_printf("ExtractExpD128"); return;
+ case Iop_ExtractSigD64: vex_printf("ExtractSigD64"); return;
+ case Iop_ExtractSigD128: vex_printf("ExtractSigD128"); return;
+ case Iop_InsertExpD64: vex_printf("InsertExpD64"); return;
+ case Iop_InsertExpD128: vex_printf("InsertExpD128"); return;
case Iop_CmpD64: vex_printf("CmpD64"); return;
case Iop_CmpD128: vex_printf("CmpD128"); return;
case Iop_CmpExpD64: vex_printf("CmpExpD64"); return;
case Iop_D64HLtoD128: vex_printf("D64HLtoD128"); return;
case Iop_D128HItoD64: vex_printf("D128HItoD64"); return;
case Iop_D128LOtoD64: vex_printf("D128LOtoD64"); return;
- case Iop_SignificanceRoundD64: vex_printf("Iop_SignificanceRoundD64");
+ case Iop_SignificanceRoundD64: vex_printf("SignificanceRoundD64");
return;
- case Iop_SignificanceRoundD128: vex_printf("Iop_SignificanceRoundD128");
+ case Iop_SignificanceRoundD128: vex_printf("SignificanceRoundD128");
return;
case Iop_ReinterpI64asD64: vex_printf("ReinterpI64asD64"); return;
case Iop_ReinterpD64asI64: vex_printf("ReinterpD64asI64"); return;
case Iop_V256toV128_1: case Iop_V256toV128_0:
UNARY(Ity_V256, Ity_V128);
+ case Iop_QandUQsh8x16: case Iop_QandUQsh16x8:
+ case Iop_QandUQsh32x4: case Iop_QandUQsh64x2:
+ case Iop_QandSQsh8x16: case Iop_QandSQsh16x8:
+ case Iop_QandSQsh32x4: case Iop_QandSQsh64x2:
+ case Iop_QandUQRsh8x16: case Iop_QandUQRsh16x8:
+ case Iop_QandUQRsh32x4: case Iop_QandUQRsh64x2:
+ case Iop_QandSQRsh8x16: case Iop_QandSQRsh16x8:
+ case Iop_QandSQRsh32x4: case Iop_QandSQRsh64x2:
case Iop_V128HLtoV256:
BINARY(Ity_V128,Ity_V128, Ity_V256);
Iop_QShlN8x16, Iop_QShlN16x8, Iop_QShlN32x4, Iop_QShlN64x2,
Iop_QSalN8x16, Iop_QSalN16x8, Iop_QSalN32x4, Iop_QSalN64x2,
+ /* VECTOR x VECTOR BIDIRECTIONAL SATURATING (& MAYBE ROUNDING) SHIFT */
+ /* The least significant 8 bits of each lane of the second
+ operand are used as the shift amount, and interpreted signedly.
+ Positive values mean a shift left, negative a shift right. The
+ result is signedly or unsignedly saturated. There are also
+ rounding variants, which add 2^(shift_amount-1) to the value before
+ shifting, but only in the shift-right case. Vacated positions
+ are filled with zeroes. IOW, it's either SHR or SHL, but not SAR.
+
+ These operations return 129 bits: one bit ("Q") indicating whether
+ saturation occurred, and the shift result. The result type is V256,
+ of which the lower V128 is the shift result, and Q occupies the
+ least significant bit of the upper V128. All other bits of the
+ upper V128 are zero. */
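+ /* For example (purely illustrative): with Iop_QandUQsh8x16, a lane
+ holding 0x80 and a shift amount of +1 saturates to 0xFF and causes
+ Q to be set, whereas a shift amount of -1 yields 0x40 and does not
+ set Q. */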
+ // Unsigned saturation, no rounding
+ Iop_QandUQsh8x16, Iop_QandUQsh16x8,
+ Iop_QandUQsh32x4, Iop_QandUQsh64x2,
+ // Signed saturation, no rounding
+ Iop_QandSQsh8x16, Iop_QandSQsh16x8,
+ Iop_QandSQsh32x4, Iop_QandSQsh64x2,
+
+ // Unsigned saturation, rounding
+ Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
+ Iop_QandUQRsh32x4, Iop_QandUQRsh64x2,
+ // Signed saturation, rounding
+ Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
+ Iop_QandSQRsh32x4, Iop_QandSQRsh64x2,
+
/* NARROWING (binary)
-- narrow 2xV128 into 1xV128, hi half from left arg */
/* See comments above w.r.t. U vs S issues in saturated narrowing. */