static
IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
{
- const IROp opSUB[3] = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4 };
- const IROp opGTU[3] = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4 };
- const IROp opGTS[3] = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4 };
- vassert(size <= 2);
+ const IROp opSUB[4]
+ = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
+ const IROp opGTU[4]
+ = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
+ const IROp opGTS[4]
+ = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
+ vassert(size <= 3);
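+ /* |size| may now be 3 (64-bit lanes): the SABAL/UABAL and
+    SABDL/UABDL decoders call this helper on lanes that have
+    already been widened, hence the new 64x2 entries above. */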
IRTemp argL = newTemp(Ity_V128);
IRTemp argR = newTemp(Ity_V128);
IRTemp msk = newTemp(Ity_V128);
}
+/* Generate IR that takes a V128 and sign- or zero-widens
+ either the lower or the upper set of lanes to twice their
+ original width, resulting in a new V128 value. */
+static
+IRTemp math_WIDEN_LANES ( Bool zWiden, Bool fromUpperHalf,
+ UInt sizeNarrow, IRExpr* srcE )
+{
+ IRTemp src = newTemp(Ity_V128);
+ IRTemp res = newTemp(Ity_V128);
+ assign(src, srcE);
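+ /* Widen by interleaving the vector with itself, which duplicates
+    each selected narrow lane into a slot of twice the width, and
+    then shifting right by the narrow lane size: a logical shift
+    zero-extends, an arithmetic shift sign-extends. */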
+ switch (sizeNarrow) {
+ case X10:
+ assign(res,
+ binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
+ binop(fromUpperHalf ? Iop_InterleaveHI32x4
+ : Iop_InterleaveLO32x4,
+ mkexpr(src),
+ mkexpr(src)),
+ mkU8(32)));
+ break;
+ case X01:
+ assign(res,
+ binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
+ binop(fromUpperHalf ? Iop_InterleaveHI16x8
+ : Iop_InterleaveLO16x8,
+ mkexpr(src),
+ mkexpr(src)),
+ mkU8(16)));
+ break;
+ case X00:
+ assign(res,
+ binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
+ binop(fromUpperHalf ? Iop_InterleaveHI8x16
+ : Iop_InterleaveLO8x16,
+ mkexpr(src),
+ mkexpr(src)),
+ mkU8(8)));
+ break;
+ default:
+ vassert(0);
+ }
+ return res;
+}
+
+
/* Let |new64| be a V128 in which only the lower 64 bits are interesting,
and the upper can contain any value -- it is ignored. If |is2| is False,
generate IR to put |new64| in the lower half of vector reg |dd| and zero
vassert(size < 4);
Bool is2 = bitQ == 1;
+ if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
+ /* -------- 0,0000 SADDL{2} -------- */
+ /* -------- 1,0000 UADDL{2} -------- */
+ /* -------- 0,0010 SSUBL{2} -------- */
+ /* -------- 1,0010 USUBL{2} -------- */
+ /* Widens, and size refers to the narrowed lanes. */
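+ /* E.g. for size == X10: SADDL Vd.2d, Vn.2s, Vm.2s widens the
+    lower 32-bit lanes, SADDL2 Vd.2d, Vn.4s, Vm.4s the upper. */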
+ const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
+ const IROp opSUB[3] = { Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
+ if (size == X11) return False;
+ vassert(size <= 2);
+ Bool isU = bitU == 1;
+ Bool isADD = opcode == BITS4(0,0,0,0);
+ IRTemp argL = math_WIDEN_LANES(isU, is2, size, getQReg128(nn));
+ IRTemp argR = math_WIDEN_LANES(isU, is2, size, getQReg128(mm));
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(isADD ? opADD[size] : opSUB[size],
+ mkexpr(argL), mkexpr(argR)));
+ putQReg128(dd, mkexpr(res));
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(1, size+1);
+ const HChar* nm = isADD ? (isU ? "uaddl" : "saddl")
+ : (isU ? "usubl" : "ssubl");
+ DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
+ nameQReg128(dd), arrWide,
+ nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
+ return True;
+ }
+
if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
/* -------- 0,0100 ADDHN{2} -------- */
/* -------- 1,0100 RADDHN{2} -------- */
return True;
}
+ if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
+ /* -------- 0,0101 SABAL{2} -------- */
+ /* -------- 1,0101 UABAL{2} -------- */
+ /* -------- 0,0111 SABDL{2} -------- */
+ /* -------- 1,0111 UABDL{2} -------- */
+ /* Widens, and size refers to the narrowed lanes. */
+ const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
+ if (size == X11) return False;
+ vassert(size <= 2);
+ Bool isU = bitU == 1;
+ Bool isACC = opcode == BITS4(0,1,0,1);
+ IRTemp argL = math_WIDEN_LANES(isU, is2, size, getQReg128(nn));
+ IRTemp argR = math_WIDEN_LANES(isU, is2, size, getQReg128(mm));
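+ /* The absolute difference is computed on the already-widened
+    lanes, hence size+1 below; this is why math_ABD now also
+    accepts 64-bit lanes. */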
+ IRTemp abd = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, isACC ? binop(opADD[size], mkexpr(abd), getQReg128(dd))
+ : mkexpr(abd));
+ putQReg128(dd, mkexpr(res));
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(1, size+1);
+ const HChar* nm = isACC ? (isU ? "uabal" : "sabal")
+ : (isU ? "uabdl" : "sabdl");
+ DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
+ nameQReg128(dd), arrWide,
+ nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
+ return True;
+ }
+
+ if (opcode == BITS4(1,1,0,0)
+ || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
+ /* -------- 0,1100 SMULL{2} -------- */ // 0 (ix)
+ /* -------- 1,1100 UMULL{2} -------- */ // 0
+ /* -------- 0,1000 SMLAL{2} -------- */ // 1
+ /* -------- 1,1000 UMLAL{2} -------- */ // 1
+ /* -------- 0,1010 SMLSL{2} -------- */ // 2
+ /* -------- 1,1010 UMLSL{2} -------- */ // 2
+ /* Widens, and size refers to the narrowed lanes. */
+ UInt ix = 3;
+ switch (opcode) {
+ case BITS4(1,1,0,0): ix = 0; break;
+ case BITS4(1,0,0,0): ix = 1; break;
+ case BITS4(1,0,1,0): ix = 2; break;
+ default: vassert(0);
+ }
+ vassert(ix <= 2);
+ const IROp opMULLU[3] = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2 };
+ const IROp opMULLS[3] = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2 };
+ const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
+ const IROp opSUB[3] = { Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
+ if (size == X11) return False;
+ vassert(size <= 2);
+ Bool isU = bitU == 1;
+ IROp mulOp = isU ? opMULLU[size] : opMULLS[size];
+ IROp accOp = (ix == 1) ? opADD[size]
+ : (ix == 2 ? opSUB[size] : Iop_INVALID);
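+ /* The Iop_Mull* ops take 64-bit inputs and produce a V128
+    result, so the lane widening is done by the multiply itself;
+    math_BINARY_WIDENING_V128 picks the lower or upper source
+    halves according to |is2|. */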
+ IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
+ getQReg128(nn), getQReg128(mm));
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, ix == 0 ? mkexpr(mul)
+ : binop(accOp, getQReg128(dd), mkexpr(mul)));
+ putQReg128(dd, mkexpr(res));
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(1, size+1);
+ const HChar* nm = ix == 0 ? "mull" : (ix == 1 ? "mlal" : "mlsl");
+ DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
+ nameQReg128(dd), arrWide,
+ nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
+ return True;
+ }
+
if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
/* -------- 0,1110 PMULL{2} -------- */
- /* Narrows, and size refers to the narrowed lanes. */
+ /* Widens, and size refers to the narrowed lanes. */
if (size != X00) return False;
IRTemp res
= math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
static void showARM64VecBinOp(/*OUT*/const HChar** nm,
/*OUT*/const HChar** ar, ARM64VecBinOp op ) {
switch (op) {
- case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return;
- case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return;
- case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return;
- case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return;
- case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return;
- case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return;
- case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return;
- case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return;
- case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return;
- case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return;
- case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return;
- case ARM64vecb_FADD64x2: *nm = "fadd"; *ar = "2d"; return;
- case ARM64vecb_FSUB64x2: *nm = "fsub"; *ar = "2d"; return;
- case ARM64vecb_FMUL64x2: *nm = "fmul"; *ar = "2d"; return;
- case ARM64vecb_FDIV64x2: *nm = "fdiv"; *ar = "2d"; return;
- case ARM64vecb_FADD32x4: *nm = "fadd"; *ar = "4s"; return;
- case ARM64vecb_FSUB32x4: *nm = "fsub"; *ar = "4s"; return;
- case ARM64vecb_FMUL32x4: *nm = "fmul"; *ar = "4s"; return;
- case ARM64vecb_FDIV32x4: *nm = "fdiv"; *ar = "4s"; return;
- case ARM64vecb_UMAX32x4: *nm = "umax"; *ar = "4s"; return;
- case ARM64vecb_UMAX16x8: *nm = "umax"; *ar = "8h"; return;
- case ARM64vecb_UMAX8x16: *nm = "umax"; *ar = "16b"; return;
- case ARM64vecb_UMIN32x4: *nm = "umin"; *ar = "4s"; return;
- case ARM64vecb_UMIN16x8: *nm = "umin"; *ar = "8h"; return;
- case ARM64vecb_UMIN8x16: *nm = "umin"; *ar = "16b"; return;
- case ARM64vecb_SMAX32x4: *nm = "smax"; *ar = "4s"; return;
- case ARM64vecb_SMAX16x8: *nm = "smax"; *ar = "8h"; return;
- case ARM64vecb_SMAX8x16: *nm = "smax"; *ar = "16b"; return;
- case ARM64vecb_SMIN32x4: *nm = "smin"; *ar = "4s"; return;
- case ARM64vecb_SMIN16x8: *nm = "smin"; *ar = "8h"; return;
- case ARM64vecb_SMIN8x16: *nm = "smin"; *ar = "16b"; return;
- case ARM64vecb_AND: *nm = "and "; *ar = "all"; return;
- case ARM64vecb_ORR: *nm = "orr "; *ar = "all"; return;
- case ARM64vecb_XOR: *nm = "eor "; *ar = "all"; return;
- case ARM64vecb_CMEQ64x2: *nm = "cmeq"; *ar = "2d"; return;
- case ARM64vecb_CMEQ32x4: *nm = "cmeq"; *ar = "4s"; return;
- case ARM64vecb_CMEQ16x8: *nm = "cmeq"; *ar = "8h"; return;
- case ARM64vecb_CMEQ8x16: *nm = "cmeq"; *ar = "16b"; return;
- case ARM64vecb_CMHI64x2: *nm = "cmhi"; *ar = "2d"; return;
- case ARM64vecb_CMHI32x4: *nm = "cmhi"; *ar = "4s"; return;
- case ARM64vecb_CMHI16x8: *nm = "cmhi"; *ar = "8h"; return;
- case ARM64vecb_CMHI8x16: *nm = "cmhi"; *ar = "16b"; return;
- case ARM64vecb_CMGT64x2: *nm = "cmgt"; *ar = "2d"; return;
- case ARM64vecb_CMGT32x4: *nm = "cmgt"; *ar = "4s"; return;
- case ARM64vecb_CMGT16x8: *nm = "cmgt"; *ar = "8h"; return;
- case ARM64vecb_CMGT8x16: *nm = "cmgt"; *ar = "16b"; return;
- case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return;
- case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return;
- case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return;
- case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return;
- case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return;
- case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return;
- case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return;
- case ARM64vecb_UZP164x2: *nm = "uzp1"; *ar = "2d"; return;
- case ARM64vecb_UZP132x4: *nm = "uzp1"; *ar = "4s"; return;
- case ARM64vecb_UZP116x8: *nm = "uzp1"; *ar = "8h"; return;
- case ARM64vecb_UZP18x16: *nm = "uzp1"; *ar = "16b"; return;
- case ARM64vecb_UZP264x2: *nm = "uzp2"; *ar = "2d"; return;
- case ARM64vecb_UZP232x4: *nm = "uzp2"; *ar = "4s"; return;
- case ARM64vecb_UZP216x8: *nm = "uzp2"; *ar = "8h"; return;
- case ARM64vecb_UZP28x16: *nm = "uzp2"; *ar = "16b"; return;
- case ARM64vecb_ZIP132x4: *nm = "zip1"; *ar = "4s"; return;
- case ARM64vecb_ZIP116x8: *nm = "zip1"; *ar = "8h"; return;
- case ARM64vecb_ZIP18x16: *nm = "zip1"; *ar = "16b"; return;
- case ARM64vecb_ZIP232x4: *nm = "zip2"; *ar = "4s"; return;
- case ARM64vecb_ZIP216x8: *nm = "zip2"; *ar = "8h"; return;
- case ARM64vecb_ZIP28x16: *nm = "zip2"; *ar = "16b"; return;
- case ARM64vecb_PMUL8x16: *nm = "pmul"; *ar = "16b"; return;
- case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hb"; return;
+ case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return;
+ case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return;
+ case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return;
+ case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return;
+ case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return;
+ case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return;
+ case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return;
+ case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return;
+ case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return;
+ case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return;
+ case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return;
+ case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return;
+ case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return;
+ case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return;
+ case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return;
+ case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return;
+ case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return;
+ case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return;
+ case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return;
+ case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return;
+ case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return;
+ case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return;
+ case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return;
+ case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return;
+ case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return;
+ case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return;
+ case ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return;
+ case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return;
+ case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return;
+ case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return;
+ case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return;
+ case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return;
+ case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return;
+ case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return;
+ case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return;
+ case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return;
+ case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return;
+ case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return;
+ case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return;
+ case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return;
+ case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return;
+ case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return;
+ case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return;
+ case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return;
+ case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return;
+ case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = "16b"; return;
+ case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return;
+ case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return;
+ case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return;
+ case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return;
+ case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return;
+ case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return;
+ case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return;
+ case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return;
+ case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return;
+ case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return;
+ case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return;
+ case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return;
+ case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return;
+ case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return;
+ case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return;
+ case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return;
+ case ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return;
+ case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return;
+ case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return;
+ case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return;
+ case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return;
+ case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return;
+ case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hbb"; return;
+ case ARM64vecb_UMULL2DSS: *nm = "umull"; *ar = "2dss"; return;
+ case ARM64vecb_UMULL4SHH: *nm = "umull"; *ar = "4shh"; return;
+ case ARM64vecb_UMULL8HBB: *nm = "umull"; *ar = "8hbb"; return;
+ case ARM64vecb_SMULL2DSS: *nm = "smull"; *ar = "2dss"; return;
+ case ARM64vecb_SMULL4SHH: *nm = "smull"; *ar = "4shh"; return;
+ case ARM64vecb_SMULL8HBB: *nm = "smull"; *ar = "8hbb"; return;
default: vpanic("showARM64VecBinOp");
}
}
011 01110 00 1 m 100111 n d PMUL Vd.16b, Vn.16b, Vm.16b
000 01110 00 1 m 111000 n d PMULL Vd.8h, Vn.8b, Vm.8b
+
+ 001 01110 10 1 m 110000 n d UMULL Vd.2d, Vn.2s, Vm.2s
+ 001 01110 01 1 m 110000 n d UMULL Vd.4s, Vn.4h, Vm.4h
+ 001 01110 00 1 m 110000 n d UMULL Vd.8h, Vn.8b, Vm.8b
+
+ 000 01110 10 1 m 110000 n d SMULL Vd.2d, Vn.2s, Vm.2s
+ 000 01110 01 1 m 110000 n d SMULL Vd.4s, Vn.4h, Vm.4h
+ 000 01110 00 1 m 110000 n d SMULL Vd.8h, Vn.8b, Vm.8b
*/
UInt vD = qregNo(i->ARM64in.VBinV.dst);
UInt vN = qregNo(i->ARM64in.VBinV.argL);
*p++ = X_3_8_5_6_5_5(X000, X01110001, vM, X111000, vN, vD);
break;
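+ /* The UMULL/SMULL cases below differ only in the U bit (the
+    first field) and in the size bits sz = 10, 01 or 00, as
+    listed in the encoding comment above. */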
+ case ARM64vecb_UMULL2DSS:
+ *p++ = X_3_8_5_6_5_5(X001, X01110101, vM, X110000, vN, vD);
+ break;
+ case ARM64vecb_UMULL4SHH:
+ *p++ = X_3_8_5_6_5_5(X001, X01110011, vM, X110000, vN, vD);
+ break;
+ case ARM64vecb_UMULL8HBB:
+ *p++ = X_3_8_5_6_5_5(X001, X01110001, vM, X110000, vN, vD);
+ break;
+
+ case ARM64vecb_SMULL2DSS:
+ *p++ = X_3_8_5_6_5_5(X000, X01110101, vM, X110000, vN, vD);
+ break;
+ case ARM64vecb_SMULL4SHH:
+ *p++ = X_3_8_5_6_5_5(X000, X01110011, vM, X110000, vN, vD);
+ break;
+ case ARM64vecb_SMULL8HBB:
+ *p++ = X_3_8_5_6_5_5(X000, X01110001, vM, X110000, vN, vD);
+ break;
+
default:
goto bad;
}
ARM64vecb_ZIP216x8, ARM64vecb_ZIP28x16,
ARM64vecb_PMUL8x16,
ARM64vecb_PMULL8x8,
+ ARM64vecb_UMULL2DSS,
+ ARM64vecb_UMULL4SHH, ARM64vecb_UMULL8HBB,
+ ARM64vecb_SMULL2DSS,
+ ARM64vecb_SMULL4SHH, ARM64vecb_SMULL8HBB,
ARM64vecb_INVALID
}
ARM64VecBinOp;
break;
}
- case Iop_PolynomialMull8x8: {
+ case Iop_PolynomialMull8x8:
+ case Iop_Mull32Ux2:
+ case Iop_Mull16Ux4:
+ case Iop_Mull8Ux8:
+ case Iop_Mull32Sx2:
+ case Iop_Mull16Sx4:
+ case Iop_Mull8Sx8:
+ {
HReg iSrcL = iselIntExpr_R(env, e->Iex.Binop.arg1);
HReg iSrcR = iselIntExpr_R(env, e->Iex.Binop.arg2);
HReg vSrcL = newVRegV(env);
HReg vSrcR = newVRegV(env);
HReg dst = newVRegV(env);
+ ARM64VecBinOp op = ARM64vecb_INVALID;
+ switch (e->Iex.Binop.op) {
+ case Iop_PolynomialMull8x8: op = ARM64vecb_PMULL8x8; break;
+ case Iop_Mull32Ux2: op = ARM64vecb_UMULL2DSS; break;
+ case Iop_Mull16Ux4: op = ARM64vecb_UMULL4SHH; break;
+ case Iop_Mull8Ux8: op = ARM64vecb_UMULL8HBB; break;
+ case Iop_Mull32Sx2: op = ARM64vecb_SMULL2DSS; break;
+ case Iop_Mull16Sx4: op = ARM64vecb_SMULL4SHH; break;
+ case Iop_Mull8Sx8: op = ARM64vecb_SMULL8HBB; break;
+ default: vassert(0);
+ }
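+ /* Each 64-bit integer source is copied into both halves of a
+    Q register; the MULL ops read only the lower 64 bits, so the
+    duplicated upper half is ignored. */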
addInstr(env, ARM64Instr_VQfromXX(vSrcL, iSrcL, iSrcL));
addInstr(env, ARM64Instr_VQfromXX(vSrcR, iSrcR, iSrcR));
- addInstr(env, ARM64Instr_VBinV(ARM64vecb_PMULL8x8,
- dst, vSrcL, vSrcR));
+ addInstr(env, ARM64Instr_VBinV(op, dst, vSrcL, vSrcR));
return dst;
}