From 26a81837a9adf8281a3cb26aa85e032544ea26e0 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Sat, 28 Jun 2014 12:21:37 +0000 Subject: [PATCH] arm64: implement: sabal uabal sabdl uabdl saddl uaddl ssubl usubl smlal umlal smlsl umlsl smull umull git-svn-id: svn://svn.valgrind.org/vex/trunk@2893 --- VEX/priv/guest_arm64_toIR.c | 156 ++++++++++++++++++++++++++++++-- VEX/priv/host_arm64_defs.c | 172 +++++++++++++++++++++--------------- VEX/priv/host_arm64_defs.h | 4 + VEX/priv/host_arm64_isel.c | 23 ++++- 4 files changed, 278 insertions(+), 77 deletions(-) diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 966569ddec..36934cad20 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -5640,10 +5640,13 @@ IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128, static IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE ) { - const IROp opSUB[3] = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4 }; - const IROp opGTU[3] = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4 }; - const IROp opGTS[3] = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4 }; - vassert(size <= 2); + const IROp opSUB[4] + = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + const IROp opGTU[4] + = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 }; + const IROp opGTS[4] + = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 }; + vassert(size <= 3); IRTemp argL = newTemp(Ity_V128); IRTemp argR = newTemp(Ity_V128); IRTemp msk = newTemp(Ity_V128); @@ -5664,6 +5667,51 @@ IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE ) } +/* Generate IR that takes a V128 and sign- or zero-widens + either the lower or upper set of lanes to twice-as-wide, + resulting in a new V128 value. */ +static +IRTemp math_WIDEN_LANES ( Bool zWiden, Bool fromUpperHalf, + UInt sizeNarrow, IRExpr* srcE ) +{ + IRTemp src = newTemp(Ity_V128); + IRTemp res = newTemp(Ity_V128); + assign(src, srcE); + switch (sizeNarrow) { + case X10: + assign(res, + binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2, + binop(fromUpperHalf ? Iop_InterleaveHI32x4 + : Iop_InterleaveLO32x4, + mkexpr(src), + mkexpr(src)), + mkU8(32))); + break; + case X01: + assign(res, + binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4, + binop(fromUpperHalf ? Iop_InterleaveHI16x8 + : Iop_InterleaveLO16x8, + mkexpr(src), + mkexpr(src)), + mkU8(16))); + break; + case X00: + assign(res, + binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8, + binop(fromUpperHalf ? Iop_InterleaveHI8x16 + : Iop_InterleaveLO8x16, + mkexpr(src), + mkexpr(src)), + mkU8(8))); + break; + default: + vassert(0); + } + return res; +} + + /* Let |new64| be a V128 in which only the lower 64 bits are interesting, and the upper can contain any value -- it is ignored. If |is2| is False, generate IR to put |new64| in the lower half of vector reg |dd| and zero @@ -6937,6 +6985,34 @@ Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn) vassert(size < 4); Bool is2 = bitQ == 1; + if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) { + /* -------- 0,0000 SADDL{2} -------- */ + /* -------- 1,0000 UADDL{2} -------- */ + /* -------- 0,0010 SSUBL{2} -------- */ + /* -------- 1,0010 USUBL{2} -------- */ + /* Widens, and size refers to the narrowed lanes. 
*/ + const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + const IROp opSUB[3] = { Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + if (size == X11) return False; + vassert(size <= 2); + Bool isU = bitU == 1; + Bool isADD = opcode == BITS4(0,0,0,0); + IRTemp argL = math_WIDEN_LANES(isU, is2, size, getQReg128(nn)); + IRTemp argR = math_WIDEN_LANES(isU, is2, size, getQReg128(mm)); + IRTemp res = newTemp(Ity_V128); + assign(res, binop(isADD ? opADD[size] : opSUB[size], + mkexpr(argL), mkexpr(argR))); + putQReg128(dd, mkexpr(res)); + const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); + const HChar* arrWide = nameArr_Q_SZ(1, size+1); + const HChar* nm = isADD ? (isU ? "uaddl" : "saddl") + : (isU ? "usubl" : "ssubl"); + DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "", + nameQReg128(dd), arrWide, + nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow); + return True; + } + if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) { /* -------- 0,0100 ADDHN{2} -------- */ /* -------- 1,0100 RADDHN{2} -------- */ @@ -6990,9 +7066,79 @@ Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) { + /* -------- 0,0101 SABAL{2} -------- */ + /* -------- 1,0101 UABAL{2} -------- */ + /* -------- 0,0111 SABDL{2} -------- */ + /* -------- 1,0111 UABDL{2} -------- */ + /* Widens, and size refers to the narrowed lanes. */ + const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + if (size == X11) return False; + vassert(size <= 2); + Bool isU = bitU == 1; + Bool isACC = opcode == BITS4(0,1,0,1); + IRTemp argL = math_WIDEN_LANES(isU, is2, size, getQReg128(nn)); + IRTemp argR = math_WIDEN_LANES(isU, is2, size, getQReg128(mm)); + IRTemp abd = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR)); + IRTemp res = newTemp(Ity_V128); + assign(res, isACC ? binop(opADD[size], mkexpr(abd), getQReg128(dd)) + : mkexpr(abd)); + putQReg128(dd, mkexpr(res)); + const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); + const HChar* arrWide = nameArr_Q_SZ(1, size+1); + const HChar* nm = isACC ? (isU ? "uabal" : "sabal") + : (isU ? "uabdl" : "sabdl"); + DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "", + nameQReg128(dd), arrWide, + nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow); + return True; + } + + if (opcode == BITS4(1,1,0,0) + || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) { + /* -------- 0,1100 SMULL{2} -------- */ // 0 (ix) + /* -------- 1,1100 UMULL{2} -------- */ // 0 + /* -------- 0,1000 SMLAL{2} -------- */ // 1 + /* -------- 1,1000 UMLAL{2} -------- */ // 1 + /* -------- 0,1010 SMLSL{2} -------- */ // 2 + /* -------- 1,1010 UMLSL{2} -------- */ // 2 + /* Widens, and size refers to the narrowed lanes. */ + UInt ix = 3; + switch (opcode) { + case BITS4(1,1,0,0): ix = 0; break; + case BITS4(1,0,0,0): ix = 1; break; + case BITS4(1,0,1,0): ix = 2; break; + default: vassert(0); + } + vassert(ix >= 0 && ix <= 2); + const IROp opMULLU[3] = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2 }; + const IROp opMULLS[3] = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2 }; + const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + const IROp opSUB[3] = { Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + if (size == X11) return False; + vassert(size <= 2); + Bool isU = bitU == 1; + IROp mulOp = isU ? opMULLU[size] : opMULLS[size]; + IROp accOp = (ix == 1) ? opADD[size] + : (ix == 2 ? 
opSUB[size] : Iop_INVALID); + IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp, + getQReg128(nn), getQReg128(mm)); + IRTemp res = newTemp(Ity_V128); + assign(res, ix == 0 ? mkexpr(mul) + : binop(accOp, getQReg128(dd), mkexpr(mul))); + putQReg128(dd, mkexpr(res)); + const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); + const HChar* arrWide = nameArr_Q_SZ(1, size+1); + const HChar* nm = ix == 0 ? "mull" : (ix == 1 ? "mlal" : "mlsl"); + DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "", + nameQReg128(dd), arrWide, + nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow); + return True; + } + if (bitU == 0 && opcode == BITS4(1,1,1,0)) { /* -------- 0,1110 PMULL{2} -------- */ - /* Narrows, and size refers to the narrowed lanes. */ + /* Widens, and size refers to the narrowed lanes. */ if (size != X00) return False; IRTemp res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8, diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index e2fc180e4a..d964f9ba16 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -854,75 +854,81 @@ static const HChar* showARM64FpUnaryOp ( ARM64FpUnaryOp op ) { static void showARM64VecBinOp(/*OUT*/const HChar** nm, /*OUT*/const HChar** ar, ARM64VecBinOp op ) { switch (op) { - case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return; - case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return; - case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return; - case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return; - case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return; - case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return; - case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return; - case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return; - case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return; - case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return; - case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return; - case ARM64vecb_FADD64x2: *nm = "fadd"; *ar = "2d"; return; - case ARM64vecb_FSUB64x2: *nm = "fsub"; *ar = "2d"; return; - case ARM64vecb_FMUL64x2: *nm = "fmul"; *ar = "2d"; return; - case ARM64vecb_FDIV64x2: *nm = "fdiv"; *ar = "2d"; return; - case ARM64vecb_FADD32x4: *nm = "fadd"; *ar = "4s"; return; - case ARM64vecb_FSUB32x4: *nm = "fsub"; *ar = "4s"; return; - case ARM64vecb_FMUL32x4: *nm = "fmul"; *ar = "4s"; return; - case ARM64vecb_FDIV32x4: *nm = "fdiv"; *ar = "4s"; return; - case ARM64vecb_UMAX32x4: *nm = "umax"; *ar = "4s"; return; - case ARM64vecb_UMAX16x8: *nm = "umax"; *ar = "8h"; return; - case ARM64vecb_UMAX8x16: *nm = "umax"; *ar = "16b"; return; - case ARM64vecb_UMIN32x4: *nm = "umin"; *ar = "4s"; return; - case ARM64vecb_UMIN16x8: *nm = "umin"; *ar = "8h"; return; - case ARM64vecb_UMIN8x16: *nm = "umin"; *ar = "16b"; return; - case ARM64vecb_SMAX32x4: *nm = "smax"; *ar = "4s"; return; - case ARM64vecb_SMAX16x8: *nm = "smax"; *ar = "8h"; return; - case ARM64vecb_SMAX8x16: *nm = "smax"; *ar = "16b"; return; - case ARM64vecb_SMIN32x4: *nm = "smin"; *ar = "4s"; return; - case ARM64vecb_SMIN16x8: *nm = "smin"; *ar = "8h"; return; - case ARM64vecb_SMIN8x16: *nm = "smin"; *ar = "16b"; return; - case ARM64vecb_AND: *nm = "and "; *ar = "all"; return; - case ARM64vecb_ORR: *nm = "orr "; *ar = "all"; return; - case ARM64vecb_XOR: *nm = "eor "; *ar = "all"; return; - case ARM64vecb_CMEQ64x2: *nm = "cmeq"; *ar = "2d"; return; - case ARM64vecb_CMEQ32x4: *nm = "cmeq"; *ar = "4s"; return; - case ARM64vecb_CMEQ16x8: *nm = "cmeq"; *ar = "8h"; return; - case ARM64vecb_CMEQ8x16: 
*nm = "cmeq"; *ar = "16b"; return; - case ARM64vecb_CMHI64x2: *nm = "cmhi"; *ar = "2d"; return; - case ARM64vecb_CMHI32x4: *nm = "cmhi"; *ar = "4s"; return; - case ARM64vecb_CMHI16x8: *nm = "cmhi"; *ar = "8h"; return; - case ARM64vecb_CMHI8x16: *nm = "cmhi"; *ar = "16b"; return; - case ARM64vecb_CMGT64x2: *nm = "cmgt"; *ar = "2d"; return; - case ARM64vecb_CMGT32x4: *nm = "cmgt"; *ar = "4s"; return; - case ARM64vecb_CMGT16x8: *nm = "cmgt"; *ar = "8h"; return; - case ARM64vecb_CMGT8x16: *nm = "cmgt"; *ar = "16b"; return; - case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return; - case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return; - case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return; - case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return; - case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return; - case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return; - case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return; - case ARM64vecb_UZP164x2: *nm = "uzp1"; *ar = "2d"; return; - case ARM64vecb_UZP132x4: *nm = "uzp1"; *ar = "4s"; return; - case ARM64vecb_UZP116x8: *nm = "uzp1"; *ar = "8h"; return; - case ARM64vecb_UZP18x16: *nm = "uzp1"; *ar = "16b"; return; - case ARM64vecb_UZP264x2: *nm = "uzp2"; *ar = "2d"; return; - case ARM64vecb_UZP232x4: *nm = "uzp2"; *ar = "4s"; return; - case ARM64vecb_UZP216x8: *nm = "uzp2"; *ar = "8h"; return; - case ARM64vecb_UZP28x16: *nm = "uzp2"; *ar = "16b"; return; - case ARM64vecb_ZIP132x4: *nm = "zip1"; *ar = "4s"; return; - case ARM64vecb_ZIP116x8: *nm = "zip1"; *ar = "8h"; return; - case ARM64vecb_ZIP18x16: *nm = "zip1"; *ar = "16b"; return; - case ARM64vecb_ZIP232x4: *nm = "zip2"; *ar = "4s"; return; - case ARM64vecb_ZIP216x8: *nm = "zip2"; *ar = "8h"; return; - case ARM64vecb_ZIP28x16: *nm = "zip2"; *ar = "16b"; return; - case ARM64vecb_PMUL8x16: *nm = "pmul"; *ar = "16b"; return; - case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hb"; return; + case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return; + case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return; + case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return; + case ARM64vecb_ADD8x16: *nm = "add "; *ar = "16b"; return; + case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return; + case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return; + case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return; + case ARM64vecb_SUB8x16: *nm = "sub "; *ar = "16b"; return; + case ARM64vecb_MUL32x4: *nm = "mul "; *ar = "4s"; return; + case ARM64vecb_MUL16x8: *nm = "mul "; *ar = "8h"; return; + case ARM64vecb_MUL8x16: *nm = "mul "; *ar = "16b"; return; + case ARM64vecb_FADD64x2: *nm = "fadd "; *ar = "2d"; return; + case ARM64vecb_FSUB64x2: *nm = "fsub "; *ar = "2d"; return; + case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return; + case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return; + case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return; + case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return; + case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return; + case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return; + case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return; + case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return; + case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return; + case ARM64vecb_UMIN32x4: *nm = "umin "; *ar = "4s"; return; + case ARM64vecb_UMIN16x8: *nm = "umin "; *ar = "8h"; return; + case ARM64vecb_UMIN8x16: *nm = "umin "; *ar = "16b"; return; + case ARM64vecb_SMAX32x4: *nm = "smax "; *ar = "4s"; return; + case 
ARM64vecb_SMAX16x8: *nm = "smax "; *ar = "8h"; return; + case ARM64vecb_SMAX8x16: *nm = "smax "; *ar = "16b"; return; + case ARM64vecb_SMIN32x4: *nm = "smin "; *ar = "4s"; return; + case ARM64vecb_SMIN16x8: *nm = "smin "; *ar = "8h"; return; + case ARM64vecb_SMIN8x16: *nm = "smin "; *ar = "16b"; return; + case ARM64vecb_AND: *nm = "and "; *ar = "16b"; return; + case ARM64vecb_ORR: *nm = "orr "; *ar = "16b"; return; + case ARM64vecb_XOR: *nm = "eor "; *ar = "16b"; return; + case ARM64vecb_CMEQ64x2: *nm = "cmeq "; *ar = "2d"; return; + case ARM64vecb_CMEQ32x4: *nm = "cmeq "; *ar = "4s"; return; + case ARM64vecb_CMEQ16x8: *nm = "cmeq "; *ar = "8h"; return; + case ARM64vecb_CMEQ8x16: *nm = "cmeq "; *ar = "16b"; return; + case ARM64vecb_CMHI64x2: *nm = "cmhi "; *ar = "2d"; return; + case ARM64vecb_CMHI32x4: *nm = "cmhi "; *ar = "4s"; return; + case ARM64vecb_CMHI16x8: *nm = "cmhi "; *ar = "8h"; return; + case ARM64vecb_CMHI8x16: *nm = "cmhi "; *ar = "16b"; return; + case ARM64vecb_CMGT64x2: *nm = "cmgt "; *ar = "2d"; return; + case ARM64vecb_CMGT32x4: *nm = "cmgt "; *ar = "4s"; return; + case ARM64vecb_CMGT16x8: *nm = "cmgt "; *ar = "8h"; return; + case ARM64vecb_CMGT8x16: *nm = "cmgt "; *ar = "16b"; return; + case ARM64vecb_FCMEQ64x2: *nm = "fcmeq"; *ar = "2d"; return; + case ARM64vecb_FCMEQ32x4: *nm = "fcmeq"; *ar = "4s"; return; + case ARM64vecb_FCMGE64x2: *nm = "fcmge"; *ar = "2d"; return; + case ARM64vecb_FCMGE32x4: *nm = "fcmge"; *ar = "4s"; return; + case ARM64vecb_FCMGT64x2: *nm = "fcmgt"; *ar = "2d"; return; + case ARM64vecb_FCMGT32x4: *nm = "fcmgt"; *ar = "4s"; return; + case ARM64vecb_TBL1: *nm = "tbl "; *ar = "16b"; return; + case ARM64vecb_UZP164x2: *nm = "uzp1 "; *ar = "2d"; return; + case ARM64vecb_UZP132x4: *nm = "uzp1 "; *ar = "4s"; return; + case ARM64vecb_UZP116x8: *nm = "uzp1 "; *ar = "8h"; return; + case ARM64vecb_UZP18x16: *nm = "uzp1 "; *ar = "16b"; return; + case ARM64vecb_UZP264x2: *nm = "uzp2 "; *ar = "2d"; return; + case ARM64vecb_UZP232x4: *nm = "uzp2 "; *ar = "4s"; return; + case ARM64vecb_UZP216x8: *nm = "uzp2 "; *ar = "8h"; return; + case ARM64vecb_UZP28x16: *nm = "uzp2 "; *ar = "16b"; return; + case ARM64vecb_ZIP132x4: *nm = "zip1 "; *ar = "4s"; return; + case ARM64vecb_ZIP116x8: *nm = "zip1 "; *ar = "8h"; return; + case ARM64vecb_ZIP18x16: *nm = "zip1 "; *ar = "16b"; return; + case ARM64vecb_ZIP232x4: *nm = "zip2 "; *ar = "4s"; return; + case ARM64vecb_ZIP216x8: *nm = "zip2 "; *ar = "8h"; return; + case ARM64vecb_ZIP28x16: *nm = "zip2 "; *ar = "16b"; return; + case ARM64vecb_PMUL8x16: *nm = "pmul "; *ar = "16b"; return; + case ARM64vecb_PMULL8x8: *nm = "pmull"; *ar = "8hbb"; return; + case ARM64vecb_UMULL2DSS: *nm = "umull"; *ar = "2dss"; return; + case ARM64vecb_UMULL4SHH: *nm = "umull"; *ar = "4shh"; return; + case ARM64vecb_UMULL8HBB: *nm = "umull"; *ar = "8hbb"; return; + case ARM64vecb_SMULL2DSS: *nm = "smull"; *ar = "2dss"; return; + case ARM64vecb_SMULL4SHH: *nm = "smull"; *ar = "4shh"; return; + case ARM64vecb_SMULL8HBB: *nm = "smull"; *ar = "8hbb"; return; default: vpanic("showARM64VecBinOp"); } } @@ -5137,6 +5143,14 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, 011 01110 00 1 m 100111 n d PMUL Vd.16b, Vn.16b, Vm.16b 000 01110 00 1 m 111000 n d PMULL Vd.8h, Vn.8b, Vm.8b + + 001 01110 10 1 m 110000 n d UMULL Vd.2d, Vn.2s, Vm.2s + 001 01110 01 1 m 110000 n d UMULL Vd.4s, Vn.4h, Vm.4h + 001 01110 00 1 m 110000 n d UMULL Vd.8h, Vn.8b, Vm.8b + + 000 01110 10 1 m 110000 n d SMULL Vd.2d, Vn.2s, Vm.2s + 000 01110 01 1 m 110000 n d SMULL Vd.4s, Vn.4h, 
Vm.4h + 000 01110 00 1 m 110000 n d SMULL Vd.8h, Vn.8b, Vm.8b */ UInt vD = qregNo(i->ARM64in.VBinV.dst); UInt vN = qregNo(i->ARM64in.VBinV.argL); @@ -5368,6 +5382,26 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = X_3_8_5_6_5_5(X000, X01110001, vM, X111000, vN, vD); break; + case ARM64vecb_UMULL2DSS: + *p++ = X_3_8_5_6_5_5(X001, X01110101, vM, X110000, vN, vD); + break; + case ARM64vecb_UMULL4SHH: + *p++ = X_3_8_5_6_5_5(X001, X01110011, vM, X110000, vN, vD); + break; + case ARM64vecb_UMULL8HBB: + *p++ = X_3_8_5_6_5_5(X001, X01110001, vM, X110000, vN, vD); + break; + + case ARM64vecb_SMULL2DSS: + *p++ = X_3_8_5_6_5_5(X000, X01110101, vM, X110000, vN, vD); + break; + case ARM64vecb_SMULL4SHH: + *p++ = X_3_8_5_6_5_5(X000, X01110011, vM, X110000, vN, vD); + break; + case ARM64vecb_SMULL8HBB: + *p++ = X_3_8_5_6_5_5(X000, X01110001, vM, X110000, vN, vD); + break; + default: goto bad; } diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 5296636522..1f7c10f680 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -346,6 +346,10 @@ typedef ARM64vecb_ZIP216x8, ARM64vecb_ZIP28x16, ARM64vecb_PMUL8x16, ARM64vecb_PMULL8x8, + ARM64vecb_UMULL2DSS, + ARM64vecb_UMULL4SHH, ARM64vecb_UMULL8HBB, + ARM64vecb_SMULL2DSS, + ARM64vecb_SMULL4SHH, ARM64vecb_SMULL8HBB, ARM64vecb_INVALID } ARM64VecBinOp; diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 32e0b5f2fc..b63548b313 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -5683,16 +5683,33 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) break; } - case Iop_PolynomialMull8x8: { + case Iop_PolynomialMull8x8: + case Iop_Mull32Ux2: + case Iop_Mull16Ux4: + case Iop_Mull8Ux8: + case Iop_Mull32Sx2: + case Iop_Mull16Sx4: + case Iop_Mull8Sx8: + { HReg iSrcL = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg iSrcR = iselIntExpr_R(env, e->Iex.Binop.arg2); HReg vSrcL = newVRegV(env); HReg vSrcR = newVRegV(env); HReg dst = newVRegV(env); + ARM64VecBinOp op = ARM64vecb_INVALID; + switch (e->Iex.Binop.op) { + case Iop_PolynomialMull8x8: op = ARM64vecb_PMULL8x8; break; + case Iop_Mull32Ux2: op = ARM64vecb_UMULL2DSS; break; + case Iop_Mull16Ux4: op = ARM64vecb_UMULL4SHH; break; + case Iop_Mull8Ux8: op = ARM64vecb_UMULL8HBB; break; + case Iop_Mull32Sx2: op = ARM64vecb_SMULL2DSS; break; + case Iop_Mull16Sx4: op = ARM64vecb_SMULL4SHH; break; + case Iop_Mull8Sx8: op = ARM64vecb_SMULL8HBB; break; + default: vassert(0); + } addInstr(env, ARM64Instr_VQfromXX(vSrcL, iSrcL, iSrcL)); addInstr(env, ARM64Instr_VQfromXX(vSrcR, iSrcR, iSrcR)); - addInstr(env, ARM64Instr_VBinV(ARM64vecb_PMULL8x8, - dst, vSrcL, vSrcR)); + addInstr(env, ARM64Instr_VBinV(op, dst, vSrcL, vSrcR)); return dst; } -- 2.47.2
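Illustration (not part of the patch): the interleave-and-shift idiom that math_WIDEN_LANES introduces is easiest to see on a concrete case. Interleaving a vector with itself duplicates every narrow lane into one twice-as-wide slot, and a right shift by the narrow lane width then finishes the extension -- an arithmetic shift (SarN*) for sign-widening, a logical shift (ShrN*) for zero-widening. The sketch below is plain C rather than VEX IR, models only the X01 (16-to-32-bit, lower half) case one lane at a time, and all names in it are local to the example.

/* Scalar model of math_WIDEN_LANES for the 16 -> 32 bit, lower-half case.
   InterleaveLO16x8(src, src) leaves both 16-bit halves of each 32-bit slot
   holding the same narrow lane; shifting the slot right by 16 then yields
   the sign- or zero-extended value, as the X01 arm of the helper does.   */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   int16_t src[4] = { -3, 7, -100, 5 };      /* narrow source lanes          */
   for (int i = 0; i < 2; i++) {             /* lower half -> two wide lanes */
      uint16_t lane = (uint16_t)src[i];
      uint32_t slot = ((uint32_t)lane << 16) | lane;   /* interleave w/ self */
      int32_t  swide = (int32_t)slot >> 16;  /* SarN32x4 #16: sign-widen
                                                (arithmetic >>, as on gcc)   */
      uint32_t zwide = slot >> 16;           /* ShrN32x4 #16: zero-widen     */
      printf("lane %d: %4d -> signed %6d, unsigned %5u\n",
             i, src[i], swide, zwide);
   }
   return 0;
}

Because the helper needs only one interleave and one shift per half, the same routine serves SADDL/UADDL/SSUBL/USUBL as well as the SABAL/UABAL/SABDL/UABDL cases above, which simply feed its output into the appropriate wide add, subtract or absolute-difference op.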
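Sanity check (again not part of the patch): the new UMULL/SMULL cases in emit_ARM64Instr follow the bit layout listed in the comment block just above them. Assuming X_3_8_5_6_5_5 packs its six fields into bits [31:29], [28:21], [20:16], [15:10], [9:5] and [4:0] -- which is what its field widths and existing uses suggest, though that layout is an assumption here -- one of the new encodings can be reproduced by hand:

/* Hand-packs UMULL Vd.2d, Vn.2s, Vm.2s (d=0, n=1, m=2) from the pattern
   001 01110 10 1 m 110000 n d.  pack_3_8_5_6_5_5 is a stand-in for the
   assumed layout of VEX's X_3_8_5_6_5_5, not the real helper.            */
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_3_8_5_6_5_5(uint32_t f1, uint32_t f2, uint32_t f3,
                                 uint32_t f4, uint32_t f5, uint32_t f6)
{
   return (f1 << 29) | (f2 << 21) | (f3 << 16) | (f4 << 10) | (f5 << 5) | f6;
}

int main(void)
{
   uint32_t insn = pack_3_8_5_6_5_5(0x01 /*001*/,  0x75 /*01110101*/,
                                    2    /*Vm*/,   0x30 /*110000*/,
                                    1    /*Vn*/,   0    /*Vd*/);
   printf("umull v0.2d, v1.2s, v2.2s = 0x%08x\n", insn);   /* 0x2ea2c020 */
   return 0;
}

The SMULL variants differ only in the leading three-bit group (000 instead of 001, i.e. the U bit cleared), and the 4SHH/8HBB forms only in the size bits inside the second group, matching the three size-specific cases emitted for each of UMULL and SMULL.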