From cb95e1b5c47873d9d1ba91adb39581f4945b8018 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Sun, 15 Jun 2014 19:36:29 +0000 Subject: [PATCH] arm64: implement: abs d_d, neg d_d, abs std7_std7, addhn, subhn, raddhn, rsubhn git-svn-id: svn://svn.valgrind.org/vex/trunk@2877 --- VEX/priv/guest_arm64_toIR.c | 141 +++++++++++++++++++++++++++++++++++- VEX/priv/host_arm64_defs.c | 22 ++++++ VEX/priv/host_arm64_defs.h | 123 +++++++++++-------------------- VEX/priv/host_arm64_isel.c | 11 ++- VEX/priv/ir_defs.c | 3 +- VEX/pub/libvex_ir.h | 2 +- 6 files changed, 219 insertions(+), 83 deletions(-) diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 71e20aa977..141b456083 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -273,6 +273,12 @@ static IRExpr* mkU32 ( UInt i ) return IRExpr_Const(IRConst_U32(i)); } +static IRExpr* mkU16 ( UInt i ) +{ + vassert(i < 65536); + return IRExpr_Const(IRConst_U16(i)); +} + static IRExpr* mkU8 ( UInt i ) { vassert(i < 256); @@ -3183,7 +3189,7 @@ const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size ) { vassert(bitQ <= 1 && size <= 3); const HChar* nms[8] - = { "2d", "4s", "8h", "16b", "1d", "2s", "4h", "8b" }; + = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" }; UInt ix = (bitQ << 2) | size; vassert(ix < 8); return nms[ix]; @@ -5516,6 +5522,36 @@ static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src, } +/* Let |new64| be a V128 in which only the lower 64 bits are interesting, + and the upper can contain any value -- it is ignored. If |is2| is False, + generate IR to put |new64| in the lower half of vector reg |dd| and zero + the upper half. If |is2| is True, generate IR to put |new64| in the upper + half of vector reg |dd| and leave the lower half unchanged. This + simulates the behaviour of the "foo/foo2" instructions in which the + destination is half the width of sources, for example addhn/addhn2. +*/ +static +void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 ) +{ + if (is2) { + /* Get the old contents of Vdd, zero the upper half, and replace + it with 'x'. */ + IRTemp t_zero_oldLO = newTemp(Ity_V128); + assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd))); + IRTemp t_newHI_zero = newTemp(Ity_V128); + assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64), + mkV128(0x0000))); + IRTemp res = newTemp(Ity_V128); + assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO), + mkexpr(t_newHI_zero))); + putQReg128(dd, mkexpr(res)); + } else { + /* This is simple. 
*/ + putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64))); + } +} + + static Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn) { @@ -6153,6 +6189,22 @@ Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) { + /* -------- 0,11,01011 ABS d_d -------- */ + putQReg128(dd, unop(Iop_ZeroHI64ofV128, + unop(Iop_Abs64x2, getQReg128(nn)))); + DIP("abs d%u, d%u\n", dd, nn); + return True; + } + + if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) { + /* -------- 1,11,01011 NEG d_d -------- */ + putQReg128(dd, unop(Iop_ZeroHI64ofV128, + binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn)))); + DIP("neg d%u, d%u\n", dd, nn); + return True; + } + # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) return False; # undef INSN @@ -6338,7 +6390,80 @@ Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn) static Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn) { + /* 31 30 29 28 23 21 20 15 11 9 4 + 0 Q U 01110 size 1 m opcode 00 n d + Decode fields: u,opcode + */ # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + if (INSN(31,31) != 0 + || INSN(28,24) != BITS5(0,1,1,1,0) + || INSN(21,21) != 1 + || INSN(11,10) != BITS2(0,0)) { + return False; + } + UInt bitQ = INSN(30,30); + UInt bitU = INSN(29,29); + UInt size = INSN(23,22); + UInt mm = INSN(20,16); + UInt opcode = INSN(15,12); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + vassert(size < 4); + Bool is2 = bitQ == 1; + + if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) { + /* -------- 0,0100 ADDHN{2} -------- */ + /* -------- 1,0100 RADDHN{2} -------- */ + /* -------- 0,0110 SUBHN{2} -------- */ + /* -------- 1,0110 RSUBHN{2} -------- */ + /* Narrows, and size refers to the narrowed lanes. */ + if (size == X11) return False; + vassert(size <= 2); + const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + const IROp opSUB[3] = { Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + const IROp opSHR[3] = { Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 }; + const UInt shift[3] = { 8, 16, 32 }; + const IROp opCAT[3] = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8, + Iop_CatEvenLanes32x4 }; + Bool isADD = opcode == BITS4(0,1,0,0); + Bool isR = bitU == 1; + /* Combined elements in wide lanes */ + IRTemp wide = newTemp(Ity_V128); + IRExpr* wideE = binop(isADD ? opADD[size] : opSUB[size], + getQReg128(nn), getQReg128(mm)); + if (isR) { + IRType ty = Ity_INVALID; + IRTemp rcS = IRTemp_INVALID; + switch (size) { + case X00: ty = Ity_I16; + rcS = newTemp(ty); assign(rcS, mkU16(0x80)); break; + case X01: ty = Ity_I32; + rcS = newTemp(ty); assign(rcS, mkU32(0x8000)); break; + case X10: ty = Ity_I64; + rcS = newTemp(ty); assign(rcS, mkU64(0x80000000)); break; + default: vassert(0); + } + IRTemp rcV = math_DUP_TO_V128(rcS, ty); + wideE = binop(opADD[size], wideE, mkexpr(rcV)); + } + assign(wide, wideE); + /* Top halves of elements, still in wide lanes */ + IRTemp shrd = newTemp(Ity_V128); + assign(shrd, binop(opSHR[size], mkexpr(wide), mkU8(shift[size]))); + /* Elements now compacted into lower 64 bits */ + IRTemp new64 = newTemp(Ity_V128); + assign(new64, binop(opCAT[size], mkexpr(shrd), mkexpr(shrd))); + putLO64andZUorPutHI64(is2, dd, new64); + const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); + const HChar* arrWide = nameArr_Q_SZ(1, size+1); + const HChar* nm = isADD ? (isR ? "raddhn" : "addhn") + : (isR ? "rsubhn" : "subhn"); + DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? 
"2" : "", + nameQReg128(dd), arrNarrow, + nameQReg128(nn), arrWide, nameQReg128(mm), arrWide); + return True; + } + return False; # undef INSN } @@ -6858,6 +6983,20 @@ Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) { + /* -------- 0,xx,01011: ABS std7_std7 -------- */ + if (bitQ == 0 && size == X11) return False; // implied 1d case + const IROp opABS[4] + = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 }; + IRTemp res = newTemp(Ity_V128); + assign(res, unop(opABS[size], getQReg128(nn))); + putQReg128(dd, bitQ == 0 ? unop(Iop_ZeroHI64ofV128, mkexpr(res)) + : mkexpr(res)); + const HChar* arr = nameArr_Q_SZ(bitQ, size); + DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr); + return True; + } + if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) { /* -------- 1,xx,01011: NEG std7_std7 -------- */ if (bitQ == 0 && size == X11) return False; // implied 1d case diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index 29f78db64c..5c5988aa6f 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -929,6 +929,10 @@ static void showARM64VecUnaryOp(/*OUT*/const HChar** nm, case ARM64vecu_FNEG32x4: *nm = "fneg "; *ar = "4s"; return; case ARM64vecu_FABS64x2: *nm = "fabs "; *ar = "2d"; return; case ARM64vecu_FABS32x4: *nm = "fabs "; *ar = "4s"; return; + case ARM64vecu_ABS64x2: *nm = "abs"; *ar = "2d"; return; + case ARM64vecu_ABS32x4: *nm = "abs"; *ar = "4s"; return; + case ARM64vecu_ABS16x8: *nm = "abs"; *ar = "8h"; return; + case ARM64vecu_ABS8x16: *nm = "abs"; *ar = "16b"; return; case ARM64vecu_NOT: *nm = "not "; *ar = "all"; return; default: vpanic("showARM64VecUnaryOp"); } @@ -3422,6 +3426,7 @@ static inline UChar qregNo ( HReg r ) #define X100101 BITS8(0,0, 1,0,0,1,0,1) #define X100110 BITS8(0,0, 1,0,0,1,1,0) #define X100111 BITS8(0,0, 1,0,0,1,1,1) +#define X101110 BITS8(0,0, 1,0,1,1,1,0) #define X110000 BITS8(0,0, 1,1,0,0,0,0) #define X110001 BITS8(0,0, 1,1,0,0,0,1) #define X110101 BITS8(0,0, 1,1,0,1,0,1) @@ -5309,6 +5314,11 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, 011 01110 11 1 00000 111110 n d FNEG Vd.2d, Vn.2d 011 01110 10 1 00000 111110 n d FNEG Vd.4s, Vn.4s 011 01110 00 1 00000 010110 n d NOT Vd.16b, Vn.16b + + 010 01110 11 1 00000 101110 n d ABS Vd.2d, Vn.2d + 010 01110 10 1 00000 101110 n d ABS Vd.4s, Vn.4s + 010 01110 01 1 00000 101110 n d ABS Vd.8h, Vn.8h + 010 01110 00 1 00000 101110 n d ABS Vd.16b, Vn.16b */ UInt vD = qregNo(i->ARM64in.VUnaryV.dst); UInt vN = qregNo(i->ARM64in.VUnaryV.arg); @@ -5328,6 +5338,18 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, case ARM64vecu_NOT: *p++ = X_3_8_5_6_5_5(X011, X01110001, X00000, X010110, vN, vD); break; + case ARM64vecu_ABS64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, X00000, X101110, vN, vD); + break; + case ARM64vecu_ABS32x4: + *p++ = X_3_8_5_6_5_5(X010, X01110101, X00000, X101110, vN, vD); + break; + case ARM64vecu_ABS16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110011, X00000, X101110, vN, vD); + break; + case ARM64vecu_ABS8x16: + *p++ = X_3_8_5_6_5_5(X010, X01110001, X00000, X101110, vN, vD); + break; default: goto bad; } diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index baec464cf8..38b2910fb5 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -307,102 +307,67 @@ typedef typedef enum { - ARM64vecb_ADD64x2=120, - ARM64vecb_ADD32x4, - ARM64vecb_ADD16x8, - ARM64vecb_ADD8x16, - ARM64vecb_SUB64x2, - ARM64vecb_SUB32x4, - ARM64vecb_SUB16x8, - 
ARM64vecb_SUB8x16, - ARM64vecb_MUL32x4, - ARM64vecb_MUL16x8, - ARM64vecb_MUL8x16, - ARM64vecb_FADD64x2, - ARM64vecb_FSUB64x2, - ARM64vecb_FMUL64x2, - ARM64vecb_FDIV64x2, - ARM64vecb_FADD32x4, - ARM64vecb_FSUB32x4, - ARM64vecb_FMUL32x4, - ARM64vecb_FDIV32x4, - ARM64vecb_UMAX32x4, - ARM64vecb_UMAX16x8, - ARM64vecb_UMAX8x16, - ARM64vecb_UMIN32x4, - ARM64vecb_UMIN16x8, - ARM64vecb_UMIN8x16, - ARM64vecb_SMAX32x4, - ARM64vecb_SMAX16x8, - ARM64vecb_SMAX8x16, - ARM64vecb_SMIN32x4, - ARM64vecb_SMIN16x8, - ARM64vecb_SMIN8x16, + ARM64vecb_ADD64x2=120, ARM64vecb_ADD32x4, + ARM64vecb_ADD16x8, ARM64vecb_ADD8x16, + ARM64vecb_SUB64x2, ARM64vecb_SUB32x4, + ARM64vecb_SUB16x8, ARM64vecb_SUB8x16, + ARM64vecb_MUL32x4, + ARM64vecb_MUL16x8, ARM64vecb_MUL8x16, + ARM64vecb_FADD64x2, ARM64vecb_FADD32x4, + ARM64vecb_FSUB64x2, ARM64vecb_FSUB32x4, + ARM64vecb_FMUL64x2, ARM64vecb_FMUL32x4, + ARM64vecb_FDIV64x2, ARM64vecb_FDIV32x4, + ARM64vecb_UMAX32x4, + ARM64vecb_UMAX16x8, ARM64vecb_UMAX8x16, + ARM64vecb_UMIN32x4, + ARM64vecb_UMIN16x8, ARM64vecb_UMIN8x16, + ARM64vecb_SMAX32x4, + ARM64vecb_SMAX16x8, ARM64vecb_SMAX8x16, + ARM64vecb_SMIN32x4, + ARM64vecb_SMIN16x8, ARM64vecb_SMIN8x16, ARM64vecb_AND, ARM64vecb_ORR, ARM64vecb_XOR, - ARM64vecb_CMEQ64x2, - ARM64vecb_CMEQ32x4, - ARM64vecb_CMEQ16x8, - ARM64vecb_CMEQ8x16, - ARM64vecb_CMHI64x2, /* >u */ - ARM64vecb_CMHI32x4, - ARM64vecb_CMHI16x8, - ARM64vecb_CMHI8x16, - ARM64vecb_CMGT64x2, /* >s */ - ARM64vecb_CMGT32x4, - ARM64vecb_CMGT16x8, - ARM64vecb_CMGT8x16, - ARM64vecb_FCMEQ64x2, - ARM64vecb_FCMEQ32x4, - ARM64vecb_FCMGE64x2, - ARM64vecb_FCMGE32x4, - ARM64vecb_FCMGT64x2, - ARM64vecb_FCMGT32x4, + ARM64vecb_CMEQ64x2, ARM64vecb_CMEQ32x4, + ARM64vecb_CMEQ16x8, ARM64vecb_CMEQ8x16, + ARM64vecb_CMHI64x2, ARM64vecb_CMHI32x4, /* >u */ + ARM64vecb_CMHI16x8, ARM64vecb_CMHI8x16, + ARM64vecb_CMGT64x2, ARM64vecb_CMGT32x4, /* >s */ + ARM64vecb_CMGT16x8, ARM64vecb_CMGT8x16, + ARM64vecb_FCMEQ64x2, ARM64vecb_FCMEQ32x4, + ARM64vecb_FCMGE64x2, ARM64vecb_FCMGE32x4, + ARM64vecb_FCMGT64x2, ARM64vecb_FCMGT32x4, ARM64vecb_TBL1, - ARM64vecb_UZP164x2, - ARM64vecb_UZP132x4, - ARM64vecb_UZP116x8, - ARM64vecb_UZP18x16, - ARM64vecb_UZP264x2, - ARM64vecb_UZP232x4, - ARM64vecb_UZP216x8, - ARM64vecb_UZP28x16, - ARM64vecb_ZIP132x4, - ARM64vecb_ZIP116x8, - ARM64vecb_ZIP18x16, - ARM64vecb_ZIP232x4, - ARM64vecb_ZIP216x8, - ARM64vecb_ZIP28x16, + ARM64vecb_UZP164x2, ARM64vecb_UZP132x4, + ARM64vecb_UZP116x8, ARM64vecb_UZP18x16, + ARM64vecb_UZP264x2, ARM64vecb_UZP232x4, + ARM64vecb_UZP216x8, ARM64vecb_UZP28x16, + ARM64vecb_ZIP132x4, ARM64vecb_ZIP116x8, + ARM64vecb_ZIP18x16, ARM64vecb_ZIP232x4, + ARM64vecb_ZIP216x8, ARM64vecb_ZIP28x16, ARM64vecb_INVALID } ARM64VecBinOp; typedef enum { - ARM64vecu_FNEG64x2=300, - ARM64vecu_FNEG32x4, - ARM64vecu_FABS64x2, - ARM64vecu_FABS32x4, + ARM64vecu_FNEG64x2=300, ARM64vecu_FNEG32x4, + ARM64vecu_FABS64x2, ARM64vecu_FABS32x4, ARM64vecu_NOT, + ARM64vecu_ABS64x2, ARM64vecu_ABS32x4, + ARM64vecu_ABS16x8, ARM64vecu_ABS8x16, ARM64vecu_INVALID } ARM64VecUnaryOp; typedef enum { - ARM64vecsh_USHR64x2=350, - ARM64vecsh_USHR32x4, - ARM64vecsh_USHR16x8, - ARM64vecsh_USHR8x16, - ARM64vecsh_SSHR64x2, - ARM64vecsh_SSHR32x4, - ARM64vecsh_SSHR16x8, - ARM64vecsh_SSHR8x16, - ARM64vecsh_SHL64x2, - ARM64vecsh_SHL32x4, - ARM64vecsh_SHL16x8, - ARM64vecsh_SHL8x16, + ARM64vecsh_USHR64x2=350, ARM64vecsh_USHR32x4, + ARM64vecsh_USHR16x8, ARM64vecsh_USHR8x16, + ARM64vecsh_SSHR64x2, ARM64vecsh_SSHR32x4, + ARM64vecsh_SSHR16x8, ARM64vecsh_SSHR8x16, + ARM64vecsh_SHL64x2, ARM64vecsh_SHL32x4, + ARM64vecsh_SHL16x8, 
ARM64vecsh_SHL8x16, ARM64vecsh_INVALID } ARM64VecShiftOp; diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 17d76e479f..d12c72d863 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -4413,7 +4413,12 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Abs64Fx2: case Iop_Abs32Fx4: case Iop_Neg64Fx2: - case Iop_Neg32Fx4: { + case Iop_Neg32Fx4: + case Iop_Abs64x2: + case Iop_Abs32x4: + case Iop_Abs16x8: + case Iop_Abs8x16: + { HReg res = newVRegV(env); HReg arg = iselV128Expr(env, e->Iex.Unop.arg); ARM64VecUnaryOp op = ARM64vecu_INVALID; @@ -4423,6 +4428,10 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Abs32Fx4: op = ARM64vecu_FABS32x4; break; case Iop_Neg64Fx2: op = ARM64vecu_FNEG64x2; break; case Iop_Neg32Fx4: op = ARM64vecu_FNEG32x4; break; + case Iop_Abs64x2: op = ARM64vecu_ABS64x2; break; + case Iop_Abs32x4: op = ARM64vecu_ABS32x4; break; + case Iop_Abs16x8: op = ARM64vecu_ABS16x8; break; + case Iop_Abs8x16: op = ARM64vecu_ABS8x16; break; default: vassert(0); } addInstr(env, ARM64Instr_VUnaryV(op, res, arg)); diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index fc3ef47bc9..4d65dafa37 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -726,6 +726,7 @@ void ppIROp ( IROp op ) case Iop_Abs8x16: vex_printf("Abs8x16"); return; case Iop_Abs16x8: vex_printf("Abs16x8"); return; case Iop_Abs32x4: vex_printf("Abs32x4"); return; + case Iop_Abs64x2: vex_printf("Abs64x2"); return; case Iop_Add8x16: vex_printf("Add8x16"); return; case Iop_Add16x8: vex_printf("Add16x8"); return; @@ -2910,7 +2911,7 @@ void typeOfPrimop ( IROp op, case Iop_Reverse32_8x16: case Iop_Reverse32_16x8: case Iop_Reverse16_8x16: case Iop_Neg64Fx2: case Iop_Neg32Fx4: - case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4: + case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4: case Iop_Abs64x2: case Iop_CipherSV128: case Iop_PwBitMtxXpose64x2: case Iop_ZeroHI64ofV128: case Iop_ZeroHI96ofV128: diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 7bc68c8b09..c61ce2384a 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1483,7 +1483,7 @@ typedef Iop_PwBitMtxXpose64x2, /* ABSOLUTE VALUE */ - Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, + Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2, /* AVERAGING: note: (arg1 + arg2 + 1) >>u 1 */ Iop_Avg8Ux16, Iop_Avg16Ux8, Iop_Avg32Ux4, -- 2.47.2
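
The ADDHN/RADDHN/SUBHN/RSUBHN decode above builds the sum or difference in wide lanes, optionally adds a rounding constant of 1 << (narrow esize - 1), shifts each wide lane right by the narrow element width, compacts the high halves into 64 bits, and hands the result to putLO64andZUorPutHI64. Below is a minimal standalone C reference model of that behaviour for one arrangement (16-bit narrow lanes, i.e. size == X01); it is a sketch for illustration only, not part of the patch, and the V128 struct, helper names and chosen arrangement are assumptions made here.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* One 128-bit vector register, viewed as 16 bytes, little-endian lanes. */
typedef struct { uint8_t b[16]; } V128;

static void putLane64 ( V128* v, int lane, uint64_t x )
{
   memcpy(&v->b[lane * 8], &x, 8);
}

static uint64_t getLane64 ( const V128* v, int lane )
{
   uint64_t r;
   memcpy(&r, &v->b[lane * 8], 8);
   return r;
}

/* (R)ADDHN{2} for 32-bit wide lanes narrowed to 16-bit lanes.
   'round' selects RADDHN (rounding constant 1 << 15); 'is2' selects the
   ...HN2 form, which writes the upper 64 bits of Vd and leaves the lower
   64 bits unchanged -- the behaviour putLO64andZUorPutHI64 generates
   as IR. */
static void ref_addhn_32to16 ( V128* vd, const V128* vn, const V128* vm,
                               int round, int is2 )
{
   uint16_t narrow[4];
   uint32_t rc = round ? (1u << 15) : 0;
   for (int i = 0; i < 4; i++) {
      uint32_t n, m;
      memcpy(&n, &vn->b[i * 4], 4);
      memcpy(&m, &vm->b[i * 4], 4);
      uint32_t wide = n + m + rc;          /* wraps mod 2^32, as in HW */
      narrow[i] = (uint16_t)(wide >> 16);  /* keep the high half */
   }
   uint64_t new64;
   memcpy(&new64, narrow, 8);
   if (is2) {
      putLane64(vd, 1, new64);   /* result goes to the upper half */
      /* lower half of Vd is preserved */
   } else {
      putLane64(vd, 0, new64);   /* result goes to the lower half */
      putLane64(vd, 1, 0);       /* upper half is zeroed */
   }
}

int main ( void )
{
   V128 vn, vm, vd;
   memset(&vd, 0xAA, sizeof vd);
   for (int i = 0; i < 4; i++) {
      uint32_t n = 0x00018000u * (uint32_t)(i + 1);
      uint32_t m = 7;
      memcpy(&vn.b[i * 4], &n, 4);
      memcpy(&vm.b[i * 4], &m, 4);
   }
   ref_addhn_32to16(&vd, &vn, &vm, /*round=*/1, /*is2=*/0);
   printf("raddhn 4h: hi=%016llx lo=%016llx\n",
          (unsigned long long)getLane64(&vd, 1),
          (unsigned long long)getLane64(&vd, 0));
   return 0;
}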
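
Similarly, the new ARM64vecu_ABS* cases in emit_ARM64Instr pack the bit patterns listed in the comment block through X_3_8_5_6_5_5. The following standalone check reproduces the ABS Vd.2d, Vn.2d instruction word; it is not part of the patch, and the packer is re-implemented locally on the assumption that the six fields (3+8+5+6+5+5 bits) are packed most-significant field first.

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for the emitter's field packer: 3+8+5+6+5+5 = 32 bits,
   first field most significant. */
static uint32_t x_3_8_5_6_5_5 ( uint32_t f1, uint32_t f2, uint32_t f3,
                                uint32_t f4, uint32_t f5, uint32_t f6 )
{
   return (f1 << 29) | (f2 << 21) | (f3 << 16)
          | (f4 << 10) | (f5 << 5) | f6;
}

int main ( void )
{
   /* ABS Vd.2d, Vn.2d with d = 1, n = 2:
      010 01110 11 1 00000 101110 n d     */
   uint32_t insn = x_3_8_5_6_5_5(0x2  /* 010      */,
                                 0x77 /* 01110111 */,
                                 0x00 /* 00000    */,
                                 0x2E /* 101110   */,
                                 2, 1);
   printf("ABS v1.2d, v2.2d -> 0x%08x\n", (unsigned)insn);  /* 0x4ee0b841 */
   return 0;
}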