From: Julian Seward
Date: Sun, 26 Jan 2014 19:11:14 +0000 (+0000)
Subject: Improve front and back end support for SIMD instructions on Arm64.
X-Git-Tag: svn/VALGRIND_3_10_1^2~160
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c4a62f1f1684669092028b4aca027c6836df3793;p=thirdparty%2Fvalgrind.git

Improve front and back end support for SIMD instructions on Arm64.
Implement the following instructions -- some but not necessarily all
laneage combinations:

   LD1 {vT.2d}, [Xn|SP]
   ST1 {vT.2d}, [Xn|SP]
   LD1 {vT.4s}, [Xn|SP]
   ST1 {vT.4s}, [Xn|SP]
   LD1 {vT.8h}, [Xn|SP]
   ST1 {vT.8h}, [Xn|SP]
   LD1 {vT.16b}, [Xn|SP]
   ST1 {vT.16b}, [Xn|SP]
   LD1 {vT.1d}, [Xn|SP]
   ST1 {vT.1d}, [Xn|SP]
   LD1 {vT.2s}, [Xn|SP]
   ST1 {vT.2s}, [Xn|SP]
   LD1 {vT.4h}, [Xn|SP]
   ST1 {vT.4h}, [Xn|SP]
   LD1 {vT.8b}, [Xn|SP]
   ST1 {vT.8b}, [Xn|SP]
   ST1 {vT.2d}, [xN|SP], #16
   LD1 {vT.2d}, [xN|SP], #16
   ST1 {vT.4s}, [xN|SP], #16
   ST1 {vT.8h}, [xN|SP], #16
   ST1 {vT.2s}, [xN|SP], #8
   SCVTF Vd, Vn
   UCVTF Vd, Vn
   FADD Vd,Vn,Vm  1
   FSUB Vd,Vn,Vm  2
   FMUL Vd,Vn,Vm  3
   FDIV Vd,Vn,Vm  4
   FMLA Vd,Vn,Vm  5
   FMLS Vd,Vn,Vm  6
   ADD Vd.T, Vn.T, Vm.T
   SUB Vd.T, Vn.T, Vm.T
   XTN {,2}
   DUP Vd.T, Vn.Ts[index]

git-svn-id: svn://svn.valgrind.org/vex/trunk@2810
---

diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index 7fbb6a6ed7..290e1c840c 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -896,7 +896,6 @@ static Int offsetQReg128 ( UInt qregNo )
    }
 }
 
-
 /* Write to a complete Qreg. */
 static void putQReg128 ( UInt qregNo, IRExpr* e )
 {
@@ -929,54 +928,61 @@ static IRType preferredVectorSubTypeFromSize ( UInt szB )
    }
 }
 
-/* Find the offset of the szB'th least significant bytes of the given
-   Qreg.  This requires knowing the endianness of the host. */
-static Int offsetQReg ( UInt szB, UInt qregNo )
+/* Find the offset of the laneNo'th lane of type laneTy in the given
+   Qreg.  Since the host is little-endian, the least significant lane
+   has the lowest offset. */
+static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
 {
    vassert(!host_is_bigendian);
    Int base = offsetQReg128(qregNo);
-   /* Since we're dealing with a little-endian host, all of the
-      sub-parts will have the same offset as the base register.  But
-      we still need to check that szB is valid. */
-   switch (szB) {
-      case 1: case 2: case 4: case 8: case 16: break;
-      default: vassert(0);
+   /* Since the host is little-endian, the least significant lane
+      will be at the lowest address. */
+   /* Restrict this to known types, so as to avoid silently accepting
+      stupid types. */
+   UInt laneSzB = 0;
+   switch (laneTy) {
+      case Ity_F32: case Ity_I32: laneSzB = 4; break;
+      case Ity_F64: case Ity_I64: laneSzB = 8; break;
+      case Ity_V128: laneSzB = 16; break;
+      default: break;
    }
-   return base;
+   vassert(laneSzB > 0);
+   UInt minOff = laneNo * laneSzB;
+   UInt maxOff = minOff + laneSzB - 1;
+   vassert(maxOff < 16);
+   return base + minOff;
 }
 
-static void putQReg ( UInt qregNo, IRExpr* e )
+/* Put to the least significant lane of a Qreg.
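+   Only the low lane is written; the remaining bytes of the register
+   are left unchanged.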
*/ +static void putQRegLO ( UInt qregNo, IRExpr* e ) { IRType ty = typeOfIRExpr(irsb->tyenv, e); - Int off = offsetQReg(sizeofIRType(ty), qregNo); + Int off = offsetQRegLane(qregNo, ty, 0); switch (ty) { - case Ity_I8: break; - case Ity_I16: break; - case Ity_I32: break; - case Ity_F32: break; - case Ity_I64: break; - case Ity_F64: break; - case Ity_V128: break; - default: vassert(0); // Other cases are ATC + case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64: + case Ity_F32: case Ity_F64: case Ity_V128: + break; + default: + vassert(0); // Other cases are probably invalid } stmt(IRStmt_Put(off, e)); } -static IRExpr* getQReg ( IRType ty, UInt qregNo ) +/* Get from the least significant lane of a Qreg. */ +static IRExpr* getQRegLO ( UInt qregNo, IRType ty ) { - Int off = offsetQReg(sizeofIRType(ty), qregNo); + Int off = offsetQRegLane(qregNo, ty, 0); switch (ty) { - case Ity_I32: break; - case Ity_F32: break; - case Ity_I64: break; - case Ity_F64: break; - case Ity_V128: break; - default: vassert(0); // Other cases are ATC + case Ity_I32: case Ity_I64: + case Ity_F32: case Ity_F64: case Ity_V128: + break; + default: + vassert(0); // Other cases are ATC } return IRExpr_Get(off, ty); } -static const HChar* nameQReg ( UInt szB, UInt qregNo ) +static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy ) { static const HChar* namesQ[32] = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", @@ -1004,7 +1010,7 @@ static const HChar* nameQReg ( UInt szB, UInt qregNo ) "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23", "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" }; vassert(qregNo < 32); - switch (szB) { + switch (sizeofIRType(laneTy)) { case 1: return namesB[qregNo]; case 2: return namesH[qregNo]; case 4: return namesS[qregNo]; @@ -1015,34 +1021,64 @@ static const HChar* nameQReg ( UInt szB, UInt qregNo ) /*NOTREACHED*/ } +static const HChar* nameQReg128 ( UInt qregNo ) +{ + return nameQRegLO(qregNo, Ity_V128); +} + /* Find the offset of the most significant half (8 bytes) of the given Qreg. This requires knowing the endianness of the host. */ -static Int offsetQReg64HI ( UInt qregNo ) +static Int offsetQRegHI64 ( UInt qregNo ) { - vassert(!host_is_bigendian); - Int base = offsetQReg128(qregNo); - /* Since the host is little endian, the least significant half is - at the lower offset. So add 8 to get the MS half offset. */ - return base+8; + return offsetQRegLane(qregNo, Ity_I64, 1); } -static IRExpr* getQReg64HI ( UInt qregNo ) +static IRExpr* getQRegHI64 ( UInt qregNo ) { - return IRExpr_Get(offsetQReg64HI(qregNo), Ity_I64); + return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64); } -static void putQReg64HI ( UInt qregNo, IRExpr* e ) +static void putQRegHI64 ( UInt qregNo, IRExpr* e ) { IRType ty = typeOfIRExpr(irsb->tyenv, e); - Int off = offsetQReg64HI(qregNo); + Int off = offsetQRegHI64(qregNo); switch (ty) { - case Ity_I64: break; - case Ity_F64: break; - default: vassert(0); // Other cases are plain wrong + case Ity_I64: case Ity_F64: + break; + default: + vassert(0); // Other cases are plain wrong } stmt(IRStmt_Put(off, e)); } +/* Put to a specified lane of a Qreg. */ +static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e ) +{ + IRType laneTy = typeOfIRExpr(irsb->tyenv, e); + Int off = offsetQRegLane(qregNo, laneTy, laneNo); + switch (laneTy) { + case Ity_F64: case Ity_I64: + break; + default: + vassert(0); // Other cases are ATC + } + stmt(IRStmt_Put(off, e)); +} + +/* Get from the least significant lane of a Qreg. 
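+ (Or, more generally, from the laneNo'th lane.)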
*/ +static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy ) +{ + Int off = offsetQRegLane(qregNo, laneTy, laneNo); + switch (laneTy) { + case Ity_I64: case Ity_I32: + break; + default: + vassert(0); // Other cases are ATC + } + return IRExpr_Get(off, laneTy); +} + + //ZZ /* ---------------- Misc registers ---------------- */ //ZZ //ZZ static void putMiscReg32 ( UInt gsoffset, @@ -1533,6 +1569,45 @@ static IRTemp math_BSWAP64 ( IRTemp t1 ) } +/* Duplicates the bits at the bottom of the given word to fill the + whole word. src :: Ity_I64 is assumed to have zeroes everywhere + except for the bottom bits. */ +static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy ) +{ + if (srcTy == Ity_I8) { + IRTemp t16 = newTemp(Ity_I64); + assign(t16, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(8)))); + IRTemp t32 = newTemp(Ity_I64); + assign(t32, binop(Iop_Or64, mkexpr(t16), + binop(Iop_Shl64, mkexpr(t16), mkU8(16)))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(t32), + binop(Iop_Shl64, mkexpr(t32), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I16) { + IRTemp t32 = newTemp(Ity_I64); + assign(t32, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(16)))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(t32), + binop(Iop_Shl64, mkexpr(t32), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I32) { + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I64) { + return src; + } + vassert(0); +} + + /*------------------------------------------------------------*/ /*--- FP comparison helpers ---*/ /*------------------------------------------------------------*/ @@ -3535,15 +3610,15 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) } if (isLD) { - putQReg(tt1, - loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0)))); - putQReg(tt2, - loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB)))); + putQRegLO(tt1, + loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0)))); + putQRegLO(tt2, + loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB)))); } else { storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)), - getQReg(ty, tt1)); + getQRegLO(tt1, ty)); storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)), - getQReg(ty, tt2)); + getQRegLO(tt2, ty)); } if (wBack) @@ -3564,7 +3639,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) vassert(0); } DIP(fmt_str, isLD ? 
"ld" : "st", - nameQReg(szB, tt1), nameQReg(szB, tt2), + nameQRegLO(tt1, ty), nameQRegLO(tt2, ty), nameIReg64orSP(nn), simm7); return True; } @@ -3598,43 +3673,43 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) case 0: /* 8 bit */ if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I8, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(1, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf); } else { vassert(0); //ATC - storeLE(mkexpr(ea), getQReg(Ity_I8, tt)); - DIP("str %s, %s\n", nameQReg(1, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf); } break; case 1: if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I16, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(2, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf); } else { vassert(0); //ATC - storeLE(mkexpr(ea), getQReg(Ity_I16, tt)); - DIP("str %s, %s\n", nameQReg(2, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf); } break; case 2: /* 32 bit */ if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I32, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(4, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf); } else { - storeLE(mkexpr(ea), getQReg(Ity_I32, tt)); - DIP("str %s, %s\n", nameQReg(4, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf); } break; case 3: /* 64 bit */ if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I64, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(8, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf); } else { - storeLE(mkexpr(ea), getQReg(Ity_I64, tt)); - DIP("str %s, %s\n", nameQReg(8, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf); } break; case 4: return False; //ATC @@ -3727,13 +3802,13 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) if (szLg2 < 4) { putQReg128(tt, mkV128(0x0000)); } - putQReg(tt, loadLE(ty, mkexpr(tEA))); + putQRegLO(tt, loadLE(ty, mkexpr(tEA))); } else { - storeLE(mkexpr(tEA), getQReg(ty, tt)); + storeLE(mkexpr(tEA), getQRegLO(tt, ty)); } DIP("%s %s, [%s, #%u]\n", isLD ? "ldr" : "str", - nameQReg(1 << szLg2, tt), nameIReg64orSP(nn), pimm12); + nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12); return True; } @@ -3778,14 +3853,14 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) if (szLg2 < 4) { putQReg128(tt, mkV128(0x0000)); } - putQReg(tt, loadLE(ty, mkexpr(tTA))); + putQRegLO(tt, loadLE(ty, mkexpr(tTA))); } else { - storeLE(mkexpr(tTA), getQReg(ty, tt)); + storeLE(mkexpr(tTA), getQRegLO(tt, ty)); } putIReg64orSP(nn, mkexpr(tEA)); DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n", isLD ? 
"ldr" : "str", - nameQReg(1 << szLg2, tt), nameIReg64orSP(nn), simm9); + nameQRegLO(tt, ty), nameIReg64orSP(nn), simm9); return True; } @@ -3816,16 +3891,16 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) IRType ty = preferredVectorSubTypeFromSize(1 << szLg2); assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9))); if (isLD) { - if (szLg2 < 4) { - putQReg128(tt, mkV128(0x0000)); - } - putQReg(tt, loadLE(ty, mkexpr(tEA))); + if (szLg2 < 4) { + putQReg128(tt, mkV128(0x0000)); + } + putQRegLO(tt, loadLE(ty, mkexpr(tEA))); } else { - storeLE(mkexpr(tEA), getQReg(ty, tt)); + storeLE(mkexpr(tEA), getQRegLO(tt, ty)); } DIP("%s %s, [%s, #%lld]\n", isLD ? "ldur" : "stur", - nameQReg(1 << szLg2, tt), nameIReg64orSP(nn), (Long)simm9); + nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9); return True; } @@ -3841,49 +3916,98 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) UInt tt = INSN(4,0); ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21); IRType ty = preferredVectorSubTypeFromSize(szB); - putQReg(tt, loadLE(ty, mkU64(ea))); - DIP("ldr %s, 0x%llx (literal)\n", nameQReg(szB, tt), ea); + putQReg128(tt, mkV128(0x0000)); + putQRegLO(tt, loadLE(ty, mkU64(ea))); + DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea); return True; } - /* FIXME Temporary hacks to get through ld.so FIXME */ + /* ---------- LD1/ST1 (single structure, no offset) ---------- */ + /* 31 23 + 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP] + 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP] + 0100 1100 0100 0000 0111 10 N T LD1 {vT.4s}, [Xn|SP] + 0100 1100 0000 0000 0111 10 N T ST1 {vT.4s}, [Xn|SP] + 0100 1100 0100 0000 0111 01 N T LD1 {vT.8h}, [Xn|SP] + 0100 1100 0000 0000 0111 01 N T ST1 {vT.8h}, [Xn|SP] + 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP] + 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP] + FIXME does this assume that the host is little endian? + */ + if ( (insn & 0xFFFFF000) == 0x4C407000 // LD1 cases + || (insn & 0xFFFFF000) == 0x4C007000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "2d", "4s", "8h", "16b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + if (isLD) { + putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA))); + } else { + storeLE(mkexpr(tEA), getQReg128(vT)); + } + DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } - /* ------------------ ST1 variants ------------------ */ - /* st1 {vT.2d}, [], #16. - Note that #16 is implied and cannot be set to any - other value. - 0100 1100 1001 1111 0111 11 N T - FIXME doesn't this assume that the host is little endian? + /* 31 23 + 0000 1100 0100 0000 0111 11 N T LD1 {vT.1d}, [Xn|SP] + 0000 1100 0000 0000 0111 11 N T ST1 {vT.1d}, [Xn|SP] + 0000 1100 0100 0000 0111 10 N T LD1 {vT.2s}, [Xn|SP] + 0000 1100 0000 0000 0111 10 N T ST1 {vT.2s}, [Xn|SP] + 0000 1100 0100 0000 0111 01 N T LD1 {vT.4h}, [Xn|SP] + 0000 1100 0000 0000 0111 01 N T ST1 {vT.4h}, [Xn|SP] + 0000 1100 0100 0000 0111 00 N T LD1 {vT.8b}, [Xn|SP] + 0000 1100 0000 0000 0111 00 N T ST1 {vT.8b}, [Xn|SP] + FIXME does this assume that the host is little endian? 
*/ - if ((insn & 0xFFFFFC00) == 0x4C9F7C00) { - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); + if ( (insn & 0xFFFFF000) == 0x0C407000 // LD1 cases + || (insn & 0xFFFFF000) == 0x0C007000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "1d", "2s", "4h", "8b" }; + const HChar* name = names[INSN(11,10)]; assign(tEA, getIReg64orSP(rN)); if (rN == 31) { /* FIXME generate stack alignment check */ } - storeLE(mkexpr(tEA), getQReg128(vT)); - putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16))); - DIP("st1 {v%u.2d}, [%s], #16\n", vT, nameIReg64orSP(rN)); + if (isLD) { + putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA))); + putQRegLane(vT, 1, mkU64(0)); + } else { + storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); + } + DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); return True; } - /* ------------------ LD1 variants ------------------ */ + /* ---------- LD1/ST1 (single structure, post index) ---------- */ /* 31 23 - 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP] - 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP] - 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP] - 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP] - FIXME doesn't this assume that the host is little endian? + 0100 1100 1001 1111 0111 11 N T ST1 {vT.2d}, [xN|SP], #16 + 0100 1100 1101 1111 0111 11 N T LD1 {vT.2d}, [xN|SP], #16 + 0100 1100 1001 1111 0111 10 N T ST1 {vT.4s}, [xN|SP], #16 + 0100 1100 1001 1111 0111 01 N T ST1 {vT.8h}, [xN|SP], #16 + Note that #16 is implied and cannot be any other value. + FIXME does this assume that the host is little endian? */ - if ( (insn & 0xFFFFFC00) == 0x4C407C00 // LD1 {vT.2d}, [Xn|SP] - || (insn & 0xFFFFFC00) == 0x4C007C00 // ST1 {vT.2d}, [Xn|SP] - || (insn & 0xFFFFFC00) == 0x4C407000 // LD1 {vT.16b}, [Xn|SP] - || (insn & 0xFFFFFC00) == 0x4C007000 // ST1 {vT.16b}, [Xn|SP] + if ( (insn & 0xFFFFFC00) == 0x4C9F7C00 // ST1 {vT.2d}, [xN|SP], #16 + || (insn & 0xFFFFFC00) == 0x4CDF7C00 // LD1 {vT.2d}, [xN|SP], #16 + || (insn & 0xFFFFFC00) == 0x4C9F7800 // ST1 {vT.4s}, [xN|SP], #16 + || (insn & 0xFFFFFC00) == 0x4C9F7400 // ST1 {vT.8h}, [xN|SP], #16 ) { Bool isLD = INSN(22,22) == 1; UInt rN = INSN(9,5); UInt vT = INSN(4,0); IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "2d", "4s", "8h", "16b" }; + const HChar* name = names[INSN(11,10)]; assign(tEA, getIReg64orSP(rN)); if (rN == 31) { /* FIXME generate stack alignment check */ } if (isLD) { @@ -3891,12 +4015,34 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) } else { storeLE(mkexpr(tEA), getQReg128(vT)); } - DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", - vT, INSN(11,10) == BITS2(0,0) ? "16b" : "2d", - nameIReg64orSP(rN)); + putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16))); + DIP("%s {v%u.%s}, [%s], #16\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } + + /* + 0000 1100 1001 1111 0111 10 N T ST1 {vT.2s}, [xN|SP], #8 + Note that #8 is implied and cannot be any other value. + FIXME does this assume that the host is little endian? 
+ */ + if ( (insn & 0xFFFFFC00) == 0x0C9F7800 // st1 {vT.2s}, [xN|SP], #8 + ) { + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "1d", "2s", "4h", "8b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); + putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(8))); + DIP("st1 {v%u.%s}, [%s], #8\n", vT, name, nameIReg64orSP(rN)); return True; } + /* FIXME Temporary hacks to get through ld.so FIXME */ + /* -------------------- LD{A}XR -------------------- */ /* FIXME: this is a hack; needs real atomicity stuff. */ /* 31 29 20 19 9 4 @@ -4216,36 +4362,102 @@ Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn) /* Generate N copies of |bit| in the bottom of a ULong. */ static ULong Replicate ( ULong bit, Int N ) { - vassert(bit <= 1 && N >= 1 && N < 64); - if (bit == 0) { - return 0; - } else { - /* Careful. This won't work for N == 64. */ - return (1ULL << N) - 1; - } + vassert(bit <= 1 && N >= 1 && N < 64); + if (bit == 0) { + return 0; + } else { + /* Careful. This won't work for N == 64. */ + return (1ULL << N) - 1; + } } static ULong VFPExpandImm ( ULong imm8, Int N ) { - vassert(imm8 <= 0xFF); - vassert(N == 32 || N == 64); - Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2. - Int F = N - E - 1; - ULong imm8_6 = (imm8 >> 6) & 1; - /* sign: 1 bit */ - /* exp: E bits */ - /* frac: F bits */ - ULong sign = (imm8 >> 7) & 1; - ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1); - ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6); - vassert(sign < (1ULL << 1)); - vassert(exp < (1ULL << E)); - vassert(frac < (1ULL << F)); - vassert(1 + E + F == N); - ULong res = (sign << (E+F)) | (exp << F) | frac; - return res; + vassert(imm8 <= 0xFF); + vassert(N == 32 || N == 64); + Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2. + Int F = N - E - 1; + ULong imm8_6 = (imm8 >> 6) & 1; + /* sign: 1 bit */ + /* exp: E bits */ + /* frac: F bits */ + ULong sign = (imm8 >> 7) & 1; + ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1); + ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6); + vassert(sign < (1ULL << 1)); + vassert(exp < (1ULL << E)); + vassert(frac < (1ULL << F)); + vassert(1 + E + F == N); + ULong res = (sign << (E+F)) | (exp << F) | frac; + return res; } +/* Help a bit for decoding laneage for vector operations that can be + of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q + and SZ bits, typically for vector floating point. 
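+ Any of the OUT parameters may be passed as NULL if the caller has no use for that particular piece of information.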
*/ +static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI, /*OUT*/IRType* tyF, + /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper, + /*OUT*/const HChar** arrSpec, + Bool bitQ, Bool bitSZ ) +{ + vassert(bitQ == True || bitQ == False); + vassert(bitSZ == True || bitSZ == False); + if (bitQ && bitSZ) { // 2x64 + if (tyI) *tyI = Ity_I64; + if (tyF) *tyF = Ity_F64; + if (nLanes) *nLanes = 2; + if (zeroUpper) *zeroUpper = False; + if (arrSpec) *arrSpec = "2d"; + return True; + } + if (bitQ && !bitSZ) { // 4x32 + if (tyI) *tyI = Ity_I32; + if (tyF) *tyF = Ity_F32; + if (nLanes) *nLanes = 4; + if (zeroUpper) *zeroUpper = False; + if (arrSpec) *arrSpec = "4s"; + return True; + } + if (!bitQ && !bitSZ) { // 2x32 + if (tyI) *tyI = Ity_I32; + if (tyF) *tyF = Ity_F32; + if (nLanes) *nLanes = 2; + if (zeroUpper) *zeroUpper = True; + if (arrSpec) *arrSpec = "2s"; + return True; + } + // Else impliedly 1x64, which isn't allowed. + return False; +} + +/* Helper for decoding laneage for simple vector operations, + eg integer add. */ +static Bool getLaneInfo_SIMPLE ( /*OUT*/Bool* zeroUpper, + /*OUT*/const HChar** arrSpec, + Bool bitQ, UInt szBlg2 ) +{ + vassert(bitQ == True || bitQ == False); + vassert(szBlg2 < 4); + Bool zu = False; + const HChar* as = NULL; + switch ((szBlg2 << 1) | (bitQ ? 1 : 0)) { + case 0: zu = True; as = "8b"; break; + case 1: zu = False; as = "16b"; break; + case 2: zu = True; as = "4h"; break; + case 3: zu = False; as = "8h"; break; + case 4: zu = True; as = "2s"; break; + case 5: zu = False; as = "4s"; break; + case 6: return False; // impliedly 1x64 + case 7: zu = False; as = "2d"; break; + default: vassert(0); + } + vassert(as); + if (arrSpec) *arrSpec = as; + if (zeroUpper) *zeroUpper = zu; + return True; +} + + static Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) { @@ -4294,28 +4506,28 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) switch (ix) { case 1: putQReg128(dd, mkV128(0)); - putQReg(dd, getIReg32orZR(nn)); + putQRegLO(dd, getIReg32orZR(nn)); DIP("fmov s%u, w%u\n", dd, nn); break; case 2: putQReg128(dd, mkV128(0)); - putQReg(dd, getIReg64orZR(nn)); + putQRegLO(dd, getIReg64orZR(nn)); DIP("fmov d%u, x%u\n", dd, nn); break; case 3: - putQReg64HI(dd, getIReg64orZR(nn)); + putQRegHI64(dd, getIReg64orZR(nn)); DIP("fmov v%u.d[1], x%u\n", dd, nn); break; case 4: - putIReg32orZR(dd, getQReg(Ity_I32, nn)); + putIReg32orZR(dd, getQRegLO(nn, Ity_I32)); DIP("fmov w%u, s%u\n", dd, nn); break; case 5: - putIReg64orZR(dd, getQReg(Ity_I64, nn)); + putIReg64orZR(dd, getQRegLO(nn, Ity_I64)); DIP("fmov x%u, d%u\n", dd, nn); break; case 6: - putIReg64orZR(dd, getQReg64HI(nn)); + putIReg64orZR(dd, getQRegHI64(nn)); DIP("fmov x%u, v%u.d[1]\n", dd, nn); break; default: @@ -4341,8 +4553,9 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) vassert(0 == (imm & 0xFFFFFFFF00000000ULL)); } putQReg128(dd, mkV128(0)); - putQReg(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL)); - DIP("fmov %s, #0x%llx\n", nameQReg(isD ? 8 : 4, dd), imm); + putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL)); + DIP("fmov %s, #0x%llx\n", + nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm); return True; } @@ -4377,9 +4590,9 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) ? unop(ops[ix], src) : binop(ops[ix], mkexpr(mk_get_IR_rounding_mode()), src); putQReg128(dd, mkV128(0)); - putQReg(dd, res); + putQRegLO(dd, res); DIP("%ccvtf %s, %s\n", - isU ? 'u' : 's', nameQReg(isF64 ? 8 : 4, dd), + isU ? 'u' : 's', nameQRegLO(dd, isF64 ? 
Ity_F64 : Ity_F32), nameIRegOrZR(isI64, nn)); return True; } @@ -4402,7 +4615,6 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt dd = INSN(4,0); IROp iop = Iop_INVALID; IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; Bool neg = False; const HChar* nm = "???"; switch (op) { @@ -4416,13 +4628,13 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) } vassert(iop != Iop_INVALID); IRExpr* resE = triop(iop, mkexpr(mk_get_IR_rounding_mode()), - getQReg(ty, nn), getQReg(ty, mm)); + getQRegLO(nn, ty), getQRegLO(mm, ty)); IRTemp res = newTemp(ty); assign(res, neg ? unop(mkNEGF(ty),resE) : resE); putQReg128(dd, mkV128(0)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); DIP("%s %s, %s, %s\n", - nm, nameQReg(szB, dd), nameQReg(szB, nn), nameQReg(szB, mm)); + nm, nameQRegLO(dd, ty), nameQRegLO(nn, ty), nameQRegLO(mm, ty)); return True; } @@ -4442,32 +4654,32 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt nn = INSN(9,5); UInt dd = INSN(4,0); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IRTemp res = newTemp(ty); if (opc == BITS2(0,0)) { - assign(res, getQReg(ty, nn)); + assign(res, getQRegLO(nn, ty)); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fmov %s, %s\n", nameQReg(szB, dd), nameQReg(szB, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fmov %s, %s\n", + nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } if (opc == BITS2(1,0) || opc == BITS2(0,1)) { Bool isAbs = opc == BITS2(0,1); IROp op = isAbs ? mkABSF(ty) : mkNEGF(ty); - assign(res, unop(op, getQReg(ty, nn))); + assign(res, unop(op, getQRegLO(nn, ty))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); DIP("%s %s, %s\n", isAbs ? "fabs" : "fneg", - nameQReg(szB, dd), nameQReg(szB, nn)); + nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } if (opc == BITS2(1,1)) { assign(res, binop(mkSQRTF(ty), - mkexpr(mk_get_IR_rounding_mode()), getQReg(ty, nn))); + mkexpr(mk_get_IR_rounding_mode()), getQRegLO(nn, ty))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fsqrt %s, %s\n", nameQReg(szB, dd), nameQReg(szB, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fsqrt %s, %s\n", nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } /* else fall through; other cases are ATC */ @@ -4498,26 +4710,25 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) Bool isCMPE = INSN(4,4) == 1; Bool cmpZero = INSN(3,3) == 1; IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; Bool valid = True; if (cmpZero && mm != 0) valid = False; if (valid) { IRTemp argL = newTemp(ty); IRTemp argR = newTemp(ty); IRTemp irRes = newTemp(Ity_I32); - assign(argL, getQReg(ty, nn)); + assign(argL, getQRegLO(nn, ty)); assign(argR, cmpZero ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0))) - : getQReg(ty, mm)); + : getQRegLO(mm, ty)); assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32, mkexpr(argL), mkexpr(argR))); IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes); IRTemp nzcv_28x0 = newTemp(Ity_I64); assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28))); setFlags_COPY(nzcv_28x0); - DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", - nameQReg(szB, nn), cmpZero ? "#0.0" : nameQReg(szB, mm)); + DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ty), + cmpZero ? 
"#0.0" : nameQRegLO(mm, ty)); return True; } } @@ -4544,15 +4755,14 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt dd = INSN(4,0); UInt ix = (INSN(21,21) << 1) | INSN(15,15); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IROp opADD = mkADDF(ty); IROp opSUB = mkSUBF(ty); IROp opMUL = mkMULF(ty); IROp opNEG = mkNEGF(ty); IRTemp res = newTemp(ty); - IRExpr* eA = getQReg(ty, aa); - IRExpr* eN = getQReg(ty, nn); - IRExpr* eM = getQReg(ty, mm); + IRExpr* eA = getQRegLO(aa, ty); + IRExpr* eN = getQRegLO(nn, ty); + IRExpr* eM = getQRegLO(mm, ty); IRExpr* rm = mkexpr(mk_get_IR_rounding_mode()); IRExpr* eNxM = triop(opMUL, rm, eN, eM); switch (ix) { @@ -4563,11 +4773,11 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) default: vassert(0); } putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" }; DIP("%s %s, %s, %s, %s\n", - names[ix], nameQReg(szB, dd), nameQReg(szB, nn), - nameQReg(szB, mm), nameQReg(szB, aa)); + names[ix], nameQRegLO(dd, ty), nameQRegLO(nn, ty), + nameQRegLO(mm, ty), nameQRegLO(aa, ty)); return True; } @@ -4642,16 +4852,15 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) } else { return False; } - UInt srcSzB = isF64 ? 8 : 4; IRType srcTy = isF64 ? Ity_F64 : Ity_F32; IRType dstTy = isI64 ? Ity_I64 : Ity_I32; IRTemp src = newTemp(srcTy); IRTemp dst = newTemp(dstTy); - assign(src, getQReg(srcTy, nn)); + assign(src, getQRegLO(nn, srcTy)); assign(dst, binop(op, mkU32(irrm), mkexpr(src))); putIRegOrZR(isI64, dd, mkexpr(dst)); DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's', - nameIRegOrZR(isI64, dd), nameQReg(srcSzB, nn)); + nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy)); return True; } @@ -4677,7 +4886,6 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt nn = INSN(9,5); UInt dd = INSN(4,0); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IRExpr* irrmE = NULL; UChar ch = '?'; switch (rm) { @@ -4689,12 +4897,13 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) if (irrmE) { IRTemp src = newTemp(ty); IRTemp dst = newTemp(ty); - assign(src, getQReg(ty, nn)); + assign(src, getQRegLO(nn, ty)); assign(dst, binop(isD ? 
Iop_RoundF64toInt : Iop_RoundF32toInt, irrmE, mkexpr(src))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(dst)); - DIP("frint%c %s, %s\n", ch, nameQReg(szB, dd), nameQReg(szB, nn)); + putQRegLO(dd, mkexpr(dst)); + DIP("frint%c %s, %s\n", + ch, nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } /* else unhandled rounding mode case -- fall through */ @@ -4720,20 +4929,22 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) if (b2322 == BITS2(0,0) && b1615 == BITS2(0,1)) { /* Convert S to D */ IRTemp res = newTemp(Ity_F64); - assign(res, unop(Iop_F32toF64, getQReg(Ity_F32, nn))); + assign(res, unop(Iop_F32toF64, getQRegLO(nn, Ity_F32))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fcvt %s, %s\n", nameQReg(8, dd), nameQReg(4, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fcvt %s, %s\n", + nameQRegLO(dd, Ity_F64), nameQRegLO(nn, Ity_F32)); return True; } if (b2322 == BITS2(0,1) && b1615 == BITS2(0,0)) { /* Convert D to S */ IRTemp res = newTemp(Ity_F32); assign(res, binop(Iop_F64toF32, mkexpr(mk_get_IR_rounding_mode()), - getQReg(Ity_F64, nn))); + getQRegLO(nn, Ity_F64))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fcvt %s, %s\n", nameQReg(4, dd), nameQReg(8, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fcvt %s, %s\n", + nameQRegLO(dd, Ity_F32), nameQRegLO(nn, Ity_F64)); return True; } /* else unhandled */ @@ -4751,18 +4962,242 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt nn = INSN(9,5); UInt dd = INSN(4,0); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IRTemp res = newTemp(ty); - assign(res, unop(mkABSF(ty), triop(mkSUBF(ty), - mkexpr(mk_get_IR_rounding_mode()), - getQReg(ty,nn), getQReg(ty,mm)))); + assign(res, unop(mkABSF(ty), + triop(mkSUBF(ty), + mkexpr(mk_get_IR_rounding_mode()), + getQRegLO(nn,ty), getQRegLO(mm,ty)))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); DIP("fabd %s, %s, %s\n", - nameQReg(szB, dd), nameQReg(szB, nn), nameQReg(szB, mm)); + nameQRegLO(dd, ty), nameQRegLO(nn, ty), nameQRegLO(mm, ty)); return True; } + /* -------------- {S,U}CVTF (vector, integer) -------------- */ + /* 31 28 22 21 15 9 4 + 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn + 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn + with laneage: + case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D + */ + if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,0,0) + && INSN(21,16) == BITS6(1,0,0,0,0,1) + && INSN(15,10) == BITS6(1,1,0,1,1,0)) { + Bool isQ = INSN(30,30) == 1; + Bool isU = INSN(29,29) == 1; + Bool isF64 = INSN(22,22) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (isQ || !isF64) { + IRType tyF = Ity_INVALID, tyI = Ity_INVALID; + UInt nLanes = 0; + Bool zeroHI = False; + const HChar* arrSpec = NULL; + Bool ok = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec, + isQ, isF64 ); + IROp op = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32) + : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32); + IRTemp rm = mk_get_IR_rounding_mode(); + UInt i; + vassert(ok); /* the 'if' above should ensure this */ + for (i = 0; i < nLanes; i++) { + putQRegLane(dd, i, + binop(op, mkexpr(rm), getQRegLane(nn, i, tyI))); + } + if (zeroHI) { + putQRegLane(dd, 1, mkU64(0)); + } + DIP("%ccvtf %s.%s, %s.%s\n", isU ? 
'u' : 's', + nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec); + return True; + } + /* else fall through */ + } + + /* ---------- F{ADD,SUB,MUL,DIV,MLA,MLS} (vector) ---------- */ + /* 31 28 22 21 20 15 9 4 case + 0q0 01110 0 sz 1 m 110101 n d FADD Vd,Vn,Vm 1 + 0q0 01110 1 sz 1 m 110101 n d FSUB Vd,Vn,Vm 2 + 0q1 01110 0 sz 1 m 110111 n d FMUL Vd,Vn,Vm 3 + 0q1 01110 0 sz 1 m 111111 n d FDIV Vd,Vn,Vm 4 + 0q0 01110 0 sz 1 m 110011 n d FMLA Vd,Vn,Vm 5 + 0q0 01110 1 sz 1 m 110011 n d FMLS Vd,Vn,Vm 6 + */ + if (INSN(31,31) == 0 + && INSN(28,24) == BITS5(0,1,1,1,0) && INSN(21,21) == 1) { + Bool isQ = INSN(30,30) == 1; + UInt b29 = INSN(29,29); + UInt b23 = INSN(23,23); + Bool isF64 = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt b1510 = INSN(15,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt ix = 0; + /**/ if (b29 == 0 && b23 == 0 && b1510 == BITS6(1,1,0,1,0,1)) ix = 1; + else if (b29 == 0 && b23 == 1 && b1510 == BITS6(1,1,0,1,0,1)) ix = 2; + else if (b29 == 1 && b23 == 0 && b1510 == BITS6(1,1,0,1,1,1)) ix = 3; + else if (b29 == 1 && b23 == 0 && b1510 == BITS6(1,1,1,1,1,1)) ix = 4; + else if (b29 == 0 && b23 == 0 && b1510 == BITS6(1,1,0,0,1,1)) ix = 5; + else if (b29 == 0 && b23 == 1 && b1510 == BITS6(1,1,0,0,1,1)) ix = 6; + IRType laneTy = Ity_INVALID; + Bool zeroHI = False; + const HChar* arr = "??"; + Bool ok + = getLaneInfo_Q_SZ(NULL, &laneTy, NULL, &zeroHI, &arr, isQ, isF64); + /* Skip MLA/MLS for the time being */ + if (ok && ix >= 1 && ix <= 4) { + const IROp ops64[4] + = { Iop_Add64Fx2, Iop_Sub64Fx2, Iop_Mul64Fx2, Iop_Div64Fx2 }; + const IROp ops32[4] + = { Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4 }; + const HChar* names[4] + = { "fadd", "fsub", "fmul", "fdiv" }; + IROp op = laneTy==Ity_F64 ? ops64[ix-1] : ops32[ix-1]; + IRTemp rm = mk_get_IR_rounding_mode(); + IRTemp t1 = newTemp(Ity_V128); + IRTemp t2 = newTemp(Ity_V128); + assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm))); + assign(t2, zeroHI ? unop(Iop_ZeroHI64, mkexpr(t1)) : mkexpr(t1)); + putQReg128(dd, mkexpr(t2)); + DIP("%s %s.%s, %s.%s, %s.%s\n", names[ix-1], + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + } + + /* ---------------- ADD/SUB (vector) ---------------- */ + /* 31 28 23 21 20 15 9 4 + 0q0 01110 size 1 m 100001 n d ADD Vd.T, Vn.T, Vm.T + 0q1 01110 size 1 m 100001 n d SUB Vd.T, Vn.T, Vm.T + */ + if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0) + && INSN(21,21) == 1 && INSN(15,10) == BITS6(1,0,0,0,0,1)) { + Bool isQ = INSN(30,30) == 1; + UInt szBlg2 = INSN(23,22); + Bool isSUB = INSN(29,29) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool zeroHI = False; + const HChar* arrSpec = ""; + Bool ok = getLaneInfo_SIMPLE(&zeroHI, &arrSpec, isQ, szBlg2 ); + if (ok) { + const IROp opADD[4] + = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + const IROp opSUB[4] + = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + vassert(szBlg2 < 4); + IROp op = isSUB ? opSUB[szBlg2] : opADD[szBlg2]; + IRTemp t = newTemp(Ity_V128); + assign(t, binop(op, getQReg128(nn), getQReg128(mm))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64, mkexpr(t)) : mkexpr(t)); + const HChar* nm = isSUB ? 
"sub" : "add"; + DIP("%s %s.%s, %s.%s, %s.%s\n", nm, + nameQReg128(dd), arrSpec, + nameQReg128(nn), arrSpec, nameQReg128(mm), arrSpec); + return True; + } + /* else fall through */ + } + + /* -------------------- XTN{,2} -------------------- */ + /* 31 28 23 21 15 9 4 + 0q0 01110 size 100001 001010 n d + */ + if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,1,0) + && INSN(21,16) == BITS6(1,0,0,0,0,1) + && INSN(15,10) == BITS6(0,0,1,0,1,0)) { + Bool isQ = INSN(30,30) == 1; + UInt size = INSN(23,22); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IROp op = Iop_INVALID; + const HChar* tb = NULL; + const HChar* ta = NULL; + switch ((size << 1) | (isQ ? 1 : 0)) { + case 0: tb = "8b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; + case 1: tb = "16b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; + case 2: tb = "4h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; + case 3: tb = "8h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; + case 4: tb = "2s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; + case 5: tb = "4s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; + case 6: break; + case 7: break; + default: vassert(0); + } + if (op != Iop_INVALID) { + if (!isQ) { + putQRegLane(dd, 1, mkU64(0)); + } + putQRegLane(dd, isQ ? 1 : 0, unop(op, getQReg128(nn))); + DIP("xtn%s %s.%s, %s.%s\n", isQ ? "2" : "", + nameQReg128(dd), tb, nameQReg128(nn), ta); + return True; + } + /* else fall through */ + } + + /* ---------------- DUP (element, vector) ---------------- */ + /* 31 28 20 15 9 4 + 0q0 01110000 imm5 000001 n d DUP Vd.T, Vn.Ts[index] + */ + if (INSN(31,31) == 0 && INSN(29,21) == BITS9(0,0,1,1,1,0,0,0,0) + && INSN(15,10) == BITS6(0,0,0,0,0,1)) { + Bool isQ = INSN(30,30) == 1; + UInt imm5 = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp w0 = newTemp(Ity_I64); + const HChar* arT = "??"; + const HChar* arTs = "??"; + IRType laneTy = Ity_INVALID; + UInt laneNo = 16; /* invalid */ + if (imm5 & 1) { + arT = isQ ? "16b" : "8b"; + arTs = "b"; + laneNo = (imm5 >> 1) & 15; + laneTy = Ity_I8; + assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if (imm5 & 2) { + arT = isQ ? "8h" : "4h"; + arTs = "h"; + laneNo = (imm5 >> 2) & 7; + laneTy = Ity_I16; + assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if (imm5 & 4) { + arT = isQ ? "4s" : "2s"; + arTs = "s"; + laneNo = (imm5 >> 3) & 3; + laneTy = Ity_I32; + assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if ((imm5 & 8) && isQ) { + arT = "2d"; + arTs = "d"; + laneNo = (imm5 >> 4) & 1; + laneTy = Ity_I64; + assign(w0, getQRegLane(nn, laneNo, laneTy)); + } + else { + /* invalid; leave laneTy unchanged. */ + } + /* */ + if (laneTy != Ity_INVALID) { + vassert(laneNo < 16); + IRTemp w1 = math_DUP_TO_64(w0, laneTy); + putQReg128(dd, binop(Iop_64HLtoV128, + isQ ? 
mkexpr(w1) : mkU64(0), mkexpr(w1))); + DIP("dup %s.%s, %s.%s[%u]\n", + nameQReg128(dd), arT, nameQReg128(nn), arTs, laneNo); + return True; + } + /* else fall through */ + } + /* FIXME Temporary hacks to get through ld.so FIXME */ /* ------------------ movi vD.4s, #0x0 ------------------ */ diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index d2061face9..aef98dba91 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -848,6 +848,23 @@ static const HChar* showARM64FpUnaryOp ( ARM64FpUnaryOp op ) { } } +static void showARM64VecBinOp(/*OUT*/const HChar** nm, + /*OUT*/const HChar** ar, ARM64VecBinOp op ) { + switch (op) { + case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return; + case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return; + case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return; + case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return; + case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return; + case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return; + case ARM64vecb_FADD64x2: *nm = "fadd"; *ar = "2d"; return; + case ARM64vecb_FSUB64x2: *nm = "fsub"; *ar = "2d"; return; + case ARM64vecb_FMUL64x2: *nm = "fmul"; *ar = "2d"; return; + case ARM64vecb_FDIV64x2: *nm = "fdiv"; *ar = "2d"; return; + default: vpanic("showARM64VecBinOp"); + } +} + //ZZ const HChar* showARMNeonBinOp ( ARMNeonBinOp op ) { //ZZ switch (op) { //ZZ case ARMneon_VAND: return "vand"; @@ -1512,6 +1529,25 @@ ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ) { i->ARM64in.FPCR.iReg = iReg; return i; } +ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, + HReg dst, HReg argL, HReg argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VBinV; + i->ARM64in.VBinV.op = op; + i->ARM64in.VBinV.dst = dst; + i->ARM64in.VBinV.argL = argL; + i->ARM64in.VBinV.argR = argR; + return i; +} +ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VNarrowV; + i->ARM64in.VNarrowV.dszBlg2 = dszBlg2; + i->ARM64in.VNarrowV.dst = dst; + i->ARM64in.VNarrowV.src = src; + vassert(dszBlg2 == 0 || dszBlg2 == 1 || dszBlg2 == 2); + return i; +} //ZZ ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg dst, HReg argL, HReg argR ) { //ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); //ZZ i->tag = ARMin_VAluS; @@ -2104,6 +2140,30 @@ void ppARM64Instr ( ARM64Instr* i ) { vex_printf(", fpcr"); } return; + case ARM64in_VBinV: { + const HChar* nm = "??"; + const HChar* ar = "??"; + showARM64VecBinOp(&nm, &ar, i->ARM64in.VBinV.op); + vex_printf("%s ", nm); + ppHRegARM64(i->ARM64in.VBinV.dst); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VBinV.argL); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VBinV.argR); + vex_printf(".%s", ar); + return; + } + case ARM64in_VNarrowV: { + UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; + const HChar* darr[3] = { "8b", "4h", "2s" }; + const HChar* sarr[3] = { "8h", "4s", "2d" }; + vex_printf("xtn "); + ppHRegARM64(i->ARM64in.VNarrowV.dst); + vex_printf(".%s, ", dszBlg2 < 3 ? darr[dszBlg2] : "??"); + ppHRegARM64(i->ARM64in.VNarrowV.src); + vex_printf(".%s", dszBlg2 < 3 ? 
sarr[dszBlg2] : "??"); + return; + } //ZZ case ARMin_VAluS: //ZZ vex_printf("f%-3ss ", showARMVfpOp(i->ARMin.VAluS.op)); //ZZ ppHRegARM(i->ARMin.VAluS.dst); @@ -2567,6 +2627,15 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, ARM64Instr* i, Bool mode64 ) else addHRegUse(u, HRmWrite, i->ARM64in.FPCR.iReg); return; + case ARM64in_VBinV: + addHRegUse(u, HRmWrite, i->ARM64in.VBinV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VBinV.argL); + addHRegUse(u, HRmRead, i->ARM64in.VBinV.argR); + return; + case ARM64in_VNarrowV: + addHRegUse(u, HRmWrite, i->ARM64in.VNarrowV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VNarrowV.src); + return; //ZZ case ARMin_VAluS: //ZZ addHRegUse(u, HRmWrite, i->ARMin.VAluS.dst); //ZZ addHRegUse(u, HRmRead, i->ARMin.VAluS.argL); @@ -2842,6 +2911,15 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) case ARM64in_FPCR: i->ARM64in.FPCR.iReg = lookupHRegRemap(m, i->ARM64in.FPCR.iReg); return; + case ARM64in_VBinV: + i->ARM64in.VBinV.dst = lookupHRegRemap(m, i->ARM64in.VBinV.dst); + i->ARM64in.VBinV.argL = lookupHRegRemap(m, i->ARM64in.VBinV.argL); + i->ARM64in.VBinV.argR = lookupHRegRemap(m, i->ARM64in.VBinV.argR); + return; + case ARM64in_VNarrowV: + i->ARM64in.VNarrowV.dst = lookupHRegRemap(m, i->ARM64in.VNarrowV.dst); + i->ARM64in.VNarrowV.src = lookupHRegRemap(m, i->ARM64in.VNarrowV.src); + return; //ZZ case ARMin_VAluS: //ZZ i->ARMin.VAluS.dst = lookupHRegRemap(m, i->ARMin.VAluS.dst); //ZZ i->ARMin.VAluS.argL = lookupHRegRemap(m, i->ARMin.VAluS.argL); @@ -3117,15 +3195,16 @@ static inline UChar qregNo ( HReg r ) #define X110 BITS4(0, 1,1,0) #define X111 BITS4(0, 1,1,1) -#define BITS8(zzb7,zzb6,zzb5,zzb4,zzb3,zzb2,zzb1,zzb0) \ - ((BITS4(zzb7,zzb6,zzb5,zzb4) << 4) | BITS4(zzb3,zzb2,zzb1,zzb0)) - #define X0000 BITS4(0,0,0,0) #define X0001 BITS4(0,0,0,1) #define X0010 BITS4(0,0,1,0) #define X0011 BITS4(0,0,1,1) +#define BITS8(zzb7,zzb6,zzb5,zzb4,zzb3,zzb2,zzb1,zzb0) \ + ((BITS4(zzb7,zzb6,zzb5,zzb4) << 4) | BITS4(zzb3,zzb2,zzb1,zzb0)) + #define X00000 BITS8(0,0,0, 0,0,0,0,0) +#define X00001 BITS8(0,0,0, 0,0,0,0,1) #define X00111 BITS8(0,0,0, 0,0,1,1,1) #define X01000 BITS8(0,0,0, 0,1,0,0,0) #define X10000 BITS8(0,0,0, 1,0,0,0,0) @@ -3143,14 +3222,18 @@ static inline UChar qregNo ( HReg r ) #define X010001 BITS8(0,0, 0,1,0,0,0,1) #define X011010 BITS8(0,0, 0,1,1,0,1,0) #define X011111 BITS8(0,0, 0,1,1,1,1,1) +#define X100001 BITS8(0,0, 1,0,0,0,0,1) #define X100100 BITS8(0,0, 1,0,0,1,0,0) #define X100101 BITS8(0,0, 1,0,0,1,0,1) #define X100110 BITS8(0,0, 1,0,0,1,1,0) #define X110000 BITS8(0,0, 1,1,0,0,0,0) #define X110001 BITS8(0,0, 1,1,0,0,0,1) +#define X110101 BITS8(0,0, 1,1,0,1,0,1) +#define X110111 BITS8(0,0, 1,1,0,1,1,1) #define X111000 BITS8(0,0, 1,1,1,0,0,0) #define X111001 BITS8(0,0, 1,1,1,0,0,1) #define X111101 BITS8(0,0, 1,1,1,1,0,1) +#define X111111 BITS8(0,0, 1,1,1,1,1,1) #define X00100000 BITS8(0,0,1,0,0,0,0,0) #define X00100001 BITS8(0,0,1,0,0,0,0,1) @@ -3165,6 +3248,10 @@ static inline UChar qregNo ( HReg r ) #define X01100010 BITS8(0,1,1,0,0,0,1,0) #define X01100011 BITS8(0,1,1,0,0,0,1,1) #define X01110000 BITS8(0,1,1,1,0,0,0,0) +#define X01110001 BITS8(0,1,1,1,0,0,0,1) +#define X01110011 BITS8(0,1,1,1,0,0,1,1) +#define X01110101 BITS8(0,1,1,1,0,1,0,1) +#define X01110111 BITS8(0,1,1,1,0,1,1,1) #define X11000001 BITS8(1,1,0,0,0,0,0,1) #define X11000011 BITS8(1,1,0,0,0,0,1,1) #define X11010100 BITS8(1,1,0,1,0,1,0,0) @@ -4418,7 +4505,7 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, /* 31 28 23 21 20 18 15 9 4 000 11110 00 1 00 010 000000 n d 
SCVTF Sd, Wn 000 11110 01 1 00 010 000000 n d SCVTF Dd, Wn - 100 11110 00 1 00 010 000000 n d SCVTF Sd, Xn x + 100 11110 00 1 00 010 000000 n d SCVTF Sd, Xn 100 11110 01 1 00 010 000000 n d SCVTF Dd, Xn 000 11110 00 1 00 011 000000 n d UCVTF Sd, Wn 000 11110 01 1 00 011 000000 n d UCVTF Dd, Wn @@ -4521,16 +4608,6 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, } goto done; } - case ARM64in_FPCR: { - Bool toFPCR = i->ARM64in.FPCR.toFPCR; - UInt iReg = iregNo(i->ARM64in.FPCR.iReg); - if (toFPCR) { - /* 0xD51B44 000 Rt MSR fpcr, rT */ - *p++ = 0xD51B4400 | (iReg & 0x1F); - goto done; - } - goto bad; // FPCR -> iReg case currently ATC - } case ARM64in_VUnaryD: { /* 31 23 21 16 14 9 4 000,11110 01 1,0000 0,0 10000 n d FMOV Dd, Dn (not handled) @@ -4653,6 +4730,75 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = X_3_8_5_6_5_5(X000, X11110001, sM, X001000, sN, X00000); goto done; } + case ARM64in_FPCR: { + Bool toFPCR = i->ARM64in.FPCR.toFPCR; + UInt iReg = iregNo(i->ARM64in.FPCR.iReg); + if (toFPCR) { + /* 0xD51B44 000 Rt MSR fpcr, rT */ + *p++ = 0xD51B4400 | (iReg & 0x1F); + goto done; + } + goto bad; // FPCR -> iReg case currently ATC + } + case ARM64in_VBinV: { + /* 31 23 20 15 9 4 + 010 01110 11 1 m 100001 n d ADD Vd.2d, Vn.2d, Vm.2d + 010 01110 10 1 m 100001 n d ADD Vd.4s, Vn.4s, Vm.4s + 011 01110 11 1 m 100001 n d SUB Vd.2d, Vn.2d, Vm.2d + 011 01110 10 1 m 100001 n d SUB Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 100001 n d SUB Vd.8h, Vn.8h, Vm.8h + 010 01110 01 1 m 110101 n d FADD Vd.2d, Vn.2d, Vm.2d + 010 01110 11 1 m 110101 n d FSUB Vd.2d, Vn.2d, Vm.2d + 011 01110 01 1 m 110111 n d FMUL Vd.2d, Vn.2d, Vm.2d + 011 01110 01 1 m 111111 n d FDIV Vd.2d, Vn.2d, Vm.2d + */ + UInt vD = qregNo(i->ARM64in.VBinV.dst); + UInt vN = qregNo(i->ARM64in.VBinV.argL); + UInt vM = qregNo(i->ARM64in.VBinV.argR); + switch (i->ARM64in.VBinV.op) { + case ARM64vecb_ADD64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X100001, vN, vD); + break; + case ARM64vecb_FADD64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X110101, vN, vD); + break; + case ARM64vecb_FSUB64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X110101, vN, vD); + break; + case ARM64vecb_FMUL64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X110111, vN, vD); + break; + case ARM64vecb_FDIV64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X111111, vN, vD); + break; + default: + goto bad; + } + goto done; + } + case ARM64in_VNarrowV: { + /* 31 23 21 15 9 4 + 000 01110 00 1,00001 001010 n d XTN Vd.8b, Vn.8h + 000 01110 01 1,00001 001010 n d XTN Vd.4h, Vn.4s + 000 01110 10 1,00001 001010 n d XTN Vd.2s, Vn.2d + */ + UInt vD = qregNo(i->ARM64in.VNarrowV.dst); + UInt vN = qregNo(i->ARM64in.VNarrowV.src); + UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; + vassert(dszBlg2 >= 0 && dszBlg2 <= 2); + *p++ = X_3_8_5_6_5_5(X000, X01110001 | (dszBlg2 << 1), + X00001, X001010, vN, vD); + goto done; + } //ZZ case ARMin_VAluS: { //ZZ UInt dN = fregNo(i->ARMin.VAluS.argL); //ZZ UInt dD = fregNo(i->ARMin.VAluS.dst); diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 6a52377d73..c3a63c8368 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -119,7 +119,7 @@ typedef typedef enum { - ARM64am_RI9=1, /* reg + simm9 */ + 
ARM64am_RI9=10, /* reg + simm9 */ ARM64am_RI12, /* reg + uimm12 * szB (iow, scaled by access size) */ ARM64am_RR /* reg1 + reg2 */ } @@ -155,8 +155,8 @@ extern ARM64AMode* ARM64AMode_RR ( HReg base, HReg index ); typedef enum { - ARM64riA_I12=4, /* uimm12 << 0 or 12 only */ - ARM64riA_R /* reg */ + ARM64riA_I12=20, /* uimm12 << 0 or 12 only */ + ARM64riA_R /* reg */ } ARM64RIATag; @@ -212,7 +212,7 @@ extern ARM64RIL* ARM64RIL_R ( HReg ); typedef enum { - ARM64ri6_I6=8, /* uimm6, 1 .. 63 only */ + ARM64ri6_I6=30, /* uimm6, 1 .. 63 only */ ARM64ri6_R /* reg */ } ARM64RI6Tag; @@ -239,7 +239,7 @@ extern ARM64RI6* ARM64RI6_R ( HReg ); typedef enum { - ARM64lo_AND=10, + ARM64lo_AND=40, ARM64lo_OR, ARM64lo_XOR } @@ -247,7 +247,7 @@ typedef typedef enum { - ARM64sh_SHL=13, + ARM64sh_SHL=50, ARM64sh_SHR, ARM64sh_SAR } @@ -255,7 +255,7 @@ typedef typedef enum { - ARM64un_NEG=16, + ARM64un_NEG=60, ARM64un_NOT, ARM64un_CLZ, } @@ -263,7 +263,7 @@ typedef typedef enum { - ARM64mul_PLAIN=60, /* lo64(64 * 64) */ + ARM64mul_PLAIN=70, /* lo64(64 * 64) */ ARM64mul_ZX, /* hi64(64 *u 64) */ ARM64mul_SX /* hi64(64 *s 64) */ } @@ -273,7 +273,7 @@ typedef /* These characterise an integer-FP conversion, but don't imply any particular direction. */ enum { - ARM64cvt_F32_I32S=65, + ARM64cvt_F32_I32S=80, ARM64cvt_F64_I32S, ARM64cvt_F32_I64S, ARM64cvt_F64_I64S, @@ -287,7 +287,7 @@ typedef typedef enum { - ARM64fpb_ADD=75, + ARM64fpb_ADD=100, ARM64fpb_SUB, ARM64fpb_MUL, ARM64fpb_DIV, @@ -297,7 +297,7 @@ typedef typedef enum { - ARM64fpu_NEG=82, + ARM64fpu_NEG=110, ARM64fpu_ABS, ARM64fpu_SQRT, ARM64fpu_RINT, @@ -305,6 +305,22 @@ typedef } ARM64FpUnaryOp; +typedef + enum { + ARM64vecb_ADD64x2=120, + ARM64vecb_ADD32x4, + ARM64vecb_ADD16x8, + ARM64vecb_SUB64x2, + ARM64vecb_SUB32x4, + ARM64vecb_SUB16x8, + ARM64vecb_FADD64x2, + ARM64vecb_FSUB64x2, + ARM64vecb_FMUL64x2, + ARM64vecb_FDIV64x2, + ARM64vecb_INVALID + } + ARM64VecBinOp; + //ZZ extern const HChar* showARMVfpUnaryOp ( ARMVfpUnaryOp op ); //ZZ //ZZ typedef @@ -470,7 +486,7 @@ typedef ARM64in_Mul, //ZZ ARMin_LdrEX, //ZZ ARMin_StrEX, - /* vector */ + /* ARM64in_V*: scalar ops involving vector registers */ ARM64in_VLdStS, /* 32-bit FP load/store, with imm offset */ ARM64in_VLdStD, /* 64-bit FP load/store, with imm offset */ ARM64in_VLdStQ, @@ -484,6 +500,9 @@ typedef ARM64in_VCmpD, ARM64in_VCmpS, ARM64in_FPCR, + /* ARM64in_V*V: vector ops on vector registers */ + ARM64in_VBinV, + ARM64in_VNarrowV, //ZZ ARMin_VAluS, //ZZ ARMin_VCMovD, //ZZ ARMin_VCMovS, @@ -749,6 +768,20 @@ typedef Bool toFPCR; HReg iReg; } FPCR; + /* binary vector operation on vector registers */ + struct { + ARM64VecBinOp op; + HReg dst; + HReg argL; + HReg argR; + } VBinV; + /* vector narrowing, Q -> Q. Result goes in the bottom half + of dst and the top half is zeroed out. Iow is XTN. 
*/ + struct { + UInt dszBlg2; // 0: 16to8_x8 1: 32to16_x4 2: 64to32_x2 + HReg dst; // Q reg + HReg src; // Q reg + } VNarrowV; //ZZ /* 32-bit FP binary arithmetic */ //ZZ struct { //ZZ ARMVfpOp op; @@ -949,6 +982,8 @@ extern ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ); +extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ); //ZZ extern ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg, HReg, HReg ); //ZZ extern ARMInstr* ARMInstr_VCMovD ( ARMCondCode, HReg dst, HReg src ); //ZZ extern ARMInstr* ARMInstr_VCMovS ( ARMCondCode, HReg dst, HReg src ); diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 9852fe3ab5..125497e0c4 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -2118,7 +2118,21 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) ARM64sh_SAR)); return dst; } - + case Iop_NarrowUn32to16x4: + case Iop_NarrowUn64to32x2: { + HReg src = iselV128Expr(env, e->Iex.Unop.arg); + HReg tmp = newVRegV(env); + HReg dst = newVRegI(env); + UInt dszBlg2 = 3; /* illegal */ + switch (e->Iex.Unop.op) { + case Iop_NarrowUn32to16x4: dszBlg2 = 1; break; // 32to16_x4 + case Iop_NarrowUn64to32x2: dszBlg2 = 2; break; // 64to32_x2 + default: vassert(0); + } + addInstr(env, ARM64Instr_VNarrowV(dszBlg2, tmp, src)); + addInstr(env, ARM64Instr_VXfromQ(dst, tmp, 0/*laneNo*/)); + return dst; + } //ZZ case Iop_64HIto32: { //ZZ HReg rHi, rLo; //ZZ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg); @@ -4835,49 +4849,24 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) //ZZ case Iop_Add8x16: //ZZ case Iop_Add16x8: //ZZ case Iop_Add32x4: -//ZZ case Iop_Add64x2: { -//ZZ /* -//ZZ FIXME: remove this if not used -//ZZ DECLARE_PATTERN(p_vrhadd_32sx4); -//ZZ ULong one = (1LL << 32) | 1LL; -//ZZ DEFINE_PATTERN(p_vrhadd_32sx4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_SarN32x4, -//ZZ bind(0), -//ZZ mkU8(1)), -//ZZ binop(Iop_SarN32x4, -//ZZ bind(1), -//ZZ mkU8(1))), -//ZZ binop(Iop_SarN32x4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_AndV128, -//ZZ bind(0), -//ZZ mkU128(one)), -//ZZ binop(Iop_AndV128, -//ZZ bind(1), -//ZZ mkU128(one))), -//ZZ mkU128(one)), -//ZZ mkU8(1)))); -//ZZ */ -//ZZ HReg res = newVRegV(env); -//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); -//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); -//ZZ UInt size; -//ZZ switch (e->Iex.Binop.op) { -//ZZ case Iop_Add8x16: size = 0; break; -//ZZ case Iop_Add16x8: size = 1; break; -//ZZ case Iop_Add32x4: size = 2; break; -//ZZ case Iop_Add64x2: size = 3; break; -//ZZ default: -//ZZ ppIROp(e->Iex.Binop.op); -//ZZ vpanic("Illegal element size in VADD"); -//ZZ } -//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VADD, -//ZZ res, argL, argR, size, True)); -//ZZ return res; -//ZZ } + case Iop_Add64x2: + case Iop_Sub64x2: + case Iop_Sub32x4: + case Iop_Sub16x8: { + HReg res = newVRegV(env); + HReg argL = iselV128Expr(env, e->Iex.Binop.arg1); + HReg argR = iselV128Expr(env, e->Iex.Binop.arg2); + ARM64VecBinOp op = ARM64vecb_INVALID; + switch (e->Iex.Binop.op) { + case Iop_Add64x2: op = ARM64vecb_ADD64x2; break; + case Iop_Sub64x2: op = ARM64vecb_SUB64x2; break; + case Iop_Sub32x4: op = ARM64vecb_SUB32x4; break; + case Iop_Sub16x8: op = ARM64vecb_SUB16x8; break; 
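+ /* cannot happen: the enclosing case list admits only the four ops handled above */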
+ default: vassert(0); + } + addInstr(env, ARM64Instr_VBinV(op, res, argL, argR)); + return res; + } //ZZ case Iop_Add32Fx4: { //ZZ HReg res = newVRegV(env); //ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); @@ -5750,9 +5739,25 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) } /* switch on the binop */ } /* if (e->tag == Iex_Binop) */ -//ZZ if (e->tag == Iex_Triop) { -//ZZ IRTriop *triop = e->Iex.Triop.details; -//ZZ + if (e->tag == Iex_Triop) { + IRTriop* triop = e->Iex.Triop.details; + ARM64VecBinOp vecbop = ARM64vecb_INVALID; + switch (triop->op) { + case Iop_Add64Fx2: vecbop = ARM64vecb_FADD64x2; break; + case Iop_Sub64Fx2: vecbop = ARM64vecb_FSUB64x2; break; + case Iop_Mul64Fx2: vecbop = ARM64vecb_FMUL64x2; break; + case Iop_Div64Fx2: vecbop = ARM64vecb_FDIV64x2; break; + default: break; + } + if (vecbop != ARM64vecb_INVALID) { + HReg argL = iselV128Expr(env, triop->arg2); + HReg argR = iselV128Expr(env, triop->arg3); + HReg dst = newVRegV(env); + set_FPCR_rounding_mode(env, triop->arg1); + addInstr(env, ARM64Instr_VBinV(vecbop, dst, argL, argR)); + return dst; + } + //ZZ switch (triop->op) { //ZZ case Iop_ExtractV128: { //ZZ HReg res = newVRegV(env); @@ -5776,8 +5781,8 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) //ZZ default: //ZZ break; //ZZ } -//ZZ } -//ZZ + } + //ZZ if (e->tag == Iex_ITE) { // VFD //ZZ ARMCondCode cc; //ZZ HReg r1 = iselNeonExpr(env, e->Iex.ITE.iftrue); diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 33e795fc13..0cbb118ff2 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -694,6 +694,7 @@ void ppIROp ( IROp op ) case Iop_64UtoV128: vex_printf("64UtoV128"); return; case Iop_SetV128lo64: vex_printf("SetV128lo64"); return; + case Iop_ZeroHI64: vex_printf("ZeroHI64"); return; case Iop_32UtoV128: vex_printf("32UtoV128"); return; case Iop_V128to32: vex_printf("V128to32"); return; diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 32936d3c5d..cef10c1c8f 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1363,6 +1363,9 @@ typedef Iop_64UtoV128, Iop_SetV128lo64, + /* Copies lower 64 bits, zeroes out upper 64 bits. */ + Iop_ZeroHI64, // :: V128 -> V128 + /* 32 <-> 128 bit vector */ Iop_32UtoV128, Iop_V128to32, // :: V128 -> I32, lowest lane
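
Two of the guest-side helpers introduced by this patch lend themselves to a
quick standalone check.  The sketch below is plain C, not VEX code, and the
names laneOffset and dup8To64 are illustrative only; it mirrors the
lane-offset rule used by offsetQRegLane and the shift-and-OR doubling scheme
used by math_DUP_TO_64:

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   /* offsetQRegLane's rule: on a little-endian host, lane laneNo of
      size laneSzB bytes lives at byte offset laneNo * laneSzB inside
      the 16-byte Q register, and must fit entirely within it. */
   static unsigned laneOffset ( unsigned laneSzB, unsigned laneNo )
   {
      unsigned minOff = laneNo * laneSzB;
      unsigned maxOff = minOff + laneSzB - 1;
      assert(maxOff < 16);
      return minOff;
   }

   /* math_DUP_TO_64's scheme: replicate an 8-bit lane across 64 bits
      by doubling the populated width at each step (8 -> 16 -> 32 -> 64).
      16- and 32-bit lanes need correspondingly fewer steps. */
   static uint64_t dup8To64 ( uint8_t lane )
   {
      uint64_t t = lane;   /* bits 7:0 populated, the rest zero */
      t |= t << 8;         /* 16 bits populated */
      t |= t << 16;        /* 32 bits populated */
      t |= t << 32;        /* all 64 bits populated */
      return t;
   }

   int main ( void )
   {
      assert(laneOffset(4, 3) == 12);  /* s[3] of a 4s vector */
      assert(laneOffset(8, 1) == 8);   /* d[1] of a 2d vector */
      assert(dup8To64(0xAB) == 0xABABABABABABABABULL);
      printf("all checks passed\n");
      return 0;
   }

Both properties are relied on elsewhere in the patch: the DUP (element,
vector) case builds its 128-bit result from two copies of the
math_DUP_TO_64 output via Iop_64HLtoV128, and the LD1/ST1 and XTN cases
address the two I64 lanes of a Qreg at offsets 0 and 8.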