From: Julian Seward
Date: Tue, 26 Aug 2014 18:35:13 +0000 (+0000)
Subject: Handle the new IROps introduced to support ARM64 SIMD.
X-Git-Tag: svn/VALGRIND_3_10_0~141
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3af5a88ca61182af2c5b7e40a65797ffbba78bc7;p=thirdparty%2Fvalgrind.git

Handle the new IROps introduced to support ARM64 SIMD.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@14362
---

diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 073cff0400..fef5b49b17 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -48,6 +48,13 @@
    Check the interpretation for vector narrowing and widening ops,
    particularly the saturating ones.  I suspect they are either
    overly pessimistic and/or wrong.
+
+   Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
+   saturating shifts): the interpretation is overly pessimistic.
+   See comments on the relevant cases below for details.
+
+   Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
+   both rounding and non-rounding variants): ditto
 */
 
 /* This file implements the Memcheck instrumentation, and in
@@ -790,6 +797,31 @@ static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
       return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
    }
 
+   if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
+      /* Use InterleaveHI64x2 to copy the top half of the vector into
+         the bottom half.  Then we can UifU it with the original, throw
+         away the upper half of the result, and PCast-I64-to-I64
+         the lower half. */
+      // Generates vbits[127:64] : vbits[127:64]
+      IRAtom* hi64hi64
+         = assignNew('V', mce, Ity_V128,
+                     binop(Iop_InterleaveHI64x2, vbits, vbits));
+      // Generates
+      //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
+      //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
+      IRAtom* lohi64
+         = mkUifUV128(mce, hi64hi64, vbits);
+      // Generates UifU(vbits[127:64],vbits[63:0])
+      IRAtom* lo64
+         = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
+      // Generates
+      //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0]) )
+      //   == PCast-to-I64( vbits[127:0] )
+      IRAtom* res
+         = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
+      return res;
+   }
+
    /* Else do it the slow way .. */
    /* First of all, collapse vbits down to a single bit. */
    tmp1 = NULL;
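
The new V128-to-I64 path above computes "all ones if any of the 128 vbits
is undefined, else zero", using the interleave as a cheap way to OR the two
halves together without leaving the V128 domain.  The following stand-alone
C sketch models that dataflow on plain 64-bit words (1-bits meaning
undefined); the type and helper names here are illustrative only, not
Valgrind's:

#include <stdint.h>
#include <stdio.h>

/* A V128 of vbits, modelled as two 64-bit halves.  1 = undefined. */
typedef struct { uint64_t hi, lo; } V128Bits;

/* UifU on vbits is bitwise OR: a bit is undefined if it is undefined
   in either argument. */
static uint64_t uifU64(uint64_t a, uint64_t b) { return a | b; }

/* CmpwNEZ64 as used on vbits: collapse to all ones if any bit is set. */
static uint64_t cmpwNEZ64(uint64_t x) { return x == 0 ? 0 : ~0ULL; }

/* Same dataflow as the new V128 -> I64 case of mkPCastTo. */
static uint64_t pcastV128toI64(V128Bits v)
{
   /* InterleaveHI64x2(v, v) makes hi : hi. */
   V128Bits hihi = { v.hi, v.hi };
   /* UifU with the original gives hi : (hi | lo). */
   V128Bits lohi = { uifU64(hihi.hi, v.hi), uifU64(hihi.lo, v.lo) };
   /* V128to64 keeps the bottom half, which now covers all 128 bits. */
   uint64_t lo64 = lohi.lo;
   /* Pessimise: any undefined bit anywhere poisons the whole result. */
   return cmpwNEZ64(lo64);
}

int main(void)
{
   V128Bits clean = { 0, 0 };           /* fully defined */
   V128Bits dirty = { 1ULL << 40, 0 };  /* one undefined bit, top half */
   printf("%016llx\n", (unsigned long long)pcastV128toI64(clean)); /* 0  */
   printf("%016llx\n", (unsigned long long)pcastV128toI64(dirty)); /* ~0 */
   return 0;
}
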
@@ -857,6 +889,42 @@ static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
    }
 }
 
+/* This is a minor variant.  It takes an arg of some type and returns
+   a value of the same type.  The result consists entirely of Defined
+   (zero) bits except its least significant bit, which is a PCast of
+   the entire argument down to a single bit. */
+static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
+{
+   if (ty == Ity_V128) {
+      /* --- Case for V128 --- */
+      IRAtom* varg128 = varg;
+      // generates: PCast-to-I64(varg128)
+      IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
+      // Now introduce zeros (defined bits) in the top 63 places
+      // generates: Def--(63)--Def PCast-to-I1(varg128)
+      IRAtom* d63pc
+         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
+      // generates: Def--(64)--Def
+      IRAtom* d64
+         = definedOfType(Ity_I64);
+      // generates: Def--(127)--Def PCast-to-I1(varg128)
+      IRAtom* res
+         = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
+      return res;
+   }
+   if (ty == Ity_I64) {
+      /* --- Case for I64 --- */
+      // PCast to 64
+      IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
+      // Zero (Def) out the top 63 bits
+      IRAtom* res
+         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
+      return res;
+   }
+   /*NOTREACHED*/
+   tl_assert(0);
+}
+
 
 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
 
 /* Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
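
In vbits terms, mkPCastXXtoXXlsb returns a value that is fully Defined
except for its least significant bit, which summarises the definedness of
the whole argument.  A minimal runnable model of the I64 case, with an
illustrative (non-Valgrind) name:

#include <stdint.h>
#include <assert.h>

/* Model of mkPCastXXtoXXlsb for Ity_I64: everything Defined (zero
   vbits) except the lsb, which says whether any bit of the argument
   was undefined. */
static uint64_t pcast64to64lsb(uint64_t vbits)
{
   uint64_t pcd = (vbits == 0) ? 0 : ~0ULL;   /* PCast-to-I64 */
   return pcd & 1;                            /* force top 63 bits Defined */
}

int main(void)
{
   assert(pcast64to64lsb(0) == 0);      /* fully defined in, lsb clear */
   assert(pcast64to64lsb(0x80) == 1);   /* any undefined bit sets lsb  */
   return 0;
}
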
@@ -3123,11 +3191,20 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
 
       /* V x V shifts/rotates are done using the standard lazy scheme. */
+      /* For the non-rounding variants of bi-di vector x vector
+         shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
+         But note that this is overly pessimistic, because in fact only
+         the bottom 8 bits of each lane of the second argument are taken
+         into account when shifting.  So really we ought to ignore
+         undefinedness in bits 8 and above of each lane in the
+         second argument. */
       case Iop_Shl8x16:
       case Iop_Shr8x16:
       case Iop_Sar8x16:
       case Iop_Sal8x16:
       case Iop_Rol8x16:
+      case Iop_Sh8Sx16:
+      case Iop_Sh8Ux16:
          return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast8x16(mce,vatom2)
@@ -3138,6 +3215,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_Sar16x8:
       case Iop_Sal16x8:
       case Iop_Rol16x8:
+      case Iop_Sh16Sx8:
+      case Iop_Sh16Ux8:
          return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast16x8(mce,vatom2)
@@ -3148,6 +3227,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_Sar32x4:
       case Iop_Sal32x4:
       case Iop_Rol32x4:
+      case Iop_Sh32Sx4:
+      case Iop_Sh32Ux4:
          return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast32x4(mce,vatom2)
@@ -3158,11 +3239,31 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_Sar64x2:
       case Iop_Sal64x2:
       case Iop_Rol64x2:
+      case Iop_Sh64Sx2:
+      case Iop_Sh64Ux2:
          return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast64x2(mce,vatom2)
                );
 
+      /* For the rounding variants of bi-di vector x vector shifts, the
+         rounding adjustment can cause undefinedness to propagate through
+         the entire lane, in the worst case.  Too complex to handle
+         properly .. just UifU the arguments and then PCast them.
+         Suboptimal but safe. */
+      case Iop_Rsh8Sx16:
+      case Iop_Rsh8Ux16:
+         return binary8Ix16(mce, vatom1, vatom2);
+      case Iop_Rsh16Sx8:
+      case Iop_Rsh16Ux8:
+         return binary16Ix8(mce, vatom1, vatom2);
+      case Iop_Rsh32Sx4:
+      case Iop_Rsh32Ux4:
+         return binary32Ix4(mce, vatom1, vatom2);
+      case Iop_Rsh64Sx2:
+      case Iop_Rsh64Ux2:
+         return binary64Ix2(mce, vatom1, vatom2);
+
       case Iop_F32ToFixed32Ux4_RZ:
       case Iop_F32ToFixed32Sx4_RZ:
       case Iop_Fixed32UToF32x4_RN:
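
The lazy scheme for these V x V shifts UifUs the shifted vbits of the
first argument with a per-lane PCast of the second argument's vbits.  A
single-lane C model of the 8-bit left-shift case (the helper name is
hypothetical) shows both behaviours, including the deliberate pessimism
noted in the comment above:

#include <stdint.h>
#include <stdio.h>

/* One 8-bit lane of the lazy scheme.  vbits1 is the data lane's vbits,
   shiftAmt is the concrete run-time shift amount (atom2, not vbits),
   vbits2 is the shift lane's vbits.  1 = undefined. */
static uint8_t shl8_lane_vbits(uint8_t vbits1, uint8_t shiftAmt,
                               uint8_t vbits2)
{
   /* binop(op, vatom1, atom2): the vbits move with the data. */
   uint8_t shiftedV = (uint8_t)(vbits1 << shiftAmt);
   /* mkPCast8x16 of the second arg: any undefined bit in the shift
      lane poisons the whole result lane -- overly pessimistic, since
      the hardware only consults the bottom bits of the shift lane. */
   uint8_t pcast2 = (vbits2 == 0) ? 0 : 0xFF;
   return shiftedV | pcast2;   /* mkUifUV128 == bitwise OR */
}

int main(void)
{
   /* Defined shift amount: undefinedness just moves with the data. */
   printf("%02x\n", shl8_lane_vbits(0x01, 3, 0x00));  /* prints 08 */
   /* Undefined bit in the shift amount poisons the whole lane. */
   printf("%02x\n", shl8_lane_vbits(0x01, 3, 0x10));  /* prints ff */
   return 0;
}
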
@@ -3191,6 +3292,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_Avg8Sx16:
       case Iop_QAdd8Ux16:
       case Iop_QAdd8Sx16:
+      case Iop_QAddExtUSsatSS8x16:
+      case Iop_QAddExtSUsatUU8x16:
       case Iop_QSal8x16:
       case Iop_QShl8x16:
       case Iop_Add8x16:
@@ -3216,6 +3319,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_Avg16Sx8:
       case Iop_QAdd16Ux8:
       case Iop_QAdd16Sx8:
+      case Iop_QAddExtUSsatSS16x8:
+      case Iop_QAddExtSUsatUU16x8:
       case Iop_QSal16x8:
       case Iop_QShl16x8:
       case Iop_Add16x8:
@@ -3232,6 +3337,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_QAdd32Ux4:
       case Iop_QSub32Sx4:
       case Iop_QSub32Ux4:
+      case Iop_QAddExtUSsatSS32x4:
+      case Iop_QAddExtSUsatUU32x4:
       case Iop_QSal32x4:
       case Iop_QShl32x4:
       case Iop_Avg32Ux4:
@@ -3262,6 +3369,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_QAdd64Sx2:
       case Iop_QSub64Ux2:
       case Iop_QSub64Sx2:
+      case Iop_QAddExtUSsatSS64x2:
+      case Iop_QAddExtSUsatUU64x2:
       case Iop_PolynomialMulAdd64x2:
       case Iop_CipherV128:
       case Iop_CipherLV128:
@@ -3359,6 +3468,80 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
          complainIfUndefined(mce, atom2, NULL);
          return mkPCast32x4(mce, vatom1);
 
+      /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
+         To make this simpler, do the following:
+         * complain if the shift amount (the I8) is undefined
+         * pcast each lane at the wide width
+         * truncate each lane to half width
+         * pcast the resulting 64-bit value to a single bit and use
+           that as the least significant bit of the upper half of the
+           result. */
+      case Iop_QandQShrNnarrow64Uto32Ux2:
+      case Iop_QandQSarNnarrow64Sto32Sx2:
+      case Iop_QandQSarNnarrow64Sto32Ux2:
+      case Iop_QandQRShrNnarrow64Uto32Ux2:
+      case Iop_QandQRSarNnarrow64Sto32Sx2:
+      case Iop_QandQRSarNnarrow64Sto32Ux2:
+      case Iop_QandQShrNnarrow32Uto16Ux4:
+      case Iop_QandQSarNnarrow32Sto16Sx4:
+      case Iop_QandQSarNnarrow32Sto16Ux4:
+      case Iop_QandQRShrNnarrow32Uto16Ux4:
+      case Iop_QandQRSarNnarrow32Sto16Sx4:
+      case Iop_QandQRSarNnarrow32Sto16Ux4:
+      case Iop_QandQShrNnarrow16Uto8Ux8:
+      case Iop_QandQSarNnarrow16Sto8Sx8:
+      case Iop_QandQSarNnarrow16Sto8Ux8:
+      case Iop_QandQRShrNnarrow16Uto8Ux8:
+      case Iop_QandQRSarNnarrow16Sto8Sx8:
+      case Iop_QandQRSarNnarrow16Sto8Ux8:
+      {
+         IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
+         IROp opNarrow = Iop_INVALID;
+         switch (op) {
+            case Iop_QandQShrNnarrow64Uto32Ux2:
+            case Iop_QandQSarNnarrow64Sto32Sx2:
+            case Iop_QandQSarNnarrow64Sto32Ux2:
+            case Iop_QandQRShrNnarrow64Uto32Ux2:
+            case Iop_QandQRSarNnarrow64Sto32Sx2:
+            case Iop_QandQRSarNnarrow64Sto32Ux2:
+               fnPessim = mkPCast64x2;
+               opNarrow = Iop_NarrowUn64to32x2;
+               break;
+            case Iop_QandQShrNnarrow32Uto16Ux4:
+            case Iop_QandQSarNnarrow32Sto16Sx4:
+            case Iop_QandQSarNnarrow32Sto16Ux4:
+            case Iop_QandQRShrNnarrow32Uto16Ux4:
+            case Iop_QandQRSarNnarrow32Sto16Sx4:
+            case Iop_QandQRSarNnarrow32Sto16Ux4:
+               fnPessim = mkPCast32x4;
+               opNarrow = Iop_NarrowUn32to16x4;
+               break;
+            case Iop_QandQShrNnarrow16Uto8Ux8:
+            case Iop_QandQSarNnarrow16Sto8Sx8:
+            case Iop_QandQSarNnarrow16Sto8Ux8:
+            case Iop_QandQRShrNnarrow16Uto8Ux8:
+            case Iop_QandQRSarNnarrow16Sto8Sx8:
+            case Iop_QandQRSarNnarrow16Sto8Ux8:
+               fnPessim = mkPCast16x8;
+               opNarrow = Iop_NarrowUn16to8x8;
+               break;
+            default:
+               tl_assert(0);
+         }
+         complainIfUndefined(mce, atom2, NULL);
+         // Pessimised shift result
+         IRAtom* shV
+            = fnPessim(mce, vatom1);
+         // Narrowed, pessimised shift result
+         IRAtom* shVnarrowed
+            = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
+         // Generates: Def--(63)--Def PCast-to-I1(narrowed)
+         IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
+         // and assemble the result
+         return assignNew('V', mce, Ity_V128,
+                          binop(Iop_64HLtoV128, qV, shVnarrowed));
+      }
+
       case Iop_Mull32Sx2:
       case Iop_Mull32Ux2:
       case Iop_QDMull32Sx2:
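
Putting the narrowing recipe together for, say, the 64-to-32x2 variants:
pessimise each wide lane, narrow, and derive a single Q-flag vbit for the
upper half.  A word-level C sketch of that vbits computation, with invented
helper names:

#include <stdint.h>
#include <stdio.h>

/* PCast of one 64-bit lane of vbits (1 = undefined). */
static uint64_t pcast64lane(uint64_t v) { return v ? ~0ULL : 0; }

/* Vbits dataflow for the (V128, I8) -> V128 Q-and-shift-narrow cases,
   64-to-32x2 flavour.  The shift amount is checked separately
   (complainIfUndefined) so it does not appear here. */
static void qshift_narrow_vbits(uint64_t vhi, uint64_t vlo,
                                uint64_t* resHi, uint64_t* resLo)
{
   /* Pessimise each wide lane of the shifted operand. */
   uint64_t pHi = pcast64lane(vhi), pLo = pcast64lane(vlo);
   /* NarrowUn64to32x2: keep the low 32 bits of each lane. */
   uint64_t narrowed = ((pHi & 0xFFFFFFFFULL) << 32)
                       | (pLo & 0xFFFFFFFFULL);
   /* The Q (saturation) half: one pessimised bit in the lsb of the
      upper 64 bits, everything above it Defined. */
   *resHi = (narrowed != 0) ? 1 : 0;
   *resLo = narrowed;
}

int main(void)
{
   uint64_t qhi, qlo;
   /* One undefined bit in the upper lane poisons that narrowed lane
      and sets the Q-flag vbit. */
   qshift_narrow_vbits(1ULL << 40, 0, &qhi, &qlo);
   printf("%016llx %016llx\n", (unsigned long long)qhi,
                               (unsigned long long)qlo);
   return 0;
}
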
@@ -3977,6 +4160,66 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
                   mkPCast32x8(mce, vatom2)
                );
 
+      /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
+         Handle the shifted results in the same way that other
+         binary Q ops are handled, eg QSub: UifU the two args,
+         then pessimise -- which is binaryNIxM.  But for the upper
+         V128, we need to generate just 1 bit which is the
+         pessimised shift result, with 127 defined zeroes above it.
+
+         Note that this is overly pessimistic in that in fact only the
+         bottom 8 bits of each lane of the second arg determine the shift
+         amount.  Really we ought to ignore any undefinedness in the
+         rest of the lanes of the second arg. */
+      case Iop_QandSQsh64x2:  case Iop_QandUQsh64x2:
+      case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
+      case Iop_QandSQsh32x4:  case Iop_QandUQsh32x4:
+      case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
+      case Iop_QandSQsh16x8:  case Iop_QandUQsh16x8:
+      case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
+      case Iop_QandSQsh8x16:  case Iop_QandUQsh8x16:
+      case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
+      {
+         // The function to generate the pessimised shift result
+         IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
+         switch (op) {
+            case Iop_QandSQsh64x2:
+            case Iop_QandUQsh64x2:
+            case Iop_QandSQRsh64x2:
+            case Iop_QandUQRsh64x2:
+               binaryNIxM = binary64Ix2;
+               break;
+            case Iop_QandSQsh32x4:
+            case Iop_QandUQsh32x4:
+            case Iop_QandSQRsh32x4:
+            case Iop_QandUQRsh32x4:
+               binaryNIxM = binary32Ix4;
+               break;
+            case Iop_QandSQsh16x8:
+            case Iop_QandUQsh16x8:
+            case Iop_QandSQRsh16x8:
+            case Iop_QandUQRsh16x8:
+               binaryNIxM = binary16Ix8;
+               break;
+            case Iop_QandSQsh8x16:
+            case Iop_QandUQsh8x16:
+            case Iop_QandSQRsh8x16:
+            case Iop_QandUQRsh8x16:
+               binaryNIxM = binary8Ix16;
+               break;
+            default:
+               tl_assert(0);
+         }
+         tl_assert(binaryNIxM);
+         // Pessimised shift result, shV[127:0]
+         IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
+         // Generates: Def--(127)--Def PCast-to-I1(shV)
+         IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
+         // and assemble the result
+         return assignNew('V', mce, Ity_V256,
+                          binop(Iop_V128HLtoV256, qV, shV));
+      }
+
       default:
          ppIROp(op);
         VG_(tool_panic)("memcheck:expr2vbits_Binop");
@@ -4047,6 +4290,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_Dup8x16:
       case Iop_Dup16x8:
       case Iop_Dup32x4:
+      case Iop_Reverse1sIn8_x16:
       case Iop_Reverse8sIn16_x8:
       case Iop_Reverse8sIn32_x4:
       case Iop_Reverse16sIn32_x4:
@@ -4232,6 +4476,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_FtoI32Ux4_RZ:
       case Iop_FtoI32Sx4_RZ:
       case Iop_Abs32x4:
+      case Iop_RSqrtEst32Ux4:
         return mkPCast32x4(mce, vatom);
 
       case Iop_CmpwNEZ32:
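
For completeness, the same idea at the (V128, V128) -> V256 level: the
lower V128 of the result vbits is the pessimised per-lane shift result,
and the upper V128 carries a single pessimised bit in its lsb, as produced
by mkPCastXXtoXXlsb.  An illustrative stand-alone C model at 64x2
granularity (names are not Valgrind's):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t hi, lo; } V128Bits;   /* vbits, 1 = undefined */

static uint64_t pcast64(uint64_t v) { return v ? ~0ULL : 0; }

/* binary64Ix2 on vbits: UifU (OR) the args, then pessimise per lane. */
static V128Bits binary64Ix2_vbits(V128Bits a, V128Bits b)
{
   V128Bits r = { pcast64(a.hi | b.hi), pcast64(a.lo | b.lo) };
   return r;
}

/* Whole-op vbits: lower V128 is the pessimised shift result, upper
   V128 is all Defined except one pessimised bit in its lsb (the Q
   flag), mirroring mkPCastXXtoXXlsb on Ity_V128. */
static void qandsh_vbits(V128Bits v1, V128Bits v2,
                         V128Bits* qV, V128Bits* shV)
{
   *shV = binary64Ix2_vbits(v1, v2);
   qV->hi = 0;
   qV->lo = (shV->hi | shV->lo) ? 1 : 0;
}

int main(void)
{
   V128Bits v1 = { 0, 1 };   /* one undefined bit in the low lane */
   V128Bits v2 = { 0, 0 };   /* shift amounts fully defined */
   V128Bits qV, shV;
   qandsh_vbits(v1, v2, &qV, &shV);
   printf("%016llx %016llx\n", (unsigned long long)shV.hi,
                               (unsigned long long)shV.lo); /* 0 ~0 */
   printf("%016llx %016llx\n", (unsigned long long)qV.hi,
                               (unsigned long long)qV.lo);  /* 0 1  */
   return 0;
}
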