From: Julian Seward Date: Mon, 11 Mar 2024 16:53:14 +0000 (+0100) Subject: Handle gcc __builtin_strcmp using 128/256 bit vectors with sse4.1, avx/avx2 X-Git-Tag: VALGRIND_3_23_0~109 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b8ad8a0774c4be2d2260bfb6a11a660334dba3e8;p=thirdparty%2Fvalgrind.git Handle gcc __builtin_strcmp using 128/256 bit vectors with sse4.1, avx/avx2 * amd64 front end: redo the translation into IR for PTEST, so as to use only IROps which we know Memcheck can do exact instrumentation for. Handling for both the 128- and 256-bit cases is has been changed. * ir_opt.c: add some constant folding rules to support the above. In particular, for the case `ptest %reg, %reg` (the same reg twice), we want rflags.C to be set to a defined-1 even if %reg is completely undefined. Doing that requires folding `x and not(x)` to zero when x has type V128 or V256. * memcheck/tests/amd64/rh2257546_{128,256}.c: new test cases https://bugzilla.redhat.com/show_bug.cgi?id=2257546 --- diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 0414aa5c50..d7c25042d2 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -16826,13 +16826,18 @@ static Long dis_VBLENDV_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta, static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign ) { - /* Set Z=1 iff (vecE & vecG) == 0 - Set C=1 iff (vecE & not vecG) == 0 + /* Set Z=1 iff (vecE & vecG) == 0--(128)--0 + Set C=1 iff (vecE & not vecG) == 0--(128)--0 + + For the case `sign == 0`, be careful to use only IROps that can be + instrumented exactly by memcheck. This is because PTEST is used for + __builtin_strcmp in gcc14. See + https://bugzilla.redhat.com/show_bug.cgi?id=2257546 */ /* andV, andnV: vecE & vecG, vecE and not(vecG) */ - /* andV resp. andnV, reduced to 64-bit values, by or-ing the top + /* andV resp. andnV, are reduced to 64-bit values by or-ing the top and bottom 64-bits together. It relies on this trick: InterleaveLO64x2([a,b],[c,d]) == [b,d] hence @@ -16862,11 +16867,13 @@ static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign ) binop(Iop_InterleaveHI64x2, mkexpr(andnV), mkexpr(andnV))))); + // Make z64 and c64 be either all-0s or all-1s IRTemp z64 = newTemp(Ity_I64); IRTemp c64 = newTemp(Ity_I64); + if (sign == 64) { - /* When only interested in the most significant bit, just shift - arithmetically right and negate. */ + /* When only interested in the most significant bit, just copy bit 63 + into all bit positions, then invert. */ assign(z64, unop(Iop_Not64, binop(Iop_Sar64, mkexpr(and64), mkU8(63)))); @@ -16874,37 +16881,28 @@ static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign ) assign(c64, unop(Iop_Not64, binop(Iop_Sar64, mkexpr(andn64), mkU8(63)))); - } else { - if (sign == 32) { - /* When interested in bit 31 and bit 63, mask those bits and - fallthrough into the PTEST handling. */ - IRTemp t0 = newTemp(Ity_I64); - IRTemp t1 = newTemp(Ity_I64); - IRTemp t2 = newTemp(Ity_I64); - assign(t0, mkU64(0x8000000080000000ULL)); - assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0))); - assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0))); - and64 = t1; - andn64 = t2; - } - /* Now convert and64, andn64 to all-zeroes or all-1s, so we can - slice out the Z and C bits conveniently. We use the standard - trick all-zeroes -> all-zeroes, anything-else -> all-ones - done by "(x | -x) >>s (word-size - 1)". - */ + } else if (sign == 32) { + /* If we're interested into bits 63 and 31, OR bit 31 into bit 63, copy + bit 63 into all bit positions, then invert. */ + IRTemp and3264 = newTemp(Ity_I64); + assign(and3264, binop(Iop_Or64, mkexpr(and64), + binop(Iop_Shl64, mkexpr(and64), mkU8(32)))); assign(z64, unop(Iop_Not64, - binop(Iop_Sar64, - binop(Iop_Or64, - binop(Iop_Sub64, mkU64(0), mkexpr(and64)), - mkexpr(and64)), mkU8(63)))); + binop(Iop_Sar64, mkexpr(and3264), mkU8(63)))); + IRTemp andn3264 = newTemp(Ity_I64); + assign(andn3264, binop(Iop_Or64, mkexpr(andn64), + binop(Iop_Shl64, mkexpr(andn64), mkU8(32)))); assign(c64, unop(Iop_Not64, - binop(Iop_Sar64, - binop(Iop_Or64, - binop(Iop_Sub64, mkU64(0), mkexpr(andn64)), - mkexpr(andn64)), mkU8(63)))); + binop(Iop_Sar64, mkexpr(andn3264), mkU8(63)))); + } else { + vassert(sign == 0); + assign(z64, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(and64), mkU64(0)), + mkU64(~0ULL), mkU64(0ULL))); + assign(c64, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(andn64), mkU64(0)), + mkU64(~0ULL), mkU64(0ULL))); } /* And finally, slice out the Z and C flags and set the flags @@ -16966,9 +16964,7 @@ static Long dis_xTESTy_128 ( const VexAbiInfo* vbi, Prefix pfx, IRTemp andnV = newTemp(Ity_V128); assign(andV, binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG))); assign(andnV, binop(Iop_AndV128, - mkexpr(vecE), - binop(Iop_XorV128, mkexpr(vecG), - mkV128(0xFFFF)))); + mkexpr(vecE), unop(Iop_NotV128, mkexpr(vecG)))); finish_xTESTy ( andV, andnV, sign ); return delta; diff --git a/VEX/priv/ir_opt.c b/VEX/priv/ir_opt.c index f918e9f858..6453f4fdf5 100644 --- a/VEX/priv/ir_opt.c +++ b/VEX/priv/ir_opt.c @@ -1693,6 +1693,17 @@ static IRExpr* fold_Expr_WRK ( IRExpr** env, IRExpr* e ) break; } + /* Similarly .. */ + case Iop_V256toV128_0: case Iop_V256toV128_1: { + UInt v256 = e->Iex.Unop.arg->Iex.Const.con->Ico.V256; + if (v256 == 0x00000000) { + e2 = IRExpr_Const(IRConst_V128(0x0000)); + } else { + goto unhandled; + } + break; + } + case Iop_ZeroHI64ofV128: { /* Could do better here -- only need to look at the bottom 64 bits of the argument, really. */ @@ -2129,6 +2140,8 @@ static IRExpr* fold_Expr_WRK ( IRExpr** env, IRExpr* e ) } /* -- V128 stuff -- */ + case Iop_InterleaveLO64x2: // This, and the HI64, are created + case Iop_InterleaveHI64x2: // by the amd64 PTEST translation case Iop_InterleaveLO8x16: { /* This turns up a lot in Memcheck instrumentation of Icc generated code. I don't know why. */ @@ -2321,6 +2334,27 @@ static IRExpr* fold_Expr_WRK ( IRExpr** env, IRExpr* e ) break; } } + /* AndV128( x, NotV128( x ) ) ==> 0...0 and + AndV256( x, NotV256( x ) ) ==> 0...0 + This is generated by amd64 `ptest %xmmReg, %xmmReg` + (the same reg both times) + See https://bugzilla.redhat.com/show_bug.cgi?id=2257546 */ + if (e->Iex.Binop.op == Iop_AndV128 + || e->Iex.Binop.op == Iop_AndV256) { + Bool isV256 = e->Iex.Binop.op == Iop_AndV256; + IRExpr* x1 = chase(env, e->Iex.Binop.arg1); + IRExpr* rhs = chase(env, e->Iex.Binop.arg2); + if (x1 && rhs + && rhs->tag == Iex_Unop + && rhs->Iex.Unop.op == (isV256 ? Iop_NotV256 + : Iop_NotV128)) { + IRExpr* x2 = chase(env, rhs->Iex.Unop.arg); + if (x2 && sameIRExprs(env, x1, x2)) { + e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op); + break; + } + } + } break; case Iop_OrV128: diff --git a/memcheck/tests/amd64/Makefile.am b/memcheck/tests/amd64/Makefile.am index 0d2812dd89..906b8f393b 100644 --- a/memcheck/tests/amd64/Makefile.am +++ b/memcheck/tests/amd64/Makefile.am @@ -20,6 +20,10 @@ EXTRA_DIST = \ insn-pmovmskb.stderr.exp-clang \ more_x87_fp.stderr.exp more_x87_fp.stdout.exp more_x87_fp.vgtest \ pcmpgt.stderr.exp pcmpgt.vgtest \ + rh2257546_128.vgtest \ + rh2257546_128.stderr.exp rh2257546_128.stdout.exp \ + rh2257546_256.vgtest \ + rh2257546_256.stderr.exp rh2257546_256.stdout.exp \ sh-mem-vec128-plo-no.vgtest \ sh-mem-vec128-plo-no.stderr.exp \ sh-mem-vec128-plo-no.stdout.exp \ @@ -46,12 +50,13 @@ check_PROGRAMS = \ insn-bsfl \ insn-pmovmskb \ pcmpgt \ + rh2257546_128 \ sh-mem-vec128 \ sse_memory \ xor-undef-amd64 if BUILD_AVX_TESTS - check_PROGRAMS += sh-mem-vec256 xsave-avx + check_PROGRAMS += rh2257546_256 sh-mem-vec256 xsave-avx endif if HAVE_ASM_CONSTRAINT_P check_PROGRAMS += insn-pcmpistri diff --git a/memcheck/tests/amd64/rh2257546_128.c b/memcheck/tests/amd64/rh2257546_128.c new file mode 100644 index 0000000000..a405aa7751 --- /dev/null +++ b/memcheck/tests/amd64/rh2257546_128.c @@ -0,0 +1,32 @@ + +// This should run on memcheck without reporting an undef-value error. +// See https://bugzilla.redhat.com/show_bug.cgi?id=2257546 + +#include +#include + +int main ( void ) +{ + char* c1 = malloc(16); + c1[0] = 'x'; c1[1] = 'y'; c1[2] = 'x'; c1[3] = 0; + + char* c2 = "foobarxyzzyfoobarzyzzy"; // strlen > 16 + + long long int res; + __asm__ __volatile__( + "movdqu (%1), %%xmm4" "\n\t" + "movdqu (%2), %%xmm5" "\n\t" + "pxor %%xmm4, %%xmm5" "\n\t" + "ptest %%xmm5, %%xmm5" "\n\t" + "je zzz1f" "\n\t" + "mov $99, %0" "\n\t" + "jmp zzzafter" "\n" + "zzz1f:" "\n\t" + "mov $88, %0" "\n" + "zzzafter:" "\n\t" + : /*OUT*/"=r"(res) : /*IN*/"r"(c1),"r"(c2) : /*TRASH*/"xmm4","xmm5","cc" + ); + printf("res = %lld\n", res); + free(c1); + return 0; +} diff --git a/memcheck/tests/amd64/rh2257546_128.stderr.exp b/memcheck/tests/amd64/rh2257546_128.stderr.exp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/memcheck/tests/amd64/rh2257546_128.stdout.exp b/memcheck/tests/amd64/rh2257546_128.stdout.exp new file mode 100644 index 0000000000..4541310897 --- /dev/null +++ b/memcheck/tests/amd64/rh2257546_128.stdout.exp @@ -0,0 +1 @@ +res = 99 diff --git a/memcheck/tests/amd64/rh2257546_128.vgtest b/memcheck/tests/amd64/rh2257546_128.vgtest new file mode 100644 index 0000000000..94414cd6fb --- /dev/null +++ b/memcheck/tests/amd64/rh2257546_128.vgtest @@ -0,0 +1,2 @@ +prog: rh2257546_128 +vgopts: -q diff --git a/memcheck/tests/amd64/rh2257546_256.c b/memcheck/tests/amd64/rh2257546_256.c new file mode 100644 index 0000000000..235005ca6f --- /dev/null +++ b/memcheck/tests/amd64/rh2257546_256.c @@ -0,0 +1,32 @@ + +// This should run on memcheck without reporting an undef-value error. +// See https://bugzilla.redhat.com/show_bug.cgi?id=2257546 + +#include +#include + +int main ( void ) +{ + char* c1 = malloc(32); + c1[0] = 'x'; c1[1] = 'y'; c1[2] = 'x'; c1[3] = 0; + + char* c2 = "foobarxyzzyfoobarzyzzyandawholelotmoretoo"; // strlen > 32 + + long long int res; + __asm__ __volatile__( + "vmovdqu (%1), %%ymm4" "\n\t" + "vmovdqu (%2), %%ymm5" "\n\t" + "vpxor %%ymm4, %%ymm5, %%ymm5" "\n\t" + "vptest %%ymm5, %%ymm5" "\n\t" + "je zzz1f" "\n\t" + "mov $99, %0" "\n\t" + "jmp zzzafter" "\n" + "zzz1f:" "\n\t" + "mov $88, %0" "\n" + "zzzafter:" "\n\t" + : /*OUT*/"=r"(res) : /*IN*/"r"(c1),"r"(c2) : /*TRASH*/"ymm4","ymm5","cc" + ); + printf("res = %lld\n", res); + free(c1); + return 0; +} diff --git a/memcheck/tests/amd64/rh2257546_256.stderr.exp b/memcheck/tests/amd64/rh2257546_256.stderr.exp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/memcheck/tests/amd64/rh2257546_256.stdout.exp b/memcheck/tests/amd64/rh2257546_256.stdout.exp new file mode 100644 index 0000000000..4541310897 --- /dev/null +++ b/memcheck/tests/amd64/rh2257546_256.stdout.exp @@ -0,0 +1 @@ +res = 99 diff --git a/memcheck/tests/amd64/rh2257546_256.vgtest b/memcheck/tests/amd64/rh2257546_256.vgtest new file mode 100644 index 0000000000..86eef8fe26 --- /dev/null +++ b/memcheck/tests/amd64/rh2257546_256.vgtest @@ -0,0 +1,3 @@ +prereq: test -e rh2257546_256 +prog: rh2257546_256 +vgopts: -q