From: Mark Wielaard Date: Thu, 5 Mar 2026 17:49:32 +0000 (+0100) Subject: Add PACKUSDW SSE4.1 support for x86 X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=c651c83f24cc8be21386f019dfd75e9bb04351bc;p=thirdparty%2Fvalgrind.git Add PACKUSDW SSE4.1 support for x86 Add handling of PACKUSDW to VEX/priv/guest_x86_toIR.c based on the guest_amd64_toIR.c implementation. Handle Iop_QNarrowBin32Sto16Ux8 using h_generic_calc_QNarrowBin32Sto16Ux8 in VEX/priv/host_x86_isel.c. Move test_PACKUSDW from none/tests/amd64/sse4-64.c to none/tests/sse4-common.h and add the same test to none/tests/x86/sse4-x86.c with new PACKUSDW output in stdout.exp. https://bugs.kde.org/show_bug.cgi?id=517144 --- diff --git a/NEWS b/NEWS index cd174283c..a474b7b25 100644 --- a/NEWS +++ b/NEWS @@ -198,6 +198,7 @@ are not entered into bugzilla tend to get forgotten about or ignored. 516289 illumos lsframe2 regtest fails 516748 Incorrect use of SET_STATUS_Failure for syscall wrappers that return error codes rather than -1 on error +517144 Add PACKUSDW SSE4.1 support for x86 517455 Add PCMPEQQ SSE4.1 support for x86 517697 Implement CLRSSONSTACK and SETUJMPBUF handling on Solaris. 
517748 Add ability to redirect global functions to Darwin diff --git a/VEX/priv/guest_x86_toIR.c b/VEX/priv/guest_x86_toIR.c index f1637c187..804a1e01d 100644 --- a/VEX/priv/guest_x86_toIR.c +++ b/VEX/priv/guest_x86_toIR.c @@ -13332,6 +13332,39 @@ DisResult disInstr_X86_WRK ( goto decode_success; } + /* 66 0F 38 2B /r - PACKUSDW xmm1, xmm2/m128 + 2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */ + if ( sz == 2 + && insn[0] == 0x0f && insn[1] == 0x38 && insn[2] == 0x2b ) { + modrm = insn[3]; + + IRTemp argL = newTemp(Ity_V128); + IRTemp argR = newTemp(Ity_V128); + + if ( epartIsReg(modrm) ) { + assign( argL, getXMMReg( eregOfRM(modrm) ) ); + delta += 3 + 1; + DIP( "packusdw %s,%s\n", + nameXMMReg( eregOfRM(modrm) ), + nameXMMReg( gregOfRM(modrm) ) ); + } else { + addr = disAMode( &alen, sorb, delta+3, dis_buf ); + gen_SEGV_if_not_16_aligned( addr ); + assign( argL, loadLE( Ity_V128, mkexpr(addr) )); + delta += 3 + alen; + DIP( "packusdw %s,%s\n", + dis_buf, nameXMMReg( gregOfRM(modrm) ) ); + } + + assign(argR, getXMMReg( gregOfRM(modrm) )); + + putXMMReg( gregOfRM(modrm), + binop( Iop_QNarrowBin32Sto16Ux8, + mkexpr(argL), mkexpr(argR)) ); + + goto decode_success; + } + /* 66 0F 38 38 /r - PMINSB xmm1, xmm2/m128 66 0F 38 3C /r - PMAXSB xmm1, xmm2/m128 Minimum/Maximum of Packed Signed Byte Integers (XMM) diff --git a/VEX/priv/host_x86_isel.c b/VEX/priv/host_x86_isel.c index dfff1cec9..b044152b1 100644 --- a/VEX/priv/host_x86_isel.c +++ b/VEX/priv/host_x86_isel.c @@ -3903,6 +3903,9 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16; goto do_SseAssistedVectorAndScalar; + case Iop_QNarrowBin32Sto16Ux8: + fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8; + goto do_SseAssistedBinary; do_SseAssistedBinary: { /* As with the amd64 case (where this is copied from) we generate pretty bad code. 
*/ diff --git a/none/tests/amd64/sse4-64.c b/none/tests/amd64/sse4-64.c index f3d546af5..6d1cdafac 100644 --- a/none/tests/amd64/sse4-64.c +++ b/none/tests/amd64/sse4-64.c @@ -993,31 +993,6 @@ void test_INSERTPS ( void ) } } -void test_PACKUSDW ( void ) -{ - V128 src, dst; - Int i; - for (i = 0; i < 10; i++) { - if (i < 9) { - randV128(&src); - randV128(&dst); - } else { - memset(&src, 0, sizeof(src)); - memset(&dst, 0, sizeof(src)); - src[0] = 0x11; src[1] = 0x22; - src[4] = 0x33; src[5] = 0x44; - src[8] = 0x55; src[9] = 0x66; - src[12] = 0x77; src[13] = 0x88; - dst[0] = 0xaa; dst[1] = 0xbb; - dst[4] = 0xcc; dst[5] = 0xdd; - dst[8] = 0xee; dst[9] = 0xff; - dst[12] = 0xa1; dst[13] = 0xb2; - } - DO_mandr_r("packusdw", src, dst); - } -} - - void test_PEXTRB ( void ) { V128 src; diff --git a/none/tests/sse4-common.h b/none/tests/sse4-common.h index 1139fe9cd..e62691b7c 100644 --- a/none/tests/sse4-common.h +++ b/none/tests/sse4-common.h @@ -1621,4 +1621,29 @@ static inline void test_ROUNDSS_w_mxcsr_rounding ( void ) assert(rm == 0); // 0 == RN == default } +/* PACKUSDW via DO_mandr_r: nine random operand pairs, then one fixed pair. */ +static inline void test_PACKUSDW ( void ) +{ + V128 src, dst; + Int i; + for (i = 0; i < 10; i++) { + if (i < 9) { + randV128(&src); + randV128(&dst); + } else { + memset(&src, 0, sizeof(src)); + memset(&dst, 0, sizeof(dst)); + src[0] = 0x11; src[1] = 0x22; + src[4] = 0x33; src[5] = 0x44; + src[8] = 0x55; src[9] = 0x66; + src[12] = 0x77; src[13] = 0x88; + dst[0] = 0xaa; dst[1] = 0xbb; + dst[4] = 0xcc; dst[5] = 0xdd; + dst[8] = 0xee; dst[9] = 0xff; + dst[12] = 0xa1; dst[13] = 0xb2; + } + DO_mandr_r("packusdw", src, dst); + } +} + #endif /* __SSE4_COMMON_H */ diff --git a/none/tests/x86/sse4-x86.c b/none/tests/x86/sse4-x86.c index 781769cd6..39f3516b2 100644 --- a/none/tests/x86/sse4-x86.c +++ b/none/tests/x86/sse4-x86.c @@ -167,6 +167,7 @@ int main(void) test_ROUNDSD_w_mxcsr_rounding(); test_ROUNDSS_w_mxcsr_rounding(); test_PEXTRD(); + test_PACKUSDW(); return 0; } diff --git a/none/tests/x86/sse4-x86.stdout.exp 
b/none/tests/x86/sse4-x86.stdout.exp index 6fb6b8807..3a79ab919 100644 --- a/none/tests/x86/sse4-x86.stdout.exp +++ b/none/tests/x86/sse4-x86.stdout.exp @@ -2764,3 +2764,23 @@ r pextrd $2 7703dd30953c90570c5d2a681c677080 5555555555555555 00000000953c90 m pextrd $2 7703dd30953c90570c5d2a681c677080 5555555555555555 55555555953c9057 r pextrd $3 7703dd30953c90570c5d2a681c677080 5555555555555555 000000007703dd30 m pextrd $3 7703dd30953c90570c5d2a681c677080 5555555555555555 555555557703dd30 +r packusdw ab12e36c26fabc992a39f4f9e49d4836 7bd6fab1f066f1c39b728b98b7359f05 0000ffffffff0000ffff000000000000 +m packusdw ab12e36c26fabc992a39f4f9e49d4836 7bd6fab1f066f1c39b728b98b7359f05 0000ffffffff0000ffff000000000000 +r packusdw e209d1bd35a2e4bb69102ed3e89ebea3 bdc4f7ac192e2dc6817efc96aaaccb26 0000ffffffff00000000ffff00000000 +m packusdw e209d1bd35a2e4bb69102ed3e89ebea3 bdc4f7ac192e2dc6817efc96aaaccb26 0000ffffffff00000000ffff00000000 +r packusdw c681dcfd9fed458aad85f6301190cf03 9919cf8cac20820d9a4efa4f10dab010 000000000000ffff000000000000ffff +m packusdw c681dcfd9fed458aad85f6301190cf03 9919cf8cac20820d9a4efa4f10dab010 000000000000ffff000000000000ffff +r packusdw b1c6009702681bb3d263c8fe7b7d3782 6b217ebd46c72a43c1ae00ad05ca0aee 0000ffff0000ffffffffffff0000ffff +m packusdw b1c6009702681bb3d263c8fe7b7d3782 6b217ebd46c72a43c1ae00ad05ca0aee 0000ffff0000ffffffffffff0000ffff +r packusdw 012339f9fa9ea4e2b47820294273b14b 8f2701ab82af6115d46a8a9ea68994ed ffff00000000ffff0000000000000000 +m packusdw 012339f9fa9ea4e2b47820294273b14b 8f2701ab82af6115d46a8a9ea68994ed ffff00000000ffff0000000000000000 +r packusdw 10e6848e211b1ac22f8e7b9b847efa8a 617952c2fc64652fae4d140d0f220b39 ffffffffffff0000ffff00000000ffff +m packusdw 10e6848e211b1ac22f8e7b9b847efa8a 617952c2fc64652fae4d140d0f220b39 ffffffffffff0000ffff00000000ffff +r packusdw 3c59dcc3166cba001e7254415aa8ce6d 3d616f6e5273703c2b251ae55aa12bfd ffffffffffffffffffffffffffffffff +m packusdw 3c59dcc3166cba001e7254415aa8ce6d 
3d616f6e5273703c2b251ae55aa12bfd ffffffffffffffffffffffffffffffff +r packusdw dfc93d02731cbf485ff02707e2ffe81e 7f2c531b1d67bfe927bc1914a613af65 0000ffffffff0000ffffffffffff0000 +m packusdw dfc93d02731cbf485ff02707e2ffe81e 7f2c531b1d67bfe927bc1914a613af65 0000ffffffff0000ffffffffffff0000 +r packusdw 5682a3b8d4b76745cdd370da388f06c9 8226fb36fbcd8ee17ddf8c850d83549f ffff00000000ffff00000000ffffffff +m packusdw 5682a3b8d4b76745cdd370da388f06c9 8226fb36fbcd8ee17ddf8c850d83549f ffff00000000ffff00000000ffffffff +r packusdw 00008877000066550000443300002211 0000b2a10000ffee0000ddcc0000bbaa 8877665544332211b2a1ffeeddccbbaa +m packusdw 00008877000066550000443300002211 0000b2a10000ffee0000ddcc0000bbaa 8877665544332211b2a1ffeeddccbbaa