From: liuhongt
Date: Tue, 18 Jul 2023 08:32:54 +0000 (+0800)
Subject: Optimize vlddqu + inserti128 to vbroadcasti128
X-Git-Tag: basepoints/gcc-15~7221
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1b446a9760942bfcfbde042290452f0c8f298276;p=thirdparty%2Fgcc.git

Optimize vlddqu + inserti128 to vbroadcasti128

vlddqu + vinserti128 will use shuffle port in addition to load port
comparing to vbroadcasti128, For latency perspective,vbroadcasti is no
worse than vlddqu + vinserti128.

gcc/ChangeLog:

        * config/i386/sse.md (*avx2_lddqu_inserti_to_bcasti): New
        pre_reload define_insn_and_split.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/vlddqu_vinserti128.c: New test.
---

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 51961bbfc0b5..8dea05703569 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -26609,6 +26609,24 @@
    (set_attr "prefix" "vex,evex,evex")
    (set_attr "mode" "OI")])
 
+;; Optimize vlddqu + vinserti128 into vbroadcasti128: the former uses a
+;; shuffle port in addition to the load port, unlike the latter.
+;; In terms of latency, vbroadcasti128 is no worse.
+(define_insn_and_split "avx2_lddqu_inserti_to_bcasti"
+  [(set (match_operand:V4DI 0 "register_operand" "=x,v,v")
+        (vec_concat:V4DI
+          (subreg:V2DI
+            (unspec:V16QI [(match_operand:V16QI 1 "memory_operand")]
+                          UNSPEC_LDDQU) 0)
+          (subreg:V2DI (unspec:V16QI [(match_dup 1)]
+                          UNSPEC_LDDQU) 0)))]
+  "TARGET_AVX2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (vec_concat:V4DI (match_dup 1) (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], V2DImode, 0);")
+
 ;; Modes handled by AVX vec_dup patterns.
(define_mode_iterator AVX_VEC_DUP_MODE [V8SI V8SF V4DI V4DF])
diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
new file mode 100644
index 000000000000..29699a5fa7fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
+/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
+
+#include <immintrin.h>
+__m256i foo(void *data) {
+  __m128i X1 = _mm_lddqu_si128((__m128i*)data);
+  __m256i V1 = _mm256_broadcastsi128_si256 (X1);
+  return V1;
+}