From: Pranav Gorantla Date: Thu, 29 May 2025 13:02:24 +0000 (+0200) Subject: i386: Use Shuffles instead of shifts for Reduction in AMD znver4/5 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5080d98a383de244a7b78ae50456fd41881268c2;p=thirdparty%2Fgcc.git i386: Use Shuffles instead of shifts for Reduction in AMD znver4/5 On AMD znver4 and znver5 targets, vpshufd and vpsrldq have latencies of 1 and 2 and throughputs of 4 (2 on znver4) and 2, respectively. It is better to generate shuffles instead of shifts wherever possible. In this patch we try to generate an appropriate shuffle instruction to copy the higher half to the lower half instead of a simple right shift during horizontal vector reduction. gcc/ChangeLog: * config/i386/i386-expand.cc (emit_reduc_half): Use shuffles to generate reduc half for V4SI, similar modes. * config/i386/i386.h (TARGET_SSE_REDUCTION_PREFER_PSHUF): New Macro. * config/i386/x86-tune.def (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF): New tuning. gcc/testsuite/ChangeLog: * gcc.target/i386/reduc-pshuf.c: New test. 
--- diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 7fd03c88630..181e64a86bf 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -18724,6 +18724,33 @@ emit_reduc_half (rtx dest, rtx src, int i) case E_V8HFmode: case E_V4SImode: case E_V2DImode: + if (TARGET_SSE_REDUCTION_PREFER_PSHUF) + { + if (i == 128) + { + d = gen_reg_rtx (V4SImode); + tem = gen_sse2_pshufd_1 ( + d, force_reg (V4SImode, gen_lowpart (V4SImode, src)), + GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3)); + break; + } + else if (i == 64) + { + d = gen_reg_rtx (V4SImode); + tem = gen_sse2_pshufd_1 ( + d, force_reg (V4SImode, gen_lowpart (V4SImode, src)), + GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1)); + break; + } + else if (i == 32) + { + d = gen_reg_rtx (V8HImode); + tem = gen_sse2_pshuflw_1 ( + d, force_reg (V8HImode, gen_lowpart (V8HImode, src)), + GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1)); + break; + } + } d = gen_reg_rtx (V1TImode); tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), GEN_INT (i / 2)); diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index ccc62fc3e7c..d32d9ad997e 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -490,7 +490,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_SSE_MOVCC_USE_BLENDV \ ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] #define TARGET_ALIGN_TIGHT_LOOPS \ - ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] + ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] +#define TARGET_SSE_REDUCTION_PREFER_PSHUF \ + ix86_tune_features[X86_TUNE_SSE_REDUCTION_PREFER_PSHUF] /* Feature tests against the various architecture variations. 
*/ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index e6044c6032e..91cdca7fbfc 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -572,6 +572,11 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, "sse_movcc_use_blendv", ~m_CORE_ATOM) +/* X86_TUNE_SSE_REDUCTION_PREFER_PSHUF: Prefer pshuf to reduce V16QI, + V8HI, V8HF, V4SI, V2DI modes when lshr is costlier. */ +DEF_TUNE (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF, + "sse_reduction_prefer_pshuf", m_ZNVER4 | m_ZNVER5) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc/testsuite/gcc.target/i386/reduc-pshuf.c b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c new file mode 100644 index 00000000000..e46d2bab9c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=znver5 " } */ + +#define N 32 +#define T short +T +foo (T *a) +{ + T sum = 0; + for (int i = 0; i < N; i++) + sum += a[i]; + return sum; +} + +/* { dg-final { scan-assembler-times "vpsrl" 0 } } */ +/* { dg-final { scan-assembler-times "vpshuf" 3 } } */