]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
i386: Use Shuffles instead of shifts for Reduction in AMD znver4/5
authorPranav Gorantla <Pranav.Gorantla@amd.com>
Thu, 29 May 2025 13:02:24 +0000 (15:02 +0200)
committerJan Hubicka <hubicka@ucw.cz>
Thu, 29 May 2025 13:03:23 +0000 (15:03 +0200)
On AMD znver4 and znver5 targets, vpshufd and vpsrldq have latencies of
1 and 2 and throughputs of 4 (2 on znver4) and 2, respectively. It is
therefore better to generate shuffles instead of shifts wherever possible.
This patch generates an appropriate shuffle instruction to copy the upper
half to the lower half, instead of a simple right shift, during horizontal
vector reduction.

gcc/ChangeLog:

* config/i386/i386-expand.cc (emit_reduc_half): Use shuffles to
generate reduc half for V4SI, similar modes.
* config/i386/i386.h (TARGET_SSE_REDUCTION_PREFER_PSHUF): New Macro.
* config/i386/x86-tune.def (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF):
New tuning.

gcc/testsuite/ChangeLog:

* gcc.target/i386/reduc-pshuf.c: New test.

gcc/config/i386/i386-expand.cc
gcc/config/i386/i386.h
gcc/config/i386/x86-tune.def
gcc/testsuite/gcc.target/i386/reduc-pshuf.c [new file with mode: 0644]

index 7fd03c88630ff0fff1db9f10949dcf1d0d2b46bc..181e64a86bf6d4e30ca0bb73f9ebee6f43ee65dd 100644 (file)
@@ -18724,6 +18724,33 @@ emit_reduc_half (rtx dest, rtx src, int i)
     case E_V8HFmode:
     case E_V4SImode:
     case E_V2DImode:
+      if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
+       {
+         if (i == 128)
+           {
+             d = gen_reg_rtx (V4SImode);
+             tem = gen_sse2_pshufd_1 (
+                 d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
+                 GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
+             break;
+           }
+         else if (i == 64)
+           {
+             d = gen_reg_rtx (V4SImode);
+             tem = gen_sse2_pshufd_1 (
+                 d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
+                 GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
+             break;
+           }
+         else if (i == 32)
+           {
+             d = gen_reg_rtx (V8HImode);
+             tem = gen_sse2_pshuflw_1 (
+                 d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
+                 GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
+             break;
+           }
+       }
       d = gen_reg_rtx (V1TImode);
       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
                                GEN_INT (i / 2));
index ccc62fc3e7ca53c86ec7e541ea6a42b5a7eab92a..d32d9ad997e677e617654e9e8ba0de6872eb32f0 100644 (file)
@@ -490,7 +490,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_SSE_MOVCC_USE_BLENDV \
        ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
 #define TARGET_ALIGN_TIGHT_LOOPS \
-        ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS]
+       ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS]
+#define TARGET_SSE_REDUCTION_PREFER_PSHUF \
+       ix86_tune_features[X86_TUNE_SSE_REDUCTION_PREFER_PSHUF]
 
 
 /* Feature tests against the various architecture variations.  */
index e6044c6032e4bbee2fc550f25e085c3253fa21bb..91cdca7fbfc26ceb221048dea26cc6b1f9c952a4 100644 (file)
@@ -572,6 +572,11 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
 DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV,
          "sse_movcc_use_blendv", ~m_CORE_ATOM)
 
+/* X86_TUNE_SSE_REDUCTION_PREFER_PSHUF: Prefer pshuf to reduce V16QI,
+   V8HI, V8HF, V4SI, V2DI modes when lshr is costlier.  */
+DEF_TUNE (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF,
+   "sse_reduction_prefer_pshuf", m_ZNVER4 | m_ZNVER5)
+
 /*****************************************************************************/
 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 /*****************************************************************************/
diff --git a/gcc/testsuite/gcc.target/i386/reduc-pshuf.c b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c
new file mode 100644 (file)
index 0000000..e46d2ba
--- /dev/null
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 " } */
+
+#define N 32
+#define T short
+T
+foo (T *a)
+{
+  T sum = 0;
+  for (int i = 0; i < N; i++)
+    sum += a[i];
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times "vpsrl" 0 } } */
+/* { dg-final { scan-assembler-times "vpshuf" 3 } } */