]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Use movss/movsd to implement V4SI/V2DI VEC_PERM on x86.
authorRoger Sayle <roger@nextmovesoftware.com>
Sun, 25 Dec 2022 11:57:12 +0000 (11:57 +0000)
committerRoger Sayle <roger@nextmovesoftware.com>
Sun, 25 Dec 2022 11:57:12 +0000 (11:57 +0000)
This patch tweaks the x86 backend to use the movss and movsd instructions
to perform some vector permutations on integer vectors (V4SI and V2DI) in
the same way they are used for floating point vectors (V4SF and V2DF).

As a motivating example, consider:

typedef unsigned int v4si __attribute__((vector_size(16)));
typedef float v4sf __attribute__((vector_size(16)));
v4si foo(v4si x,v4si y) { return (v4si){y[0],x[1],x[2],x[3]}; }
v4sf bar(v4sf x,v4sf y) { return (v4sf){y[0],x[1],x[2],x[3]}; }

which is currently compiled with -O2 to:

foo: movdqa %xmm0, %xmm2
shufps $80, %xmm0, %xmm1
movdqa %xmm1, %xmm0
shufps $232, %xmm2, %xmm0
ret

bar: movss %xmm1, %xmm0
ret

with this patch both functions compile to the same form.
Likewise for the V2DI case:

typedef unsigned long v2di __attribute__((vector_size(16)));
typedef double v2df __attribute__((vector_size(16)));

v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; }
v2df bar(v2df x,v2df y) { return (v2df){y[0],x[1]}; }

which currently generates:

foo: shufpd $2, %xmm0, %xmm1
movdqa %xmm1, %xmm0
ret

bar: movsd %xmm1, %xmm0
ret

2022-12-25  Roger Sayle  <roger@nextmovesoftware.com>
    Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog
* config/i386/i386-builtin.def (__builtin_ia32_movss): Update
CODE_FOR_sse_movss to CODE_FOR_sse_movss_v4sf.
(__builtin_ia32_movsd): Likewise, update CODE_FOR_sse2_movsd to
CODE_FOR_sse2_movsd_v2df.
* config/i386/i386-expand.cc (split_convert_uns_si_sse): Update
gen_sse_movss call to gen_sse_movss_v4sf, and gen_sse2_movsd call
to gen_sse2_movsd_v2df.
(expand_vec_perm_movs): Also allow V4SImode with TARGET_SSE and
V2DImode with TARGET_SSE2.
* config/i386/sse.md
(avx512fp16_fcmaddcsh_v8hf_mask3<round_expand_name>): Update
gen_sse_movss call to gen_sse_movss_v4sf.
(avx512fp16_fmaddcsh_v8hf_mask3<round_expand_name>): Likewise.
(sse_movss_<mode>): Renamed from sse_movss using VI4F_128 mode
iterator to handle both V4SF and V4SI.
(sse2_movsd_<mode>): Likewise, renamed from sse2_movsd using
VI8F_128 mode iterator to handle both V2DF and V2DI.

gcc/testsuite/ChangeLog
* gcc.target/i386/sse-movss-4.c: New test case.
* gcc.target/i386/sse2-movsd-3.c: New test case.

gcc/config/i386/i386-builtin.def
gcc/config/i386/i386-expand.cc
gcc/config/i386/sse.md
gcc/testsuite/gcc.target/i386/sse-movss-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/sse2-movsd-3.c [new file with mode: 0644]

index d85b1753039c3b4c0794695c6bde66c3669fe1c3..0d1fc342e61c6db97f92dba6c5c5fc6a17b68b54 100644 (file)
@@ -679,7 +679,7 @@ BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_xorv4sf3,  "__builtin_ia32_xorps", IX86_
 
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_copysignv4sf3,  "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movss_v4sf,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movhlps_exp,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movlhps_exp,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
@@ -781,7 +781,7 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_xorv2df3,  "__builtin_ia32_xorpd", IX86
 
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_copysignv2df3,  "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 
-BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
+BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_movsd_v2df,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 
index 0b887e74515f655fafe0c31e8a7badd58cf925a8..b6cc76d8a9cccb78f6dcb02c8a5e59e4c33ca4e7 100644 (file)
@@ -1774,9 +1774,9 @@ ix86_split_convert_uns_si_sse (rtx operands[])
       input = gen_rtx_REG (vecmode, REGNO (input));
       emit_move_insn (value, CONST0_RTX (vecmode));
       if (vecmode == V4SFmode)
-       emit_insn (gen_sse_movss (value, value, input));
+       emit_insn (gen_sse_movss_v4sf (value, value, input));
       else
-       emit_insn (gen_sse2_movsd (value, value, input));
+       emit_insn (gen_sse2_movsd_v2df (value, value, input));
     }
 
   emit_move_insn (large, two31);
@@ -18905,8 +18905,10 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
     return false;
 
   if (!(TARGET_SSE && vmode == V4SFmode)
+      && !(TARGET_SSE && vmode == V4SImode)
       && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
-      && !(TARGET_SSE2 && vmode == V2DFmode))
+      && !(TARGET_SSE2 && vmode == V2DFmode)
+      && !(TARGET_SSE2 && vmode == V2DImode))
     return false;
 
   /* Only the first element is changed.  */
index de632b2d8fe5da02520201d1825f9cb75bc73770..d50627a7d6cbfd26ccd734f7b3415f6647be5e5d 100644 (file)
   if (!MEM_P (operands[3]))
     operands[3] = force_reg (V8HFmode, operands[3]);
   op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
-  emit_insn (gen_sse_movss (dest, op1, op0));
+  emit_insn (gen_sse_movss_v4sf (dest, op1, op0));
   emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode));
   DONE;
 })
   if (!MEM_P (operands[3]))
     operands[3] = force_reg (V8HFmode, operands[3]);
   op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
-  emit_insn (gen_sse_movss (dest, op1, op0));
+  emit_insn (gen_sse_movss_v4sf (dest, op1, op0));
   emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode));
   DONE;
 })
    (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
 
-(define_insn "sse_movss"
-  [(set (match_operand:V4SF 0 "register_operand"   "=x,v")
-       (vec_merge:V4SF
-         (match_operand:V4SF 2 "register_operand" " x,v")
-         (match_operand:V4SF 1 "register_operand" " 0,v")
+(define_insn "sse_movss_<mode>"
+  [(set (match_operand:VI4F_128 0 "register_operand"   "=x,v")
+       (vec_merge:VI4F_128
+         (match_operand:VI4F_128 2 "register_operand" " x,v")
+         (match_operand:VI4F_128 1 "register_operand" " 0,v")
          (const_int 1)))]
   "TARGET_SSE"
   "@
   [(set (match_dup 0) (match_dup 1))]
   "operands[0] = adjust_address (operands[0], DFmode, 0);")
 
-(define_insn "sse2_movsd"
-  [(set (match_operand:V2DF 0 "nonimmediate_operand"   "=x,v,x,v,m,x,x,v,o")
-       (vec_merge:V2DF
-         (match_operand:V2DF 2 "nonimmediate_operand" " x,v,m,m,v,0,0,v,0")
-         (match_operand:V2DF 1 "nonimmediate_operand" " 0,v,0,v,0,x,o,o,v")
+(define_insn "sse2_movsd_<mode>"
+  [(set (match_operand:VI8F_128 0 "nonimmediate_operand"   "=x,v,x,v,m,x,x,v,o")
+       (vec_merge:VI8F_128
+         (match_operand:VI8F_128 2 "nonimmediate_operand" " x,v,m,m,v,0,0,v,0")
+         (match_operand:VI8F_128 1 "nonimmediate_operand" " 0,v,0,v,0,x,o,o,v")
          (const_int 1)))]
   "TARGET_SSE2"
   "@
diff --git a/gcc/testsuite/gcc.target/i386/sse-movss-4.c b/gcc/testsuite/gcc.target/i386/sse-movss-4.c
new file mode 100644 (file)
index 0000000..ec3019c
--- /dev/null
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4si foo(v4si x,v4si y) { return (v4si){y[0],x[1],x[2],x[3]}; }
+v4sf bar(v4sf x,v4sf y) { return (v4sf){y[0],x[1],x[2],x[3]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movss\t" 2 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "vpblendw" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c b/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c
new file mode 100644 (file)
index 0000000..fadbe2b
--- /dev/null
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned long long v2di __attribute__((vector_size(16)));
+typedef double v2df __attribute__((vector_size(16)));
+
+v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; }
+v2df bar(v2df x,v2df y) { return (v2df){y[0],x[1]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movsd\t" 2 } } */
+/* { dg-final { scan-assembler-not "v?shufpd" } } */
+/* { dg-final { scan-assembler-not "movdqa" } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-not "v?punpckldq" } } */
+/* { dg-final { scan-assembler-not "v?movq" } } */