git.ipfire.org Git - thirdparty/gcc.git/commitdiff
x86_cse: Convert CONST_VECTOR load to constant integer load
author H.J. Lu <hjl.tools@gmail.com>
Wed, 29 Apr 2026 11:50:38 +0000 (19:50 +0800)
committer H.J. Lu <hjl.tools@gmail.com>
Thu, 30 Apr 2026 08:21:45 +0000 (16:21 +0800)
Convert a CONST_VECTOR load no larger than an integer register:

  (set (reg:V2SI 106)
       (const_vector:V2SI [(const_int 1 [1]) repeated x2]))

to constant integer load:

  (set (subreg:DI (reg:V2SI 106 [ _20 ]) 0)
       (const_int 4294967297 [0x100000001]))

and keep the redundant constant integer loads.  Generate a zero
CONST_VECTOR load, which works for both MMX and XMM registers.

Tested on Linux/x86-64 and Linux/i686.

gcc/

PR target/125026
PR target/125032
* config/i386/i386-features.cc (ix86_place_single_vector_set):
Don't check CONST_VECTOR load size.
(replace_vector_const): Handle constant integer load.
(x86_cse::x86_cse): Convert CONST_VECTOR load no larger than
integer to constant integer load and keep redundant constant
integer load.  Generate zero CONST_VECTOR load.

gcc/testsuite/

PR target/125026
PR target/125032
* gcc.target/i386/pr125026.c: New test.
* gcc.target/i386/pr125032-1.c: Likewise.
* gcc.target/i386/pr125032-2.c: Likewise.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
gcc/config/i386/i386-features.cc
gcc/testsuite/gcc.target/i386/pr125026.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/pr125032-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/pr125032-2.c [new file with mode: 0644]

index 63f9dcc9f93f3e1a77a41910e5f81329630c9809..ce5f0e9c178974df1627451083e6a1e75c5b91a3 100644 (file)
@@ -3321,16 +3321,10 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
        }
     }
 
-  /* CONST_VECTOR load no larger than integer register
-
-     (set (reg:V2QI 294)
-         (const_vector:V2QI [(const_int 0 [0]) repeated x2]))
-
-     can use integer load.  */
+  /* NB: CONST_VECTOR load is generated and handled in x86_cse.  */
   if (load
-      && load->kind == X86_CSE_VEC_DUP
-      && (!CONST_VECTOR_P (src)
-         || GET_MODE_SIZE (GET_MODE (dest)) > UNITS_PER_WORD))
+      && !CONST_VECTOR_P (src)
+      && load->kind == X86_CSE_VEC_DUP)
     {
       /* Get the source from LOAD as (reg:SI 99) in
 
@@ -3644,7 +3638,9 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
 
       rtx replace;
       /* Replace the source operand with VECTOR_CONST.  */
-      if (SUBREG_P (src) || mode == vector_mode)
+      if (SUBREG_P (src)
+         || mode == vector_mode
+         || CONST_INT_P (vector_const))
        replace = vector_const;
       else
        {
@@ -3686,6 +3682,11 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
          print_rtl_single (dump_file, insn);
        }
       SET_SRC (set) = replace;
+      if (CONST_INT_P (replace))
+       {
+         dest = gen_rtx_SUBREG (scalar_mode, dest, 0);
+         SET_DEST (set) = dest;
+       }
       /* Drop possible dead definitions.  */
       PATTERN (insn) = set;
       INSN_CODE (insn) = -1;
@@ -4701,7 +4702,8 @@ pass_x86_cse::x86_cse (void)
     if (load->count >= load->threshold)
       {
        machine_mode mode;
-       rtx reg, broadcast_source, broadcast_reg;
+       rtx reg, broadcast_reg;
+       rtx broadcast_source = nullptr;
        replaced = true;
        switch (load->kind)
          {
@@ -4716,9 +4718,61 @@ pass_x86_cse::x86_cse (void)
            load->broadcast_reg = broadcast_reg;
            break;
 
+         case X86_CSE_VEC_DUP:
+           if (CONST_INT_P (load->val)
+               && (load->val == CONST0_RTX (load->mode)
+                   || load->size <= UNITS_PER_WORD))
+             {
+               /* Generate CONST_VECTOR load.  */
+               mode = ix86_get_vector_cse_mode (load->size,
+                                                load->mode);
+
+               if (load->val == CONST0_RTX (load->mode))
+                 broadcast_source = CONST0_RTX (mode);
+               else if (load->val == CONSTM1_RTX (load->mode))
+                 broadcast_source = CONSTM1_RTX (mode);
+               else
+                 {
+                   int nunits = GET_MODE_NUNITS (mode);
+                   rtvec v = rtvec_alloc (nunits);
+                   for (int j = 0; j < nunits ; j++)
+                     RTVEC_ELT (v, j) = load->val;
+                   broadcast_source = gen_rtx_CONST_VECTOR (mode, v);
+                 }
+
+               /* NB: Zero CONST_VECTOR load works for MMX and XMM
+                  registers.  */
+               if (load->size <= UNITS_PER_WORD)
+                 {
+                   /* Convert CONST_VECTOR load no larger than integer
+                      register:
+
+                      (set (reg:V2SI 106)
+                           (const_vector:V2SI [(const_int 1 [1]) repeated x2]))
+
+                      to constant integer load:
+
+                      (set (subreg:DI (reg:V2SI 106 [ _20 ]) 0)
+                           (const_int 4294967297 [0x100000001]))
+                      */
+                   machine_mode int_mode
+                     = int_mode_for_mode (mode).require ();
+                   broadcast_source = simplify_subreg (int_mode,
+                                                       broadcast_source,
+                                                       mode, 0);
+                   gcc_assert (broadcast_source != nullptr);
+                   replace_vector_const (mode, broadcast_source,
+                                         load->insns, int_mode);
+                   /* Keep redundant constant integer load.  */
+                   load->broadcast_source = nullptr;
+                   load->broadcast_reg = nullptr;
+                   break;
+                 }
+             }
+           /* FALLTHRU */
+
          case X86_CSE_CONST0_VECTOR:
          case X86_CSE_CONSTM1_VECTOR:
-         case X86_CSE_VEC_DUP:
            mode = ix86_get_vector_cse_mode (load->size, load->mode);
            broadcast_reg = gen_reg_rtx (mode);
            if (load->def_insn)
@@ -4743,18 +4797,7 @@ pass_x86_cse::x86_cse (void)
                  broadcast_source = CONSTM1_RTX (mode);
                  break;
                case X86_CSE_VEC_DUP:
-                 if (CONST_INT_P (load->val)
-                     && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
-                   {
-                     /* CONST_VECTOR load no larger than integer
-                        register size can use integer load.  */
-                     int nunits = GET_MODE_NUNITS (mode);
-                     rtvec v = rtvec_alloc (nunits);
-                     for (int j = 0; j < nunits ; j++)
-                       RTVEC_ELT (v, j) = load->val;
-                     broadcast_source = gen_rtx_CONST_VECTOR (mode, v);
-                   }
-                 else
+                 if (!broadcast_source)
                    {
                      reg = gen_reg_rtx (load->mode);
                      broadcast_source = gen_rtx_VEC_DUPLICATE (mode,
@@ -4844,9 +4887,13 @@ pass_x86_cse::x86_cse (void)
                                              updated_gnu_tls_insns,
                                              updated_gnu2_tls_insns);
                  break;
+               case X86_CSE_VEC_DUP:
+                 /* Keep redundant constant integer load.  */
+                 if (!load->broadcast_reg)
+                   break;
+                 /* FALLTHRU */
                case X86_CSE_CONST0_VECTOR:
                case X86_CSE_CONSTM1_VECTOR:
-               case X86_CSE_VEC_DUP:
                  ix86_place_single_vector_set (load->broadcast_reg,
                                                load->broadcast_source,
                                                load->bbs,
diff --git a/gcc/testsuite/gcc.target/i386/pr125026.c b/gcc/testsuite/gcc.target/i386/pr125026.c
new file mode 100644 (file)
index 0000000..96ac6a9
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64" } */
+
+extern void a(int[]);
+int b;
+int d(int e, volatile int f) {
+  b = f - e;
+  int t[2] = {b, b};
+  a(t);
+}
+void g(int h[1]) {
+  if (d(0, 1))
+    h[0] = 0;
+  d(0, 1);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[ \\t\]+\\\$4294967297, %r\[a-z0-9\]+" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "xmm" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr125032-1.c b/gcc/testsuite/gcc.target/i386/pr125032-1.c
new file mode 100644 (file)
index 0000000..7c54bab
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { ia32 && pie } } } */
+/* { dg-options "-O2 -march=i686 -mmmx -fPIE" } */
+
+typedef int __m64 __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+int mmx_composite_over_n_8_0565_info_0, mmx_composite_over_n_8_0565_w;
+long long mmx_composite_over_n_8_0565_m3;
+__m64 mmx_composite_over_n_8_0565_v2, mmx_composite_over_n_8_0565_v3;
+__m64 in_over(__m64 src)
+{
+  __m64 __m1 = src, __m2;
+  return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+__m64 load8888()
+{
+  __m64 __m2, __m1;
+  return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+void mmx_composite_over_n_8_0565()
+{
+  __m64 vsrc = load8888();
+  mmx_composite_over_n_8_0565_w = mmx_composite_over_n_8_0565_info_0;
+  while (mmx_composite_over_n_8_0565_info_0)
+    if (mmx_composite_over_n_8_0565_m3)
+      {
+       mmx_composite_over_n_8_0565_v2 = in_over(vsrc);
+       mmx_composite_over_n_8_0565_v3 = in_over(vsrc);
+      }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr125032-2.c b/gcc/testsuite/gcc.target/i386/pr125032-2.c
new file mode 100644 (file)
index 0000000..71b072a
--- /dev/null
@@ -0,0 +1,15 @@
+/* { dg-do compile { target fpic } } */
+/* { dg-options "-O2 -fPIC" } */
+
+long _HMAC_SHA256_Init_Klen;
+char _crypt_HMAC_SHA256_Init_pad[64];
+char _crypt_HMAC_SHA256_Init_pad_0, _crypt_HMAC_SHA256_Init_K_0;
+void _crypt_HMAC_SHA256_Init_i() {
+  if (_HMAC_SHA256_Init_Klen)
+    _HMAC_SHA256_Init_Klen = 2;
+  long __trans_tmp_1 =
+      __builtin_dynamic_object_size(_crypt_HMAC_SHA256_Init_pad, 0);
+  __builtin___memset_chk(_crypt_HMAC_SHA256_Init_pad, 2, 64, __trans_tmp_1);
+  for (; _HMAC_SHA256_Init_Klen;)
+    _crypt_HMAC_SHA256_Init_pad_0 ^= _crypt_HMAC_SHA256_Init_K_0;
+}