match.pd: Improve fneg/fadd optimization [PR109240]

author Jakub Jelinek <jakub@redhat.com>
Tue, 18 Apr 2023 09:01:47 +0000 (11:01 +0200)
committer Jakub Jelinek <jakub@redhat.com>
Tue, 18 Apr 2023 09:01:47 +0000 (11:01 +0200)
diff --git a/gcc/match.pd b/gcc/match.pd
index c5d2c36e117298782bc154eaed1b381bf3418701..b7d7a5db6a968d321aa2c029b0d7fd750d077975 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8074,63 +8074,76 @@ and,
     under IEEE 754 the fneg of the wider type will negate every even entry
     and when doing an add we get a sub of the even and add of every odd
     elements.  */
-(simplify
- (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
- (if (!VECTOR_INTEGER_TYPE_P (type)
-      && !FLOAT_WORDS_BIG_ENDIAN)
-  (with
-   {
-     /* Build a vector of integers from the tree mask.  */
-     vec_perm_builder builder;
-   }
-   (if (tree_to_vec_perm_builder (&builder, @2))
-    (with
-     {
-       /* Create a vec_perm_indices for the integer vector.  */
-       poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
-       vec_perm_indices sel (builder, 2, nelts);
-       machine_mode vec_mode = TYPE_MODE (type);
-       machine_mode wide_mode;
-       scalar_mode wide_elt_mode;
-       poly_uint64 wide_nunits;
-       scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
-     }
-     (if (sel.series_p (0, 2, 0, 2)
-         && sel.series_p (1, 2, nelts + 1, 2)
-         && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
-         && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
-         && related_vector_mode (vec_mode, wide_elt_mode,
-                                 wide_nunits).exists (&wide_mode))
-       (with
-        {
-          tree stype
-            = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
-                                              TYPE_UNSIGNED (type));
-          tree ntype = build_vector_type_for_mode (stype, wide_mode);
-
-          /* The format has to be a non-extended ieee format.  */
-          const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
-          const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
-        }
-        (if (TYPE_MODE (stype) != BLKmode
-             && VECTOR_TYPE_P (ntype)
-             && fmt_old != NULL
-             && fmt_new != NULL)
-         (with
-          {
-            /* If the target doesn't support v1xx vectors, try using
-               scalar mode xx instead.  */
+(for plusminus (plus minus)
+     minusplus (minus plus)
+ (simplify
+  (vec_perm (plusminus @0 @1) (minusplus @2 @3) VECTOR_CST@4)
+   (if (!VECTOR_INTEGER_TYPE_P (type)
+       && !FLOAT_WORDS_BIG_ENDIAN
+        /* plus is commutative, while minus is not, so :c can't be used.
+          Do equality comparisons by hand and at the end pick the operands
+          from the minus.  */
+       && (operand_equal_p (@0, @2, 0)
+           ? operand_equal_p (@1, @3, 0)
+           : operand_equal_p (@0, @3, 0) && operand_equal_p (@1, @2, 0)))
+   (with
+    {
+      /* Build a vector of integers from the tree mask.  */
+      vec_perm_builder builder;
+    }
+    (if (tree_to_vec_perm_builder (&builder, @4))
+     (with
+      {
+       /* Create a vec_perm_indices for the integer vector.  */
+       poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+       vec_perm_indices sel (builder, 2, nelts);
+       machine_mode vec_mode = TYPE_MODE (type);
+       machine_mode wide_mode;
+       scalar_mode wide_elt_mode;
+       poly_uint64 wide_nunits;
+       scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
+      }
+      (if (sel.series_p (0, 2, 0, 2)
+          && sel.series_p (1, 2, nelts + 1, 2)
+          && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
+          && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
+          && related_vector_mode (vec_mode, wide_elt_mode,
+                                  wide_nunits).exists (&wide_mode))
+       (with
+       {
+         tree stype
+           = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
+                                             TYPE_UNSIGNED (type));
+         tree ntype = build_vector_type_for_mode (stype, wide_mode);
+
+         /* The format has to be a non-extended ieee format.  */
+         const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
+         const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
+       }
+       (if (TYPE_MODE (stype) != BLKmode
+            && VECTOR_TYPE_P (ntype)
+            && fmt_old != NULL
+            && fmt_new != NULL)
+        (with
+         {
+           /* If the target doesn't support v1xx vectors, try using
+              scalar mode xx instead.  */
             if (known_eq (GET_MODE_NUNITS (wide_mode), 1)
                 && !target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))
               ntype = stype;
-          }
-          (if (fmt_new->signbit_rw
-               == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
-               && fmt_new->signbit_rw == fmt_new->signbit_ro
-               && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
-               && ((optimize_vectors_before_lowering_p () && VECTOR_TYPE_P (ntype))
-                   || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
-           (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))))))
+         }
+         (if (fmt_new->signbit_rw
+              == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
+              && fmt_new->signbit_rw == fmt_new->signbit_ro
+              && targetm.can_change_mode_class (TYPE_MODE (ntype),
+                                                TYPE_MODE (type), ALL_REGS)
+              && ((optimize_vectors_before_lowering_p ()
+                   && VECTOR_TYPE_P (ntype))
+                  || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
+          (if (plusminus == PLUS_EXPR)
+           (plus (view_convert:type (negate (view_convert:ntype @3))) @2)
+           (minus @0 (view_convert:type
+                       (negate (view_convert:ntype @1))))))))))))))))
  
  (simplify
   (vec_perm @0 @1 VECTOR_CST@2)
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c
new file mode 100644
index 0000000..87424c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/* 
+** f1:
+** ...
+**     fneg    v[0-9]+.2d, v[0-9]+.2d
+**     fsub    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+**     fneg    v[0-9]+.4s, v[0-9]+.4s
+**     fsub    v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/* 
+** e1:
+** ...
+**     fsub    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**     fadd    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**     ins     v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c
new file mode 100644
index 0000000..5b9406a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+** f1:
+** ...
+**     fneg    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+**     fsub    z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+**     fneg    z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+**     fsub    z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** ...
+*/ 
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/*
+** e1:
+** ...
+**     fadd    z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
+**     movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+**     fsub    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
author	Jakub Jelinek <jakub@redhat.com>
	Tue, 18 Apr 2023 09:01:47 +0000 (11:01 +0200)
committer	Jakub Jelinek <jakub@redhat.com>
	Tue, 18 Apr 2023 09:01:47 +0000 (11:01 +0200)
gcc/match.pd		patch | blob | blame | history
gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c	[new file with mode: 0644]	patch | blob
gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c	[new file with mode: 0644]	patch | blob