AArch64: Handle copysign (x, -1) expansion efficiently
author    Tamar Christina <tamar.christina@arm.com>
          Thu, 9 Nov 2023 14:04:57 +0000 (14:04 +0000)
committer Tamar Christina <tamar.christina@arm.com>
          Thu, 9 Nov 2023 14:18:52 +0000 (14:18 +0000)
copysign (x, -1) is effectively fneg (abs (x)), which on AArch64 can be
done most efficiently with an OR of the sign bit.
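
As a minimal C sketch of that identity (illustrative only, not part of the
patch; the helper name or_signbit is made up for this example), all three
forms produce the same bit pattern:

  #include <assert.h>
  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  /* Set the sign bit through the integer view of the float, which is the
     bit-level effect of the OR emitted by the new expansion path.  */
  static float
  or_signbit (float x)
  {
    uint32_t u;
    memcpy (&u, &x, sizeof u);
    u |= 0x80000000u;
    memcpy (&x, &u, sizeof x);
    return x;
  }

  int
  main (void)
  {
    float vals[] = { 0.0f, -0.0f, 1.5f, -2.25f, 3.0e38f };
    for (unsigned i = 0; i < sizeof vals / sizeof vals[0]; i++)
      {
        float a = copysignf (vals[i], -1.0f);
        float b = -fabsf (vals[i]);
        float c = or_signbit (vals[i]);
        assert (memcmp (&a, &c, sizeof a) == 0);
        assert (memcmp (&b, &c, sizeof b) == 0);
      }
    return 0;
  }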

The middle-end now canonicalizes fneg (abs (x)) to copysign, so this patch
optimizes the expansion of that canonical form.

If the target has an inclusive OR that takes an immediate, the transformed
instruction is both shorter and faster.  For targets that don't, the immediate
has to be constructed separately, but the sequence still ends up faster because
the immediate construction is not on the critical path.
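
As a hypothetical example of code that exercises the new path (illustrative
only; the function name negabs is made up, and the real testcases live in the
match.pd patch mentioned below), each vector copysign of a negative constant
should now expand to a single OR of the sign bit, assuming the loop vectorizes:

  /* Every element gets the sign of -1.0f, i.e. out[i] = -fabsf (in[i]).  */
  void
  negabs (float *restrict out, const float *restrict in, int n)
  {
    for (int i = 0; i < n; i++)
      out[i] = __builtin_copysignf (in[i], -1.0f);
  }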

Note that this is part of another patch series and the additional testcases
are mutually dependent on the match.pd patch.  As such the tests are added
there instead of here.

gcc/ChangeLog:

	PR tree-optimization/109154
	* config/aarch64/aarch64.md (copysign<GPF:mode>3): Handle
	copysign (x, -1).
	* config/aarch64/aarch64-simd.md (copysign<mode>3): Likewise.
	* config/aarch64/aarch64-sve.md (copysign<mode>3): Likewise.

gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/aarch64.md

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 98c418c54a82a348c597310caa23916f9c16f9b6..c6f2d5828373f2a5272b9d1227bfe34365f9fd09 100644
 (define_expand "copysign<mode>3"
   [(match_operand:VHSDF 0 "register_operand")
    (match_operand:VHSDF 1 "register_operand")
-   (match_operand:VHSDF 2 "register_operand")]
+   (match_operand:VHSDF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  rtx v_bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
+  machine_mode int_mode = <V_INT_EQUIV>mode;
+  rtx v_bitmask = gen_reg_rtx (int_mode);
   int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
 
   emit_move_insn (v_bitmask,
                  aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
                                                     HOST_WIDE_INT_M1U << bits));
+
+  /* copysign (x, -1) should instead be expanded as orr with the sign
+     bit.  */
+  if (!REG_P (operands[2]))
+    {
+      rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
+      if (GET_CODE (op2_elt) == CONST_DOUBLE
+         && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
+       {
+         emit_insn (gen_ior<v_int_equiv>3 (
+           lowpart_subreg (int_mode, operands[0], <MODE>mode),
+           lowpart_subreg (int_mode, operands[1], <MODE>mode), v_bitmask));
+         DONE;
+       }
+    }
+
+  operands[2] = force_reg (<MODE>mode, operands[2]);
   emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], v_bitmask,
                                         operands[2], operands[1]));
   DONE;
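
As a worked example of the constant built above (illustrative only; this
assumes 32-bit float lanes, and the variable name lane_mask is made up),
HOST_WIDE_INT_M1U << bits keeps exactly the per-lane sign bit once it is
truncated to the element width:

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    int bits = 32 - 1;                       /* GET_MODE_UNIT_BITSIZE - 1 for SFmode lanes.  */
    uint32_t lane_mask = (uint32_t) (~0ull << bits);
    printf ("0x%08x\n", (unsigned) lane_mask);  /* Prints 0x80000000: only the sign bit.  */
    return 0;
  }

ORing that mask into the integer view of operand 1 therefore yields
copysign (x, -1) in one instruction instead of a bitmask select.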
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5a652d8536a0ef9461f40da7b22834e683e73ceb..cb07c6166608487ac363eab142f1cd6de1dc4f39 100644
 (define_expand "copysign<mode>3"
   [(match_operand:SVE_FULL_F 0 "register_operand")
    (match_operand:SVE_FULL_F 1 "register_operand")
-   (match_operand:SVE_FULL_F 2 "register_operand")]
+   (match_operand:SVE_FULL_F 2 "nonmemory_operand")]
   "TARGET_SVE"
   {
     rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
     rtx arg1 = lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
     rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
 
-    emit_insn (gen_and<v_int_equiv>3
-              (sign, arg2,
-               aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
-                                                  HOST_WIDE_INT_M1U
-                                                  << bits)));
+    rtx v_sign_bitmask
+      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+                                          HOST_WIDE_INT_M1U << bits);
+
+    /* copysign (x, -1) should instead be expanded as orr with the sign
+       bit.  */
+    if (!REG_P (operands[2]))
+      {
+       rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
+       if (GET_CODE (op2_elt) == CONST_DOUBLE
+           && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
+         {
+           emit_insn (gen_ior<v_int_equiv>3 (int_res, arg1, v_sign_bitmask));
+           emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
+           DONE;
+         }
+      }
+
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+    emit_insn (gen_and<v_int_equiv>3 (sign, arg2, v_sign_bitmask));
     emit_insn (gen_and<v_int_equiv>3
               (mant, arg1,
                aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
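
A scalar C model of the two paths above may help (illustrative only; the
function names and the mask parameter are made up, and the second AND in the
general path uses the complement of the sign-bit mask, which falls outside the
lines shown here):

  #include <stdint.h>

  /* General case: take the sign from arg2 and the magnitude from arg1.  */
  static uint32_t
  copysign_general (uint32_t arg1, uint32_t arg2, uint32_t mask)
  {
    uint32_t sign = arg2 & mask;
    uint32_t mant = arg1 & ~mask;
    return sign | mant;
  }

  /* copysign (x, -1): the sign is known to be set, so one OR suffices.  */
  static uint32_t
  copysign_neg_const (uint32_t arg1, uint32_t mask)
  {
    return arg1 | mask;
  }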
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c6b1506fe7b47dd40741f26ef0cc92692008a631..7be1de38b1c3c19d037ca2b3722812e92704bda9 100644
 (define_expand "copysign<GPF:mode>3"
   [(match_operand:GPF 0 "register_operand")
    (match_operand:GPF 1 "register_operand")
-   (match_operand:GPF 2 "register_operand")]
+   (match_operand:GPF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  rtx bitmask = gen_reg_rtx (<V_INT_EQUIV>mode);
+  machine_mode int_mode = <V_INT_EQUIV>mode;
+  rtx bitmask = gen_reg_rtx (int_mode);
   emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
                                    << (GET_MODE_BITSIZE (<MODE>mode) - 1)));
+  /* copysign (x, -1) should instead be expanded as orr with the sign
+     bit.  */
+  rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
+  if (GET_CODE (op2_elt) == CONST_DOUBLE
+      && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
+    {
+      emit_insn (gen_ior<v_int_equiv>3 (
+       lowpart_subreg (int_mode, operands[0], <MODE>mode),
+       lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
+      DONE;
+    }
+
+  operands[2] = force_reg (<MODE>mode, operands[2]);
   emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
                                       bitmask));
   DONE;
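
The same trick in scalar form, as a hedged C sketch for double (illustrative
only; the helper name copysign_neg1 is made up): for DFmode the GEN_INT
constant above is HOST_WIDE_INT_M1U shifted by 63, i.e. just the sign bit,
and the ior of the integer lowpart sets it directly.

  #include <stdint.h>
  #include <string.h>

  static double
  copysign_neg1 (double x)
  {
    uint64_t u;
    memcpy (&u, &x, sizeof u);
    u |= 1ull << 63;          /* Only the sign bit, matching -1ull << 63.  */
    memcpy (&x, &u, sizeof x);
    return x;                 /* Equals __builtin_copysign (x, -1.0).  */
  }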