aarch64: Add support for unpacked SVE FP comparisons

author Spencer Abson <spencer.abson@arm.com>

Mon, 16 Jun 2025 16:54:04 +0000 (16:54 +0000)

committer Spencer Abson <spencer.abson@arm.com>

Mon, 7 Jul 2025 09:51:30 +0000 (09:51 +0000)
author Spencer Abson <spencer.abson@arm.com>
Mon, 16 Jun 2025 16:54:04 +0000 (16:54 +0000)
committer Spencer Abson <spencer.abson@arm.com>
Mon, 7 Jul 2025 09:51:30 +0000 (09:51 +0000)
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md

index 87ae4cb0402fb2bcc5c853568f0c9409cb772527..6b5113eb70fef0dbf4d8d3c8ea589224938da87f 100644 (file)
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3966,7 +3966,7 @@
  )
  
  ;; Predicated predicate inverse.
-(define_insn "*one_cmpl<mode>3"
+(define_insn "@aarch64_pred_one_cmpl<mode>_z"
    [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
         (and:PRED_ALL
           (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa"))
@@ -8637,8 +8637,8 @@
  (define_expand "vec_cmp<mode><vpred>"
    [(set (match_operand:<VPRED> 0 "register_operand")
         (match_operator:<VPRED> 1 "comparison_operator"
-         [(match_operand:SVE_FULL_F 2 "register_operand")
-          (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]))]
+         [(match_operand:SVE_F 2 "register_operand")
+          (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]))]
    "TARGET_SVE"
    {
      aarch64_expand_sve_vec_cmp_float (operands[0], GET_CODE (operands[1]),
@@ -8651,10 +8651,10 @@
  (define_insn "@aarch64_pred_fcm<cmp_op><mode>"
    [(set (match_operand:<VPRED> 0 "register_operand")
         (unspec:<VPRED>
-         [(match_operand:<VPRED> 1 "register_operand")
+         [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
            (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-          (match_operand:SVE_FULL_F 3 "register_operand")
-          (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+          (match_operand:SVE_F 3 "register_operand")
+          (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
           SVE_COND_FP_CMP_I0))]
    "TARGET_SVE"
    {@ [ cons: =0 , 1   , 3 , 4   ]
@@ -8667,10 +8667,10 @@
  (define_insn "@aarch64_pred_fcmuo<mode>"
    [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
         (unspec:<VPRED>
-         [(match_operand:<VPRED> 1 "register_operand" "Upl")
+         [(match_operand:<VPRED> 1 "aarch64_predicate_operand" "Upl")
            (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-          (match_operand:SVE_FULL_F 3 "register_operand" "w")
-          (match_operand:SVE_FULL_F 4 "register_operand" "w")]
+          (match_operand:SVE_F 3 "register_operand" "w")
+          (match_operand:SVE_F 4 "register_operand" "w")]
           UNSPEC_COND_FCMUO))]
    "TARGET_SVE"
    "fcmuo\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index 0b4cd17c0ef95343b6be4ddd934120d0b3b01984..7960b639f903c77b45fb43757baa129996f043de 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27307,7 +27307,7 @@ aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
                           bool known_ptrue_p, rtx op0, rtx op1)
  {
    rtx flag = gen_int_mode (known_ptrue_p, SImode);
-  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
+  rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
                                gen_rtvec (4, pred, flag, op0, op1),
                                aarch64_unspec_cond_code (code));
    emit_set_insn (target, unspec);
@@ -27326,10 +27326,10 @@ static void
  aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
                               rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
  {
-  machine_mode pred_mode = GET_MODE (pred);
-  rtx tmp1 = gen_reg_rtx (pred_mode);
+  machine_mode target_mode = GET_MODE (target);
+  rtx tmp1 = gen_reg_rtx (target_mode);
    aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
-  rtx tmp2 = gen_reg_rtx (pred_mode);
+  rtx tmp2 = gen_reg_rtx (target_mode);
    aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
    aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
  }
@@ -27346,8 +27346,7 @@ static void
  aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
                                  bool known_ptrue_p, rtx op0, rtx op1)
  {
-  machine_mode pred_mode = GET_MODE (pred);
-  rtx tmp = gen_reg_rtx (pred_mode);
+  rtx tmp = gen_reg_rtx (GET_MODE (target));
    aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
    aarch64_emit_unop (target, one_cmpl_optab, tmp);
  }
@@ -27359,10 +27358,25 @@ aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
  void
  aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
  {
-  machine_mode pred_mode = GET_MODE (target);
    machine_mode data_mode = GET_MODE (op0);
+  rtx pred = aarch64_sve_fp_pred (data_mode, nullptr);
  
-  rtx ptrue = aarch64_ptrue_reg (pred_mode);
+  /* The governing and destination modes.  */
+  machine_mode pred_mode = GET_MODE (pred);
+  machine_mode target_mode = GET_MODE (target);
+
+  /* For partial vector modes, the choice of predicate mode depends
+     on whether we need to suppress exceptions for inactive elements.
+     If we do need to suppress exceptions, the predicate mode matches
+     the element size rather than the container size and the predicate
+     marks the upper bits in each container as inactive.  The predicate
+     is then a ptrue wrt TARGET_MODE but not wrt PRED_MODE.  It is the
+     latter which matters here.
+
+     If we don't need to suppress exceptions, the predicate mode matches
+     the container size, PRED_MODE == TARGET_MODE, and the predicate is
+     thus a ptrue wrt both TARGET_MODE and PRED_MODE.  */
+  bool known_ptrue_p = pred_mode == target_mode;
    switch (code)
      {
      case UNORDERED:
@@ -27376,12 +27390,13 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
      case EQ:
      case NE:
        /* There is native support for the comparison.  */
-      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
+      aarch64_emit_sve_fp_cond (target, code, pred, known_ptrue_p, op0, op1);
        return;
  
      case LTGT:
        /* This is a trapping operation (LT or GT).  */
-      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
+      aarch64_emit_sve_or_fp_conds (target, LT, GT,
+                                   pred, known_ptrue_p, op0, op1);
        return;
  
      case UNEQ:
@@ -27390,7 +27405,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
           /* This would trap for signaling NaNs.  */
           op1 = force_reg (data_mode, op1);
           aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
-                                       ptrue, true, op0, op1);
+                                       pred, known_ptrue_p, op0, op1);
           return;
         }
        /* fall through */
@@ -27400,11 +27415,19 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
      case UNGE:
        if (flag_trapping_math)
         {
-         /* Work out which elements are ordered.  */
-         rtx ordered = gen_reg_rtx (pred_mode);
           op1 = force_reg (data_mode, op1);
-         aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
-                                          ptrue, true, op0, op1);
+
+         /* Work out which elements are unordered.  */
+         rtx uo_tmp = gen_reg_rtx (target_mode);
+         aarch64_emit_sve_fp_cond (uo_tmp, UNORDERED,
+                                   pred, known_ptrue_p, op0, op1);
+
+         /* Invert the result.  Governed by PRED so that we only
+            flip the active bits.  */
+         rtx ordered = gen_reg_rtx (pred_mode);
+         uo_tmp = gen_lowpart (pred_mode, uo_tmp);
+         emit_insn (gen_aarch64_pred_one_cmpl_z (pred_mode, ordered,
+                                                 pred, uo_tmp));
  
           /* Test the opposite condition for the ordered elements,
              then invert the result.  */
@@ -27429,7 +27452,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
  
    /* There is native support for the inverse comparison.  */
    code = reverse_condition_maybe_unordered (code);
-  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
+  aarch64_emit_sve_invert_fp_cond (target, code,
+                                  pred, known_ptrue_p, op0, op1);
  }
  
  /* Return true if:
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c

new file mode 100644 (file)

index 0000000..bf9c127
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
@@ -0,0 +1,602 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 --param=aarch64-autovec-preference=sve-only -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint.h>
+
+#define UNLT(A, B) (!__builtin_isgreaterequal (A, B))
+#define UNLE(A, B) (!__builtin_isgreater (A, B))
+#define UNGT(A, B) (!__builtin_islessequal (A, B))
+#define UNGE(A, B) (!__builtin_isless (A, B))
+#define UNEQ(A, B) (!__builtin_islessgreater (A, B))
+
+#define EQ(A, B) ((A) == (B))
+#define NE(A, B) ((A) != (B))
+#define LE(A, B) ((A) <= (B))
+#define LT(A, B) ((A) < (B))
+#define GE(A, B) ((A) >= (B))
+#define GT(A, B) ((A) > (B))
+#define ORDERED(A, B) (!__builtin_isunordered (A, B))
+#define UNORDERED(A, B) (__builtin_isunordered (A, B))
+
+#define b_i b[i]
+
+#define TEST_FCM(TYPE0, TYPE1, CMP, RHS, COUNT)                        \
+  void                                                         \
+  f_##TYPE0##_##TYPE1##_##CMP##_##RHS (TYPE0 *__restrict out,  \
+                                      TYPE1 *__restrict a,     \
+                                      TYPE1 *__restrict b)     \
+  {                                                            \
+    for (unsigned int i = 0; i < COUNT; i++)                   \
+      out[i] = CMP (a[i], RHS) ? 3 : out[i];                   \
+  }
+
+#define TEST_CC_REG(CMP)                     \
+  TEST_FCM (uint64_t, float, CMP, b_i, 32)    \
+  TEST_FCM (uint32_t, _Float16, CMP, b_i, 64) \
+  TEST_FCM (uint64_t, _Float16, CMP, b_i, 32)
+
+#define TEST_CC_ALL(CMP)                   \
+  TEST_CC_REG (CMP)                        \
+  TEST_FCM (uint64_t, float, CMP, 0, 32)    \
+  TEST_FCM (uint32_t, _Float16, CMP, 0, 64) \
+  TEST_FCM (uint64_t, _Float16, CMP, 0, 32)
+
+
+/*
+** f_uint64_t_float_UNLT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmge   p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNLT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmge   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNLT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmge   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNLT)
+
+/*
+** f_uint64_t_float_UNLE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmgt   p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNLE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmgt   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNLE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmgt   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNLE)
+
+/*
+** f_uint64_t_float_UNGT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmle   p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNGT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmle   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNGT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmle   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNGT)
+
+/*
+** f_uint64_t_float_UNGE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmlt   p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNGE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmlt   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNGE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmlt   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNGE)
+
+/*
+** f_uint64_t_float_UNEQ_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmne   p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNEQ_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmne   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNEQ_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**     not     (p[0-9]+)\.b, \1/z, \2\.b
+**     fcmne   p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNEQ)
+
+/*
+** f_uint64_t_float_EQ_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmeq   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_EQ_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmeq   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_EQ_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmeq   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_EQ_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmeq   p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_EQ_0:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmeq   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_EQ_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmeq   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (EQ)
+
+/*
+** f_uint64_t_float_NE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmne   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_NE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmne   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_NE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmne   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_NE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmne   p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_NE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmne   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_NE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmne   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (NE)
+
+/*
+** f_uint64_t_float_LE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmle   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmle   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmle   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_LE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmle   p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmle   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmle   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (LE)
+
+/*
+** f_uint64_t_float_LT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmlt   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmlt   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmlt   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_LT_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmlt   p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LT_0:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmlt   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LT_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmlt   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (LT)
+
+/*
+** f_uint64_t_float_GE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmge   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmge   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GE_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmge   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_GE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmge   p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmge   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GE_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmge   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (GE)
+
+/*
+** f_uint64_t_float_GT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmgt   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmgt   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GT_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmgt   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_GT_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmgt   p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GT_0:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmgt   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GT_0:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmgt   p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (GT)
+
+/*
+** f_uint64_t_float_ORDERED_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_ORDERED_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmuo   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_ORDERED_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (ORDERED)
+
+/*
+** f_uint64_t_float_UNORDERED_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNORDERED_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.s, all
+**  ...
+**     fcmuo   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNORDERED_b_i:
+**  ...
+**     ptrue   (p[0-9]+)\.d, all
+**  ...
+**     fcmuo   p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNORDERED)
+
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c

new file mode 100644 (file)

index 0000000..ab210da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 --param=aarch64-autovec-preference=sve-only -fno-trapping-math" } */
+
+#include "unpacked_fcm_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 32 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 32 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 32 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
author	Spencer Abson <spencer.abson@arm.com>
	Mon, 16 Jun 2025 16:54:04 +0000 (16:54 +0000)
committer	Spencer Abson <spencer.abson@arm.com>
	Mon, 7 Jul 2025 09:51:30 +0000 (09:51 +0000)
gcc/config/aarch64/aarch64-sve.md		patch \| blob \| blame \| history
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c	[new file with mode: 0644]	patch \| blob