const unsigned int FLAG_READ_MEMORY = 1U << 2;
const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
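+/* Indicates that the builtin reads the FPMR register; its trailing fpm_t
+   argument is moved into FPMR before the instruction executes.  */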
+const unsigned int FLAG_USES_FPMR = 1U << 5;
/* Indicates that READ_FPCR and RAISE_FP_EXCEPTIONS should be set for
floating-point modes but not for integer modes. */
-const unsigned int FLAG_AUTO_FP = 1U << 5;
+const unsigned int FLAG_AUTO_FP = 1U << 6;
const unsigned int FLAG_QUIET = 0;
const unsigned int FLAG_DEFAULT = FLAG_AUTO_FP;
const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
  | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY;
const unsigned int FLAG_LOAD = FLAG_READ_MEMORY;
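+/* FP8 builtins need the usual floating-point flags and also read FPMR.  */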
+const unsigned int FLAG_FP8 = FLAG_FP | FLAG_USES_FPMR;
typedef struct
{
AARCH64_SIMD_BUILTIN_##T##_##N##A,
#undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F) \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F) \
AARCH64_##N,
enum aarch64_builtins
{
binary,
binary_lane,
+ ternary,
+ unary,
};
namespace {
};
namespace simd_types {
+ constexpr simd_type f8 { V8QImode, qualifier_modal_float };
+ constexpr simd_type f8q { V16QImode, qualifier_modal_float };
constexpr simd_type p8 { V8QImode, qualifier_poly };
constexpr simd_type p8q { V16QImode, qualifier_poly };
constexpr simd_type s8 { V8QImode, qualifier_none };
constexpr simd_type f32 { V2SFmode, qualifier_none };
constexpr simd_type f32q { V4SFmode, qualifier_none };
+ constexpr simd_type s32 { V2SImode, qualifier_none };
+ constexpr simd_type s32q { V4SImode, qualifier_none };
+
constexpr simd_type f64q { V2DFmode, qualifier_none };
+ constexpr simd_type s64q { V2DImode, qualifier_none };
constexpr simd_type none { VOIDmode, qualifier_none };
}
}
#undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F) \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F) \
{#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
- simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS, \
- FLAG_##F},
+ simd_types::T2, simd_types::T3, U, \
+ aarch64_required_extensions::REQUIRED_EXTENSIONS, FLAG_##F},
/* Initialize pragma builtins. */
{
const char *name;
aarch64_builtin_signatures signature;
- simd_type types[3];
+ simd_type types[4];
int unspec;
aarch64_required_extensions required_extensions;
unsigned int flags;
for (int i = 1; i <= 2; ++i)
arg_types.quick_push (builtin_data.types[i].type ());
break;
+
+ case aarch64_builtin_signatures::ternary:
+ return_type = builtin_data.types[0].type ();
+ for (int i = 1; i <= 3; ++i)
+ arg_types.quick_push (builtin_data.types[i].type ());
+ break;
+
+ case aarch64_builtin_signatures::unary:
+ return_type = builtin_data.types[0].type ();
+ arg_types.quick_push (builtin_data.types[1].type ());
+ break;
}
switch (builtin_data.signature)
{
default:
break;
}
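+  /* Builtins that read FPMR take the fpm_t value as a trailing 64-bit
+     argument.  */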
+ if (builtin_data.flags & FLAG_USES_FPMR)
+ arg_types.quick_push (uint64_type_node);
return build_function_type_array (return_type, arg_types.length (),
arg_types.address ());
}
return ops[0].value;
}
+/* If OP is a 128-bit vector, convert it to the equivalent 64-bit vector.
+ Do nothing otherwise. */
+static void
+aarch64_convert_to_v64 (expand_operand *op)
+{
+ if (known_eq (GET_MODE_BITSIZE (op->mode), 128u))
+ {
+ op->mode = aarch64_v64_mode (GET_MODE_INNER (op->mode)).require ();
+ op->value = gen_lowpart (op->mode, op->value);
+ }
+}
+
+/* UNSPEC is a high unspec, indicated by "2" in mnemonics and "_high" in
+ intrinsic names. Return the equivalent low unspec. */
+static int
+aarch64_get_low_unspec (int unspec)
+{
+ switch (unspec)
+ {
+ case UNSPEC_FCVTN2_FP8:
+ return UNSPEC_FCVTN_FP8;
+ case UNSPEC_F1CVTL2_FP8:
+ return UNSPEC_F1CVTL_FP8;
+ case UNSPEC_F2CVTL2_FP8:
+ return UNSPEC_F2CVTL_FP8;
+ default:
+ gcc_unreachable ();
+ }
+}
+
/* Expand CALL_EXPR EXP, given that it is a call to the function described
by BUILTIN_DATA, and return the function's return value. Put the result
in TARGET if convenient. */
TYPE_MODE (TREE_TYPE (arg)));
}
- /* LUTI2 treats the first argument as a vector of 4 elements. The forms
- with 128-bit inputs are only provided as a convenience; the upper halves
- don't actually matter. */
- if (builtin_data.unspec == UNSPEC_LUTI2
- && known_eq (GET_MODE_BITSIZE (ops[1].mode), 128u))
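+  /* The fpm_t argument is not an operand of the instruction itself; instead,
+     move its value into the FPMR register, which the FP8 patterns read.  */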
+ if (builtin_data.flags & FLAG_USES_FPMR)
+ {
+ auto fpm_input = ops.pop ().value;
+ auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
+ emit_move_insn (fpmr, fpm_input);
+ }
+
+ switch (builtin_data.unspec)
{
- ops[1].mode = aarch64_v64_mode (GET_MODE_INNER (ops[1].mode)).require ();
- ops[1].value = gen_lowpart (ops[1].mode, ops[1].value);
+ case UNSPEC_F1CVTL_FP8:
+ case UNSPEC_F2CVTL_FP8:
+ /* Convert _low forms (which take 128-bit vectors) to the base
+ 64-bit forms. */
+ aarch64_convert_to_v64 (&ops[1]);
+ break;
+
+ case UNSPEC_LUTI2:
+ /* LUTI2 treats the first argument as a vector of 4 elements. The forms
+ with 128-bit inputs are only provided as a convenience; the upper
+ halves don't actually matter. */
+ aarch64_convert_to_v64 (&ops[1]);
+ break;
}
insn_code icode;
{
case UNSPEC_FAMAX:
case UNSPEC_FAMIN:
- icode = code_for_aarch64 (builtin_data.unspec,
- builtin_data.types[0].mode);
+ case UNSPEC_F1CVTL_FP8:
+ case UNSPEC_F2CVTL_FP8:
+ case UNSPEC_FSCALE:
+ icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode);
+ break;
+
+ case UNSPEC_F1CVTL2_FP8:
+ case UNSPEC_F2CVTL2_FP8:
+ {
+ /* Add a high-part selector for the vec_merge. */
+ auto src_mode = ops.last ().mode;
+ auto nunits = GET_MODE_NUNITS (src_mode).to_constant ();
+ rtx par = aarch64_simd_vect_par_cnst_half (src_mode, nunits, true);
+ create_fixed_operand (ops.safe_push ({}), par);
+
+ auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+ icode = code_for_aarch64_high (unspec, ops[0].mode);
+ break;
+ }
+
+ case UNSPEC_FCVTN_FP8:
+ icode = code_for_aarch64 (builtin_data.unspec, ops[1].mode);
break;
+ case UNSPEC_FCVTN2_FP8:
+ {
+ auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+ auto mode = ops.last ().mode;
+ if (BYTES_BIG_ENDIAN)
+ icode = code_for_aarch64_high_be (unspec, mode);
+ else
+ icode = code_for_aarch64_high_le (unspec, mode);
+ break;
+ }
+
case UNSPEC_LUTI2:
case UNSPEC_LUTI4:
create_integer_operand (ops.safe_push ({}),
aarch64_def_or_undef (TARGET_SVE_BF16,
"__ARM_FEATURE_SVE_BF16", pfile);
+ aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
+
aarch64_def_or_undef (TARGET_LS64,
"__ARM_FEATURE_LS64", pfile);
aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
#undef ENTRY_BINARY
#define ENTRY_BINARY(N, T0, T1, T2, U, F) \
- ENTRY (N, binary, T0, T1, T2, U, F)
+ ENTRY (N, binary, T0, T1, T2, none, U, F)
#undef ENTRY_BINARY_LANE
#define ENTRY_BINARY_LANE(N, T0, T1, T2, U, F) \
- ENTRY (N, binary_lane, T0, T1, T2, U, F)
+ ENTRY (N, binary_lane, T0, T1, T2, none, U, F)
+
+#undef ENTRY_TERNARY
+#define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \
+ ENTRY (N, ternary, T0, T1, T2, T3, U, F)
+
+#undef ENTRY_UNARY
+#define ENTRY_UNARY(N, T0, T1, U, F) \
+ ENTRY (N, unary, T0, T1, none, none, U, F)
#undef ENTRY_BINARY_VHSDF
#define ENTRY_BINARY_VHSDF(NAME, UNSPEC, FLAGS) \
ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC, FLAGS) \
ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC, FLAGS)
+#undef ENTRY_BINARY_VHSDF_SIGNED
+#define ENTRY_BINARY_VHSDF_SIGNED(NAME, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##_f16, f16, f16, s16, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f16, f16q, f16q, s16q, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##_f32, f32, f32, s32, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f32, f32q, f32q, s32q, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f64, f64q, f64q, s64q, UNSPEC, FLAGS)
+
#undef ENTRY_TERNARY_VLUT8
#define ENTRY_TERNARY_VLUT8(T) \
ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, \
ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q, \
UNSPEC_LUTI4, QUIET)
+#undef ENTRY_UNARY_VQ_BHF
+#define ENTRY_UNARY_VQ_BHF(N, T1, UNSPEC, FLAGS) \
+ ENTRY_UNARY (N##_bf16_mf8_fpm, bf16q, T1, UNSPEC, FLAGS) \
+ ENTRY_UNARY (N##_f16_mf8_fpm, f16q, T1, UNSPEC, FLAGS)
+
// faminmax
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX, FP)
ENTRY_TERNARY_VLUT16 (s)
ENTRY_TERNARY_VLUT16 (u)
#undef REQUIRED_EXTENSIONS
+
+// fpm conversion
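+// (FCVTN, F1CVTL and F2CVTL, with a "b" prefix for the bfloat16 forms)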
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1, f8, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_high, f8q, UNSPEC_F1CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_low, f8q, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2, f8, UNSPEC_F2CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_high, f8q, UNSPEC_F2CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_low, f8q, UNSPEC_F2CVTL_FP8, FP8)
+
+ENTRY_BINARY (vcvt_mf8_f16_fpm, f8, f16, f16, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvtq_mf8_f16_fpm, f8q, f16q, f16q, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvt_mf8_f32_fpm, f8, f32q, f32q, UNSPEC_FCVTN_FP8, FP8)
+
+ENTRY_TERNARY (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q,
+ UNSPEC_FCVTN2_FP8, FP8)
+#undef REQUIRED_EXTENSIONS
+
+// fpm scaling
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_BINARY_VHSDF_SIGNED (vscale, UNSPEC_FSCALE, FP)
+#undef REQUIRED_EXTENSIONS
"TARGET_LUT && INTVAL (operands[4]) == 4"
"luti%4\t%0.8h, {%S1.8h, %T1.8h}, %2[%3]"
)
+
+;; fpm unary instructions (low part).
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+ (unspec:VQ_BHF
+ [(match_operand:V8QI 1 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_UNARY_UNS))]
+ "TARGET_FP8"
+ "<b><insn>\t%0.<Vtype>, %1.8b"
+)
+
+;; fpm unary instructions (high part).
+(define_insn "@aarch64_<insn><mode>_high"
+ [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+ (unspec:VQ_BHF
+ [(vec_select:V8QI
+ (match_operand:V16QI 1 "register_operand" "w")
+ (match_operand:V16QI 2 "vect_par_cnst_hi_half"))
+ (reg:DI FPM_REGNUM)]
+ FPM_UNARY_UNS))]
+ "TARGET_FP8"
+ "<b><insn>2\t%0.<Vtype>, %1.16b"
+)
+
+;; fpm binary instructions.
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:<VPACKB> 0 "register_operand" "=w")
+ (unspec:<VPACKB>
+ [(match_operand:VCVTFPM 1 "register_operand" "w")
+ (match_operand:VCVTFPM 2 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS))]
+ "TARGET_FP8"
+ "<insn>\t%0.<VPACKBtype>, %1.<Vtype>, %2.<Vtype>"
+)
+
+;; fpm binary instructions & merge with low.
+(define_insn "@aarch64_<insn><mode>_high_le"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "0")
+ (unspec:V8QI
+ [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+ (match_operand:V4SF_ONLY 3 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS)))]
+ "TARGET_FP8 && !BYTES_BIG_ENDIAN"
+ "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+(define_insn "@aarch64_<insn><mode>_high_be"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (vec_concat:V16QI
+ (unspec:V8QI
+ [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+ (match_operand:V4SF_ONLY 3 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS)
+ (match_operand:V8QI 1 "register_operand" "0")))]
+ "TARGET_FP8 && BYTES_BIG_ENDIAN"
+ "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+;; fscale instructions
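+;; Operand 2 holds the integer exponent adjustments; <FCVT_TARGET> gives the
+;; integer vector mode with the same element width as the FP mode.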
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
+ (match_operand:<FCVT_TARGET> 2 "register_operand" "w")]
+ FSCALE_UNS))]
+ "TARGET_FP8"
+ "<insn>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+)
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator SI_ONLY [SI])
(define_mode_iterator DI_ONLY [DI])
+(define_mode_iterator V4SF_ONLY [V4SF])
;; Iterator for all integer modes (up to 64-bit)
(define_mode_iterator ALLI [QI HI SI DI])
;; Advanced SIMD single Float modes.
(define_mode_iterator VDQSF [V2SF V4SF])
+;; Quad vector float modes with half/bfloat elements.
+(define_mode_iterator VQ_BHF [V8HF V8BF])
+
;; Quad vector Float modes with half/single elements.
(define_mode_iterator VQ_HSF [V8HF V4SF])
(define_mode_iterator VLUT [V8QI V16QI V4HI V4HF V4BF])
(define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
+;; Modes available for Advanced SIMD FP8 conversion operations.
+(define_mode_iterator VCVTFPM [V4HF V8HF V4SF])
+
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator VNx16QI_ONLY [VNx16QI])
(define_mode_iterator VNx16SI_ONLY [VNx16SI])
UNSPEC_ASHIFT_SIGNED ; Used in aarch64-simd.md.
UNSPEC_ASHIFT_UNSIGNED ; Used in aarch64-simd.md.
UNSPEC_ABS ; Used in aarch64-simd.md.
+ UNSPEC_FCVTN_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_FCVTN2_FP8 ; Used in aarch64-builtins.cc.
+ UNSPEC_F1CVTL_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_F1CVTL2_FP8 ; Used in aarch64-builtins.cc.
+ UNSPEC_F2CVTL_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_F2CVTL2_FP8 ; Used in aarch64-builtins.cc.
UNSPEC_FMAX ; Used in aarch64-simd.md.
UNSPEC_FMAXNMV ; Used in aarch64-simd.md.
UNSPEC_FMAXV ; Used in aarch64-simd.md.
UNSPEC_FMINV ; Used in aarch64-simd.md.
UNSPEC_FADDV ; Used in aarch64-simd.md.
UNSPEC_FNEG ; Used in aarch64-simd.md.
+ UNSPEC_FSCALE ; Used in aarch64-simd.md.
UNSPEC_ADDV ; Used in aarch64-simd.md.
UNSPEC_SMAXV ; Used in aarch64-simd.md.
UNSPEC_SMINV ; Used in aarch64-simd.md.
(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
(V2DI "4s")])
+;; The result of FCVTN on two vectors of the given mode. The result has
+;; twice as many QI elements as the input.
+(define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")])
+(define_mode_attr VPACKBtype [(V4HF "8b") (V8HF "16b") (V4SF "8b")])
+
;; Widened modes of vector modes.
(define_mode_attr VWIDE [(V8QI "V8HI") (V4HI "V4SI")
(V2SI "V2DI") (V16QI "V8HI")
(V8HI "vec") (V2SI "vec") (V4SI "vec")
(V2DI "vec") (DI "offset")])
-(define_mode_attr b [(VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
+(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
+ (VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
(VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "")
(VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")])
UNSPEC_FMLALLTB_FP8
UNSPEC_FMLALLTT_FP8])
+;; Iterators for fpm instructions
+
+(define_int_iterator FPM_UNARY_UNS [UNSPEC_F1CVTL_FP8 UNSPEC_F2CVTL_FP8])
+
+(define_int_iterator FPM_BINARY_UNS [UNSPEC_FCVTN_FP8])
+
+(define_int_iterator FSCALE_UNS [UNSPEC_FSCALE])
+
;; -------------------------------------------------------------------
;; Int Iterators Attributes.
;; -------------------------------------------------------------------
+;; The AArch64 insn mnemonic associated with an unspec.
+(define_int_attr insn
+ [(UNSPEC_F1CVTL_FP8 "f1cvtl")
+ (UNSPEC_F2CVTL_FP8 "f2cvtl")
+ (UNSPEC_FCVTN_FP8 "fcvtn")
+ (UNSPEC_FSCALE "fscale")])
+
;; The optab associated with an operation. Note that for ANDF, IORF
;; and XORF, the optab pattern is not actually defined; we just use this
;; name for consistency with the integer patterns.
#include <arm_acle.h>
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
#pragma GCC push_options
#pragma GCC target("arch=armv9.4-a+fp8")
-/* We do not define __ARM_FEATURE_FP8 until all
- relevant features have been added. */
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
/*
**test_write_fpmr_sysreg_asm_64:
** msr fpmr, x0
#ifdef __ARM_FEATURE_GCS
#error Foo
#endif
+
+#pragma GCC target "arch=armv9-a"
+#ifdef __ARM_FEATURE_FP8
+#error Foo
+#endif
+
+#pragma GCC target "arch=armv9-a+fp8"
+#ifndef __ARM_FEATURE_FP8
+#error Foo
+#endif
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vscale_f16:
+** fscale v0.4h, v0.4h, v1.4h
+** ret
+*/
+float16x4_t
+test_vscale_f16 (float16x4_t a, int16x4_t b)
+{
+ return vscale_f16 (a, b);
+}
+
+/*
+** test_vscaleq_f16:
+** fscale v0.8h, v0.8h, v1.8h
+** ret
+*/
+float16x8_t
+test_vscaleq_f16 (float16x8_t a, int16x8_t b)
+{
+ return vscaleq_f16 (a, b);
+}
+
+/*
+** test_vscale_f32:
+** fscale v0.2s, v0.2s, v1.2s
+** ret
+*/
+float32x2_t
+test_vscale_f32 (float32x2_t a, int32x2_t b)
+{
+ return vscale_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f32:
+** fscale v0.4s, v0.4s, v1.4s
+** ret
+*/
+float32x4_t
+test_vscaleq_f32 (float32x4_t a, int32x4_t b)
+{
+ return vscaleq_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f64:
+** fscale v0.2d, v0.2d, v1.2d
+** ret
+*/
+float64x2_t
+test_vscaleq_f64 (float64x2_t a, int64x2_t b)
+{
+ return vscaleq_f64 (a, b);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_vcvt1_bf16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt1_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl2 v0.8h, v0.16b
+** ret
+*/
+bfloat16x8_t
+test_high_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_low_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_vcvt1_f16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt1_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl2 v0.8h, v0.16b
+** ret
+*/
+float16x8_t
+test_high_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_low_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_vcvt2_bf16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt2_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl2 v0.8h, v0.16b
+** ret
+*/
+bfloat16x8_t
+test_high_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_low_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_vcvt2_f16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt2_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl2 v0.8h, v0.16b
+** ret
+*/
+float16x8_t
+test_high_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_low_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt_f16:
+** msr fpmr, x0
+** fcvtn v0.8b, v0.4h, v1.4h
+** ret
+*/
+mfloat8x8_t
+test_vcvt_f16 (float16x4_t a, float16x4_t b, fpm_t c)
+{
+ return vcvt_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvtq_f16:
+** msr fpmr, x0
+** fcvtn v0.16b, v0.8h, v1.8h
+** ret
+*/
+mfloat8x16_t
+test_vcvtq_f16 (float16x8_t a, float16x8_t b, fpm_t c)
+{
+ return vcvtq_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_f32:
+** msr fpmr, x0
+** fcvtn v0.8b, v0.4s, v1.4s
+** ret
+*/
+mfloat8x8_t
+test_vcvt_f32 (float32x4_t a, float32x4_t b, fpm_t c)
+{
+ return vcvt_mf8_f32_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_high_f32:
+** msr fpmr, x0
+** fcvtn2 v0.16b, v1.4s, v2.4s
+** ret
+*/
+mfloat8x16_t
+test_vcvt_high_f32 (mfloat8x8_t a, float32x4_t b, float32x4_t c, fpm_t d)
+{
+ return vcvt_high_mf8_f32_fpm(a, b, c, d);
+}